Commit 6c606481 authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

added rfam ss

parent 51e70e5e
......@@ -210,7 +210,7 @@ def define_acceptable_columns(counts):
acceptable_scores = []
acceptable_cols = []
noise = []
threshold = 3+0.05 * get_tot_counts(counts)
threshold = 3+0.0 * get_tot_counts(counts)
for i in counts:
if counts[i]>threshold and i!=-1000:
acceptable_cols.append(i)
......@@ -292,7 +292,11 @@ def find_good_columns(module_columns_by_family, family,module,graphs):
def check_columns_ss(good_cols,family,module,graphs):
final_cols = [[] for x in good_cols]
for i in range(len(good_cols[0])):
min_n_cols = 1000
for i in good_cols:
if len(i)<min_n_cols:
min_n_cols=len(i)
for i in range(min_n_cols):
this_cols = [x[i] for x in good_cols]
print("GETTING SS FOR COLUMNS:", this_cols)
it_fits = verify_columns_ss(this_cols, family, module,graphs)
......@@ -404,7 +408,7 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
if family_match=="None":
continue
#print(matches[i][family_match])
if matches[CURRENT_MODULE][family_match][0]>4 and family_match==CURRENTLY_STUDIED_FAMILY:
if matches[CURRENT_MODULE][family_match][0]>3 and family_match==CURRENTLY_STUDIED_FAMILY:
for k in set(matches[CURRENT_MODULE][family_match][1]):
if k[0] not in bugged:
if newfam==True:
......@@ -579,10 +583,10 @@ if __name__ == "__main__":
rfam_by_PDB[PDB_id].append(i)
aligned_modulegraphs = pickle.load(open("full3_rna3dmotif_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("full3_rna3dmotif_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("full3_rna3dmotif_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("full3_rna3dmotif_one_of_each_graph.cPickle",'rb'))
aligned_modulegraphs = pickle.load(open("all_full3_rna3dmotif_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("all_full3_rna3dmotif_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("all_full3_rna3dmotif_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("all_full3_rna3dmotif_one_of_each_graph.cPickle",'rb'))
rfam_extra_sequences = []
format_aligned_modulegraphs = []
format_PDB_positions = []
......@@ -616,7 +620,7 @@ if __name__ == "__main__":
f.write("")
time.sleep(10)
if matches[module][family_match][0]>4:
if matches[module][family_match][0]>3:
#if family_match==2541:
#print("ZIPCODE",module_addresses)
module_seqs = get_module_seqs_for_family(family_match,matches,module_addresses,module,graphs)
......@@ -641,7 +645,7 @@ if __name__ == "__main__":
print("CORRECTED NUMBER OF EXTRA SEQUENCES : ",len(new_seq))
final_sequences.append(new_seq)
#pickle.dump(rfam_extra_sequences,open("test3_rfam.cPickle",'wb'))
pickle.dump(final_sequences,open("full3_rna3dmotif_rfam.cPickle",'wb'))
pickle.dump(final_sequences,open("all2_full3_rna3dmotif_rfam.cPickle",'wb'))
#pickle.dump(format_aligned_modulegraphs,open("3dmotif_aligned_modulegraphs.cPickle",'wb'))
#$pickle.dump(format_graphs,open("3dmotif_one_of_each_graph.cPickle",'wb'))
#pickle.dump(format_PDB_names,open("3dmotif_PDB_names.cPickle",'wb'))
......
......@@ -103,7 +103,7 @@ for i in os.listdir('.'):
kk = kk+1
if found==False:
graphs.append([g])
IDs.append([i[:-5]])
IDs.append([[i[:-5]]])
print("FINISHED")
print(len(graphs))
sig = 0
......
......@@ -350,6 +350,7 @@ boring=["CWW","B53","S55","S33"]
PDBs = []
motif_pos = []
for j in motif:
print(j)
short_PDBs = []
not_boring=False
#print(i['names'])
......@@ -396,7 +397,10 @@ for j in motif:
g_list = i[0]
#main_graph = i['graph']
main_graph = i[0][1]
print(i[1])
this_pdb = [kk for kk in [zzz[0] for zzz in i[1]]]
print(this_pdb)
exit()
pdb_names = set([pdb[0:4] for pdb in this_pdb])
if len(pdb_names)<20:
continue
......@@ -433,10 +437,10 @@ for j in motif:
print("TERRIBLE")
print(main_graph.nodes)
continue
pickle.dump(full_aln,open("all_full3_rna3dmotif_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(graphs,open("all_full3_rna3dmotif_one_of_each_graph.cPickle",'wb'))
pickle.dump(PDBs,open("all_full3_rna3dmotif_PDB_names.cPickle",'wb'))
pickle.dump(motif_pos,open("all_full3_rna3dmotif_PDB_positions.cPickle",'wb'))
pickle.dump(full_aln,open("final_rna3dmotif_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(graphs,open("final_rna3dmotif_one_of_each_graph.cPickle",'wb'))
pickle.dump(PDBs,open("final_rna3dmotif_PDB_names.cPickle",'wb'))
pickle.dump(motif_pos,open("final_rna3dmotif_PDB_positions.cPickle",'wb'))
print("NUMBER OF MOTIFS")
......
......@@ -2,8 +2,8 @@ import pickle
import networkx as nx
from matplotlib import pyplot as plt
g = pickle.load(open("all_graphs_pickled/" + "1KH6" + ".nxpickled", "rb"))
PDBid = "1KH6.A"
#g = pickle.load(open("all_graphs_pickled/" + "1FFK" + ".nxpickled", "rb"))
PDBid = "1EBR.A"
#print("PDB")
#print(PDBid)
PDB, chain = PDBid.split(".")
......
import pickle
import networkx as nx
from collections import OrderedDict as od
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio import SeqIO
def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file afterwards."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): unpickling runs arbitrary code from the file; only point these
# paths at files you trust.
aligned_modulegraphs = _load_pickle("../models/all_21+_rna3dmotif_106_aligned_modulegraphs.cPickle")
PDB_positions = _load_pickle("../models/all_21+_rna3dmotif_106__PDB_positions.cPickle")
PDBs = _load_pickle("../models/all_21+_rna3dmotif_106__PDB_names.cPickle")
graphs = _load_pickle("../models/all_21+_rna3dmotif_106_one_of_each_graph.cPickle")
# Index of the module examined by the rest of the script.
module = 1
aln = aligned_modulegraphs[module]
g = graphs[module]
def get_ss_from_graph(graph):
    """Return the cWW-labelled edges of *graph* as pairs of node positions.

    An edge is kept when its ``"label"`` attribute is ``"cWW"`` or ``"CWW"``
    and its first endpoint compares lower than its second.

    Args:
        graph: object exposing ``nodes()`` and ``edges(data=True)`` the way a
            networkx graph does; every edge must carry a ``"label"`` attribute.

    Returns:
        List of ``(a, b)`` tuples, where ``a`` and ``b`` are the positions of
        the edge's endpoints in the iteration order of ``graph.nodes()``.
    """
    # Build the node -> position table once instead of re-listing the nodes
    # (twice) for every matching edge.
    position = {node: idx for idx, node in enumerate(graph.nodes())}
    ss = []
    for left, right, attrs in graph.edges(data=True):
        if attrs["label"] in ("cWW", "CWW") and left < right:
            ss.append((position[left], position[right]))
    return ss
def get_addresses(aln):
    """Regroup an alignment mapping by position inside its values.

    Args:
        aln: mapping from a key to an indexable sequence of addresses. The
            number of positions is taken from the first value; every value
            must be at least that long.

    Returns:
        ``OrderedDict`` mapping each index ``i`` to the list
        ``[aln[key][i] for key in aln]`` (keys in the mapping's own iteration
        order). An empty ``aln`` gives an empty ``OrderedDict``.
    """
    addresses = od()
    # next(iter(...), ()) reads the first value without raising on an empty mapping.
    first_row = next(iter(aln.values()), ())
    for i in range(len(first_row)):
        addresses[i] = [row[i] for row in aln.values()]
    return addresses
# Index-wise regrouping of `aln`; passed to get_seqs further down.
addresses = get_addresses(aln)
def get_seqs(g, aln, addresses):
    """Build one gapped sequence string per index of the alignment.

    For index ``i`` every integer from ``min(addresses[i])`` to
    ``max(addresses[i])`` inclusive is visited: when it is a node of ``g[i]``
    that node's ``'nuc'`` attribute is appended, otherwise ``'-'``.

    Args:
        g: indexable collection of graphs exposing ``nodes(data=True)`` the
            way a networkx graph does.
        aln: alignment mapping; only the length of its first value is used,
            to decide how many indices to process.
        addresses: mapping from index to a non-empty collection of integer
            addresses.

    Returns:
        Dict mapping each index to its sequence string. An empty ``aln``
        gives an empty dict.
    """
    seqs = {}
    n_entries = len(next(iter(aln.values()), ()))
    for i in range(n_entries):
        span = range(min(addresses[i]), max(addresses[i]) + 1)
        # One node -> attributes table per graph, instead of re-listing the
        # graph's nodes for every address in the span.
        attrs_by_node = dict(g[i].nodes(data=True))
        seqs[i] = "".join(
            attrs_by_node[j]['nuc'] if j in attrs_by_node else '-' for j in span
        )
    return seqs
# Print the address table, then build and print the per-index sequences.
print("ADDRESSES",addresses)
print("SEQS")
seqs = get_seqs(g,aln,addresses)
print(seqs)
def get_consensus_sequence(seqs):
    """Return the most frequent character at each position of *seqs*.

    Args:
        seqs: mapping from key to equal-length strings; the number of
            positions is read from ``seqs[0]``.

    Returns:
        List with one character per position. On a tie the character seen
        first (in the mapping's iteration order) wins, so the result is the
        same on every run. An empty ``seqs`` gives an empty list.
    """
    from collections import Counter  # local import keeps this block self-contained

    if not seqs:
        return []
    consensus = []
    for position in range(len(seqs[0])):
        column = Counter(seqs[key][position] for key in seqs)
        # most_common() keeps first-seen order among equal counts; max() over
        # a set depends on string hashing and can differ between runs.
        consensus.append(column.most_common(1)[0][0])
    return consensus
# Append each entry of `seqs` to seq.fasta as a two-line FASTA record:
# a ">seq<key>" header followed by the sequence itself.
# NOTE(review): mode "a" means a second run adds the same records again - confirm this is intended.
with open("seq.fasta", "a") as f:
    for i in seqs:
        f.writelines((">seq", str(i), "\n", str(seqs[i]), "\n"))
# Disabled experiments, kept for reference: running ClustalW on seq.fasta,
# exporting the sequences as a Stockholm file, and computing a consensus.
#a =ClustalwCommandline("../models/clustalw2", infile="seq.fasta", outfile="aligned.clustal")
#print(a)
#a()
#alignment = SeqIO.parse("seq.fasta","fasta")
#SeqIO.write(alignment,open("module_"+str(module)+".stk","w"),"stockholm")
#cons = get_consensus_sequence(seqs)
# Debug output for the selected module: node data of its graph at index 2,
# the cWW pairs of its graph at index 0, and the PDB name / positions at index 2.
print(graphs[module][2].nodes(data=True))
ss= get_ss_from_graph(graphs[module][0])
print(ss)
print(PDBs[module][2])
print(PDB_positions[module][2])
# STOCKHOLM 1.0
#=GF SQ 292
seq0 CUUCGG
#=GS seq0 AC seq0
#=GS seq0 DE seq0
seq1 CUUCGG
#=GS seq1 AC seq1
#=GS seq1 DE seq1
seq2 CUUCGG
#=GS seq2 AC seq2
#=GS seq2 DE seq2
seq3 CUUCGG
#=GS seq3 AC seq3
#=GS seq3 DE seq3
seq4 CUUCGG
#=GS seq4 AC seq4
#=GS seq4 DE seq4
seq5 CUACGG
#=GS seq5 AC seq5
#=GS seq5 DE seq5
seq6 CUACGG
#=GS seq6 AC seq6
#=GS seq6 DE seq6
seq7 CUUCGG
#=GS seq7 AC seq7
#=GS seq7 DE seq7
seq8 CUUCGG
#=GS seq8 AC seq8
#=GS seq8 DE seq8
seq9 CUUCGG
#=GS seq9 AC seq9
#=GS seq9 DE seq9
seq10 CUACGG
#=GS seq10 AC seq10
#=GS seq10 DE seq10
seq11 CUACGG
#=GS seq11 AC seq11
#=GS seq11 DE seq11
seq12 CUUCGG
#=GS seq12 AC seq12
#=GS seq12 DE seq12
seq13 CUACGG
#=GS seq13 AC seq13
#=GS seq13 DE seq13
seq14 CUACGG
#=GS seq14 AC seq14
#=GS seq14 DE seq14
seq15 CUUCGG
#=GS seq15 AC seq15
#=GS seq15 DE seq15
seq16 CUACGG
#=GS seq16 AC seq16
#=GS seq16 DE seq16
seq17 CUACGG
#=GS seq17 AC seq17
#=GS seq17 DE seq17
seq18 CUUCGG
#=GS seq18 AC seq18
#=GS seq18 DE seq18
seq19 CUACGG
#=GS seq19 AC seq19
#=GS seq19 DE seq19
seq20 CUACGG
#=GS seq20 AC seq20
#=GS seq20 DE seq20
seq21 CUUCGG
#=GS seq21 AC seq21
#=GS seq21 DE seq21
seq22 CUACGG
#=GS seq22 AC seq22
#=GS seq22 DE seq22
seq23 CUACGG
#=GS seq23 AC seq23
#=GS seq23 DE seq23
seq24 CUUCGG
#=GS seq24 AC seq24
#=GS seq24 DE seq24
seq25 CUACGG
#=GS seq25 AC seq25
#=GS seq25 DE seq25
seq26 CUACGG
#=GS seq26 AC seq26
#=GS seq26 DE seq26
seq27 CUUCGG
#=GS seq27 AC seq27
#=GS seq27 DE seq27
seq28 CUACGG
#=GS seq28 AC seq28
#=GS seq28 DE seq28
seq29 CUACGG
#=GS seq29 AC seq29
#=GS seq29 DE seq29
seq30 CUUCGG
#=GS seq30 AC seq30
#=GS seq30 DE seq30
seq31 CUACGG
#=GS seq31 AC seq31
#=GS seq31 DE seq31
seq32 CUACGG
#=GS seq32 AC seq32
#=GS seq32 DE seq32
seq33 CUUCGG
#=GS seq33 AC seq33
#=GS seq33 DE seq33
seq34 CUACGG
#=GS seq34 AC seq34
#=GS seq34 DE seq34
seq35 CUACGG
#=GS seq35 AC seq35
#=GS seq35 DE seq35
seq36 CUUCGG
#=GS seq36 AC seq36
#=GS seq36 DE seq36
seq37 CUACGG
#=GS seq37 AC seq37
#=GS seq37 DE seq37
seq38 CUACGG
#=GS seq38 AC seq38
#=GS seq38 DE seq38
seq39 CUUCGG
#=GS seq39 AC seq39
#=GS seq39 DE seq39
seq40 CUUCGG
#=GS seq40 AC seq40
#=GS seq40 DE seq40
seq41 CUUCGG
#=GS seq41 AC seq41
#=GS seq41 DE seq41
seq42 CUACGG
#=GS seq42 AC seq42
#=GS seq42 DE seq42
seq43 CUACGG
#=GS seq43 AC seq43
#=GS seq43 DE seq43
seq44 CUUCGG
#=GS seq44 AC seq44
#=GS seq44 DE seq44
seq45 CUUCGG
#=GS seq45 AC seq45
#=GS seq45 DE seq45
seq46 CUUCGG
#=GS seq46 AC seq46
#=GS seq46 DE seq46
seq47 CUUCGG
#=GS seq47 AC seq47
#=GS seq47 DE seq47
seq48 CUUCGG
#=GS seq48 AC seq48
#=GS seq48 DE seq48
seq49 CUUCGG
#=GS seq49 AC seq49
#=GS seq49 DE seq49
seq50 CUUCGG
#=GS seq50 AC seq50
#=GS seq50 DE seq50
seq51 CUUCGG
#=GS seq51 AC seq51
#=GS seq51 DE seq51
seq52 CUUCGG
#=GS seq52 AC seq52
#=GS seq52 DE seq52
seq53 CUUCGG
#=GS seq53 AC seq53
#=GS seq53 DE seq53
seq54 CUUCGG
#=GS seq54 AC seq54
#=GS seq54 DE seq54
seq55 CUUCGG
#=GS seq55 AC seq55
#=GS seq55 DE seq55
seq56 CUUCGG
#=GS seq56 AC seq56
#=GS seq56 DE seq56
seq57 CUUCGG
#=GS seq57 AC seq57
#=GS seq57 DE seq57
seq58 CUUCGG
#=GS seq58 AC seq58
#=GS seq58 DE seq58
seq59 CUUCGG
#=GS seq59 AC seq59
#=GS seq59 DE seq59
seq60 CUUCGG
#=GS seq60 AC seq60
#=GS seq60 DE seq60
seq61 CUUCGG
#=GS seq61 AC seq61
#=GS seq61 DE seq61
seq62 CUACGG
#=GS seq62 AC seq62
#=GS seq62 DE seq62
seq63 CUACGG
#=GS seq63 AC seq63
#=GS seq63 DE seq63
seq64 CUACGG
#=GS seq64 AC seq64
#=GS seq64 DE seq64
seq65 CUUCGG
#=GS seq65 AC seq65
#=GS seq65 DE seq65
seq66 CUACGG
#=GS seq66 AC seq66
#=GS seq66 DE seq66
seq67 CUACGG
#=GS seq67 AC seq67
#=GS seq67 DE seq67
seq68 CUUCGG
#=GS seq68 AC seq68
#=GS seq68 DE seq68
seq69 CUACGG
#=GS seq69 AC seq69
#=GS seq69 DE seq69
seq70 CUUCGG
#=GS seq70 AC seq70
#=GS seq70 DE seq70
seq71 CUACGG
#=GS seq71 AC seq71
#=GS seq71 DE seq71
seq72 CUUCGG
#=GS seq72 AC seq72
#=GS seq72 DE seq72
seq73 CUUCGG
#=GS seq73 AC seq73
#=GS seq73 DE seq73
seq74 CUUCGG
#=GS seq74 AC seq74
#=GS seq74 DE seq74
seq75 CUACGG
#=GS seq75 AC seq75
#=GS seq75 DE seq75
seq76 CUACGG
#=GS seq76 AC seq76
#=GS seq76 DE seq76
seq77 CUUCGG
#=GS seq77 AC seq77
#=GS seq77 DE seq77
seq78 CUACGG
#=GS seq78 AC seq78
#=GS seq78 DE seq78
seq79 CUUCGG
#=GS seq79 AC seq79
#=GS seq79 DE seq79
seq80 CUUCGG
#=GS seq80 AC seq80
#=GS seq80 DE seq80
seq81 CUACGG
#=GS seq81 AC seq81
#=GS seq81 DE seq81
seq82 CUUCGG
#=GS seq82 AC seq82
#=GS seq82 DE seq82
seq83 CUACGG
#=GS seq83 AC seq83
#=GS seq83 DE seq83
seq84 CUUCGG
#=GS seq84 AC seq84
#=GS seq84 DE seq84
seq85 CUUCGG
#=GS seq85 AC seq85
#=GS seq85 DE seq85
seq86 CUUCGG
#=GS seq86 AC seq86
#=GS seq86 DE seq86
seq87 CUUCGG
#=GS seq87 AC seq87
#=GS seq87 DE seq87
seq88 CUUCGG
#=GS seq88 AC seq88
#=GS seq88 DE seq88
seq89 CUUCGG
#=GS seq89 AC seq89
#=GS seq89 DE seq89
seq90 CUUCGG
#=GS seq90 AC seq90
#=GS seq90 DE seq90
seq91 CUUCGG
#=GS seq91 AC seq91
#=GS seq91 DE seq91
seq92 UCACGG
#=GS seq92 AC seq92
#=GS seq92 DE seq92
seq93 CUACGG
#=GS seq93 AC seq93
#=GS seq93 DE seq93
seq94 CUACGG
#=GS seq94 AC seq94
#=GS seq94 DE seq94
seq95 CUUCGG
#=GS seq95 AC seq95
#=GS seq95 DE seq95
seq96 CUUCGG
#=GS seq96 AC seq96
#=GS seq96 DE seq96
seq97 CUUCGG
#=GS seq97 AC seq97
#=GS seq97 DE seq97
seq98 UUACGG
#=GS seq98 AC seq98
#=GS seq98 DE seq98
seq99 CUUCGG
#=GS seq99 AC seq99
#=GS seq99 DE seq99
seq100 CUUCGG
#=GS seq100 AC seq100
#=GS seq100 DE seq100
seq101 CUUCGG
#=GS seq101 AC seq101
#=GS seq101 DE seq101
seq102 CUUCGG
#=GS seq102 AC seq102
#=GS seq102 DE seq102
seq103 CUUCGG
#=GS seq103 AC seq103
#=GS seq103 DE seq103
seq104 CUUCGG
#=GS seq104 AC seq104
#=GS seq104 DE seq104
seq105 CUUCGG
#=GS seq105 AC seq105
#=GS seq105 DE seq105
seq106 CUUCGG
#=GS seq106 AC seq106
#=GS seq106 DE seq106
seq107 CUUCGG
#=GS seq107 AC seq107
#=GS seq107 DE seq107
seq108 CUUCGG
#=GS seq108 AC seq108
#=GS seq108 DE seq108
seq109 CUUCGG
#=GS seq109 AC seq109
#=GS seq109 DE seq109
seq110 CUUCGG
#=GS seq110 AC seq110
#=GS seq110 DE seq110
seq111 CUUCGG
#=GS seq111 AC seq111
#=GS seq111 DE seq111
seq112 CUACGG
#=GS seq112 AC seq112
#=GS seq112 DE seq112
seq113 CUUCGG
#=GS seq113 AC seq113
#=GS seq113 DE seq113
seq114 CUUCGG
#=GS seq114 AC seq114
#=GS seq114 DE seq114
seq115 CUUCGG
#=GS seq115 AC seq115
#=GS seq115 DE seq115
seq116 CUACGG
#=GS seq116 AC seq116
#=GS seq116 DE seq116
seq117 CUUCGG
#=GS seq117 AC seq117
#=GS seq117 DE seq117
seq118 CUUCGG
#=GS seq118 AC seq118
#=GS seq118 DE seq118
seq119 CUACGG
#=GS seq119 AC seq119
#=GS seq119 DE seq119
seq120 CUACGG
#=GS seq120 AC seq120
#=GS seq120 DE seq120
seq121 CUACGG
#=GS seq121 AC seq121
#=GS seq121 DE seq121
seq122 CUACGG
#=GS seq122 AC seq122
#=GS seq122 DE seq122
seq123 CUACGG
#=GS seq123 AC seq123
#=GS seq123 DE seq123
seq124 CUACGG
#=GS seq124 AC seq124
#=GS seq124 DE seq124
seq125 CUACGG
#=GS seq125 AC seq125
#=GS seq125 DE seq125
seq126 CUUCGG
#=GS seq126 AC seq126
#=GS seq126 DE seq126
seq127 CUUCGG
#=GS seq127 AC seq127
#=GS seq127 DE seq127
seq128 CUUCGG
#=GS seq128 AC seq128
#=GS seq128 DE seq128
seq129 CUUCGG
#=GS seq129 AC seq129
#=GS seq129 DE seq129
seq130 CUUCGG
#=GS seq130 AC seq130
#=GS seq130 DE seq130
seq131 CUUCGG
#=GS seq131 AC seq131
#=GS seq131 DE seq131
seq132 CUACGG
#=GS seq132 AC seq132
#=GS seq132 DE seq132
seq133 CUUCGG
#=GS seq133 AC seq133
#=GS seq133 DE seq133
seq134 CUUCGG
#=GS seq134 AC seq134
#=GS seq134 DE seq134
seq135 CUUCGG
#=GS seq135 AC seq135
#=GS seq135 DE seq135
seq136 CUACGG
#=GS seq136 AC seq136
#=GS seq136 DE seq136
seq137 CUUCGG
#=GS seq137 AC seq137
#=GS seq137 DE seq137
seq138 CUUCGG
#=GS seq138 AC seq138
#=GS seq138 DE seq138
seq139 CUUCGG
#=GS seq139 AC seq139
#=GS seq139 DE seq139
seq140 CUUCGG
#=GS seq140 AC seq140
#=GS seq140 DE seq140
seq141 CUACGG
#=GS seq141 AC seq141
#=GS seq141 DE seq141
seq142 CUUCGG
#=GS seq142 AC seq142
#=GS seq142 DE seq142
seq143 CUACGG
#=GS seq143 AC seq143