Commit ba8ecbd6 authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

full3_rna3dmotif code and results

parent 7521a1d3
import pickle
import networkx as nx
from matplotlib import pyplot as plt
graphs = pickle.load(open("rna3dmotif_one_of_each_graph.cPickle", "rb"))
pdb_positions = pickle.load(open("rna3dmotif_PDB_positions.cPickle", "rb"))
g = graphs[3][1]
nx.draw(g)
plt.show()
print(pdb_positions[3])
"""
modules = pickle.load(open("all3dmotif_graphs.cPickle",'rb'))
sig_g=[]
good_mods = []
......@@ -20,3 +27,4 @@ for i in range(len(modules)):
print(n)
pickle.dump(sig_g,open("best3dmotif_graphs.cPickle",'wb'))
"""
\ No newline at end of file
This diff is collapsed.
......@@ -48,18 +48,8 @@ def get_PDB_sequence(PDBid):
#print(node)
#print(node[0][0],chain)
if node[0][0]==chain:
nodecode = (node[1]["fr3d"][1])
if not nodecode.isdigit():
print("FR3D ADDRESS IS NOT A NUMBER")
print(node)
fixed_nodecode = node[0][1]
#fixed_nodecode = ""
nodelist = list(nodecode)
#for i in nodelist:
# if str(i).isdigit():
# fixed_nodecode = fixed_nodecode+str(i)
nodecode = fixed_nodecode
nodes.append((int(nodecode),node[1]["real_nt"]))
nodecode = (node[0][1])
nodes.append((int(nodecode),node[1]["nt"]))
sortednodes = sorted(list(nodes))
nuc_by_node={}
missing_nuc = False
......@@ -220,7 +210,7 @@ def define_acceptable_columns(counts):
acceptable_scores = []
acceptable_cols = []
noise = []
threshold = 1.5+0.1 * get_tot_counts(counts)
threshold = 3+0.05 * get_tot_counts(counts)
for i in counts:
if counts[i]>threshold and i!=-1000:
acceptable_cols.append(i)
......@@ -248,6 +238,7 @@ def get_bps_from_ss(ss, columns):
starts.append(i)
if ss[i]==")" or ss[i]==">" or ss[i]=="}":
ends.append(i)
ends.reverse()
#struct = [(columns[starts[i]],columns[ends[i]]) for i in range(min(len(starts),len(ends)))]
if min(len(starts),len(ends))==0:
return ()
......@@ -259,7 +250,12 @@ def verify_columns_ss(columns,family,module, graphs):
rfam_bps = get_bps_from_ss(rfam_ss,columns)
graph_ss=get_ss_from_graph(graphs[module][0])
print("COMPARING :",rfam_bps,graph_ss)
if rfam_ss==graph_ss:
compatible = True
for bp in graph_ss:
if bp not in rfam_bps:
compatible=False
if compatible:
return True
else:
return False
......@@ -401,14 +397,14 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
profile = "RF"+format(CURRENTLY_STUDIED_FAMILY,"05d")+".clustal"
bugged = ["3BBX"]
rfam_module_seqs = []
print(matches)
#print(matches)
for family_match in matches[CURRENT_MODULE]:
#print("WRITTEN",written)
written = []
if family_match=="None":
continue
#print(matches[i][family_match])
if matches[CURRENT_MODULE][family_match][0]>10 and family_match==CURRENTLY_STUDIED_FAMILY:
if matches[CURRENT_MODULE][family_match][0]>4 and family_match==CURRENTLY_STUDIED_FAMILY:
for k in set(matches[CURRENT_MODULE][family_match][1]):
if k[0] not in bugged:
if newfam==True:
......@@ -538,9 +534,9 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
print("STRUCTURE SEQUENCE: ",sequence)
print("RFAM SEQUENCE : ", experiment_seq)
for record in aln:
#print('FINDING PURE RFAM SEQ:',record)
if "seq" not in record.id:
#print("COLUMNS",good_cols)
# print("COLUMNS",good_cols)
seq = list(str(record.seq))
for i in range(len(good_cols[0])):
experiment_seq = []
......@@ -583,10 +579,10 @@ if __name__ == "__main__":
rfam_by_PDB[PDB_id].append(i)
aligned_modulegraphs = pickle.load(open("test_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("test_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("test_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("test_one_of_each_graph.cPickle",'rb'))
aligned_modulegraphs = pickle.load(open("full3_rna3dmotif_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("full3_rna3dmotif_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("full3_rna3dmotif_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("full3_rna3dmotif_one_of_each_graph.cPickle",'rb'))
rfam_extra_sequences = []
format_aligned_modulegraphs = []
format_PDB_positions = []
......@@ -599,10 +595,14 @@ if __name__ == "__main__":
module_PDB_addresses = {}
#interesting = [2, 8, 9, 14, 20, 28, 36, 59, 113, 127, 133, 150, 162, 194, 195]
interesting = range(len(graphs))
#interesting = [0,1,2,3,4,5]
test_modules = interesting
a = get_ss_from_graph(graphs[2][0])
rfam_module_seqs = {}
matches,module_addresses=get_matches(test_modules,rfam,rfam_by_PDB, PDBs, graphs)
print(matches)
print(module_addresses)
print("MATCHES", matches)
for module in test_modules:
print("CURRENT MODULE : ",module)
rfam_module_seqs[module] = []
......@@ -615,8 +615,8 @@ if __name__ == "__main__":
with open("seq2.fasta","w") as f:
f.write("")
time.sleep(10)
#print("MATCHES",matches)
if matches[module][family_match][0]>10:
if matches[module][family_match][0]>4:
#if family_match==2541:
#print("ZIPCODE",module_addresses)
module_seqs = get_module_seqs_for_family(family_match,matches,module_addresses,module,graphs)
......@@ -635,13 +635,15 @@ if __name__ == "__main__":
print("OUTPUT")
final_sequences = []
for i in range(len(rfam_extra_sequences)):
print(i)
print('MODULE:',i)
print('TOTAL EXTRA SEQUENCES FOUND : ',len(rfam_extra_sequences[i]))
new_seq = rule_out_impossible_seqs(graphs[interesting[i]][0],rfam_extra_sequences[i])
print("CORRECTED NUMBER OF EXTRA SEQUENCES : ",len(new_seq))
final_sequences.append(new_seq)
#pickle.dump(rfam_extra_sequences,open("test3_rfam.cPickle",'wb'))
pickle.dump(final_sequences,open("test4_rfam.cPickle",'wb'))
pickle.dump(format_aligned_modulegraphs,open("test4_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(format_graphs,open("test4_one_of_each_graph.cPickle",'wb'))
pickle.dump(format_PDB_names,open("test4_PDB_names.cPickle",'wb'))
pickle.dump(format_PDB_positions,open("test4_PDB_positions.cPickle",'wb'))
pickle.dump(final_sequences,open("full3_rna3dmotif_rfam.cPickle",'wb'))
#pickle.dump(format_aligned_modulegraphs,open("3dmotif_aligned_modulegraphs.cPickle",'wb'))
#$pickle.dump(format_graphs,open("3dmotif_one_of_each_graph.cPickle",'wb'))
#pickle.dump(format_PDB_names,open("3dmotif_PDB_names.cPickle",'wb'))
#pickle.dump(format_PDB_positions,open("3dmotif_PDB_positions.cPickle",'wb'))
import pickle
from matplotlib import pyplot as plt
lens = []
seqs = pickle.load(open("../models/full3_rna3dmotif_rfam.cPickle","rb"))
graphs = pickle.load(open("../models/full3_rna3dmotif_one_of_each_graph.cPickle","rb"))
print(len(seqs))
for i in seqs:
print(len(i))
lens.append(len(i))
plt.bar(range(len(lens)),lens)
plt.title("Number of extra Rfam sequences found by module")
plt.xlabel("module ID")
plt.ylabel("number of extra Rfam sequences")
plt.show()
......@@ -6,7 +6,7 @@ import networkx.algorithms.isomorphism as iso
import operator
import pickle
def compare_graphs(g1,g2):
em = iso.generic_edge_match('label',["cWW","tWW","cWS","tWS","cWH","tWH","cHH","tHH","tHW","cHW","cHS","tHS","cSW","tSW","cSH","tSH","cSS","tSS","c++","t++","c--","t--","b53"],operator.eq)
em = iso.generic_edge_match('label',["CWW","TWW","CWS","TWS","CWH","TWH","CHH","THH","THW","CHW","CHS","THS","CSW","TSW","CSH","TSH","CSS","TSS","C++","T++","C--","T--","B53"],operator.eq)
isof = nx.is_isomorphic(g1,g2,edge_match=em)
return isof
......@@ -29,6 +29,7 @@ def make_align_graph(g1):
nx.draw_networkx_edge_labels(g0,pos,elabels)
plt.show()
os.chdir('../all_catalogue')
os.chdir('DESC')
#print(os.listdir('.'))
tot = len(os.listdir('.'))
......@@ -69,12 +70,12 @@ for i in os.listdir('.'):
#print(p1,p2)
#print(p1_,p2_)
interaction = bp[1][1:4]
orientation = bp[1][5]
orientation = bp[1][5].upper()
#print(p1,p2,interaction,orientation)
if orientation=='s':
continue
elif interaction=="C/C":
g.add_edge(int(p1),int(p2),long_range=False ,label='b53')
g.add_edge(int(p1),int(p2),long_range=False ,label='B53')
elif interaction=="+/+" or interaction=="-/-":
bond = orientation + "WW"
g.add_edge(int(p1), int(p2), long_range= False, label=bond)
......@@ -117,7 +118,10 @@ for k in range(len(graphs)):
print(len(sig_g))
os.chdir("..")
pickle.dump(sig_g,open("all3dmotif_graphs.cPickle",'wb'))
pickle.dump(sig_g,open("min0_3dmotif_graphs.cPickle",'wb'))
os.chdir("..")
os.chdir('models')
pickle.dump(sig_g,open("min0_3dmotif_graphs.cPickle",'wb'))
"""
print(compare_graphs(g,g))
compare = compare_graphs(g,bob)
......
......@@ -315,7 +315,7 @@ def align_multiple_graphs(g_list):
matches[j[0]].append(j[1])
return matches
motif = pickle.load(open("best3dmotif_graphs.cPickle",'rb'))
motif = pickle.load(open("../all_catalogue/min5_3dmotif_graphs.cPickle",'rb'))
#os.chdir('..')
#os.chdir('Src')
full_aln = []
......@@ -329,7 +329,8 @@ motif_pos = []
for j in motif:
#print(i['names'])
i = j
if(len(i['l_graphs'])>3):
if(len(i[0])>10):
j = i['l_graphs'][0]
......
__author__ = 'Roman'
import pickle
import networkx as nx
import matplotlib
from matplotlib import pyplot as plt
import os
from networkx.algorithms.isomorphism import DiGraphMatcher as DGM
from collections import OrderedDict as od
def make_align_graph(g1,g2,corr):
corr_edges = []
g0 = nx.DiGraph()
g0.add_nodes_from(g1.nodes())
g0.add_edges_from(g1.edges())
g0.add_nodes_from(g2.nodes())
g0.add_edges_from(g2.edges())
for i in corr:
corr_edges.append((i[0],i[1]))
print(corr)
g0.add_edges_from(corr_edges)
pos = nx.spring_layout(g0)
nx.draw_networkx_nodes(g0,pos,nodelist=g1.nodes(),node_color='red',node_size=500)
nx.draw_networkx_nodes(g0,pos,nodelist=g2.nodes(),node_color='green',node_size=500)
nx.draw_networkx_edges(g0,pos,edgelist=g1.edges(),edge_color='red',width=2)
nx.draw_networkx_edges(g0,pos,edgelist=g2.edges(),edge_color='green',width=2)
nx.draw_networkx_edges(g0,pos,edgelist=corr_edges,edge_color='blue',width=0.5)
labels={}
elabels = {}
for i in g0.nodes():
labels[i] = i
nx.draw_networkx_labels(g0,pos,labels)
for i in g1.edges():
elabels[i]=g1.get_edge_data(*i)['label']
for i in g2.edges():
elabels[i]=g2.get_edge_data(*i)['label']
nx.draw_networkx_edge_labels(g0,pos,elabels)
plt.show()
return g0
def get_edges_at_node(g1,node):
edgez = []
for i in g1.edges():
if i[0] == node or i[1] == node:
edgez.append(i)
return edgez
def get_edge_between_two_nodes(g1,node1,node2):
edge = [(-1,-1),(-1,-1)]
for i in g1.edges():
if i==(node1,node2) or i==(node2,node1):
if int(node1)<int(node2):
edge[0] = i
else:
edge[1] = i
return edge
def edge_match(g1,g2,edge1,edge2):
edge_data00 = g1.get_edge_data(*edge1[0])
edge_data01 = g1.get_edge_data(*edge1[1])
edge_data10 = g2.get_edge_data(*edge2[0])
edge_data11 = g2.get_edge_data(*edge2[1])
if edge_data00 == edge_data10 and edge_data01 == edge_data11:
# print(edge_data00)
# print(edge_data01)
# print(edge_data10)
# print(edge_data11)
return True
elif edge_data01 == edge_data10 and edge_data00 == edge_data11:
# print(edge_data00)
#print(edge_data01)
# print(edge_data10)
# print(edge_data11)
return True
else:
# print(edge_data00)
# print(edge_data01)
# print(edge_data10)
# print(edge_data11)
return False
def add_to_corr(g1,g2,correspondances,matches,current_expansion=-1):
success=False
neighbors = g1.neighbors(correspondances[current_expansion][0])
neighbors2 = g2.neighbors(correspondances[current_expansion][1])
for u in neighbors:
print(correspondances[current_expansion][0],u, neighbors)
if u not in [x[0] for x in correspondances]:
print(u)
for v in matches[u]:
print(v,matches[u])
if v not in [x[1] for x in correspondances]:
print(v)
if v in neighbors2:
edge1 = get_edge_between_two_nodes(g1,correspondances[current_expansion][0],u)
edge2 = get_edge_between_two_nodes(g2,correspondances[current_expansion][1],v)
edgematch = edge_match(g1,g2,edge1,edge2)
if edgematch==True:
root_match = match_node_to_node(g1,g2,matches,u)
if v in [x[1] for x in root_match] and u not in [x[0] for x in correspondances]:
correspondances.append([u,v])
success=True
return correspondances,success
def fits_current_corr(g1,g2,corr,node1,node2):
edges_to_check_1 = []
edges_to_check_2 = []
for i in g1.neighbors(node1):
for j in corr:
if i in j and (node1,j[0]) not in edges_to_check_1:
matcher = j[1]
edges_to_check_1.append((node1,j[0]))
edges_to_check_2.append((node2,matcher))
is_match = True
for i in range(len(edges_to_check_1)):
if g1.get_edge_data(*edges_to_check_1[i])!=g2.get_edge_data(*edges_to_check_2[i]):
print(edges_to_check_1[i],edges_to_check_2[i])
print(g1.get_edge_data(*edges_to_check_1[i]))
print(g2.get_edge_data(*edges_to_check_2[i]))
is_match = False
return is_match
def match_node_to_node(g1,g2,matches,corr,key_node):
print("MATCH NODE TO NODE")
print(key_node)
root_match = []
if key_node not in [x[0] for x in corr]:
for i in matches[key_node]:
print("TESTING",str(i))
nodes_are_aligned=False
if i not in [x[1] for x in corr]:
nodes_are_aligned=True
edges1 = get_edges_at_node(g1,key_node)
edges2 = get_edges_at_node(g2,i)
for j in edges1:
node_match = False
edge1 = get_edge_between_two_nodes(g1,key_node,j)
for k in edges2:
edge2 = get_edge_between_two_nodes(g2,i,k)
edgematch = edge_match(g1,g2,edge1,edge2)
# print(edgematch,j,k)
if edgematch==True:
if fits_current_corr(g1,g2,corr,key_node,i)==True:
node_match=True
if node_match==False:
nodes_are_aligned=False
if nodes_are_aligned==True:
root_match.append(i)
print(root_match)
print(corr)
print("ENDING NODE TO NODE")
return (root_match,corr)
def align_iterative(g1,g2,alignable, matches,correspondances, seed = 0):
seed_g1,seed_g2 = -1,-1
iterator = 0
for i in alignable.keys():
for j in alignable[i]:
if iterator==seed:
seed_g1,seed_g2 = i,j
iterator = iterator + 1
changed = True
print(alignable)
while changed==True and len(correspondances)<len(g1.nodes()):
print("NEW LOOP")
aligned1=0
aligned2=0
changed=False
for i in alignable.keys():
print("DID FIRST PART")
if len(alignable[i])==1:
changed=True
correspondances.append([i,alignable[i][0]])
aligned1 = i
#aligned2 = alignable[i][0]
#for k in alignable.keys():
# for j in alignable[k]:
# if j == aligned2 :
# alignable[k].remove(j)
# alignable.pop(aligned1,None)
if changed==True:
for t in alignable.keys():
alignable[t],correspondances = match_node_to_node(g1, g2, matches, correspondances, t)
print(alignable)
print(correspondances)
if changed==False and len(alignable.keys())>0:
print("GOT HERE")
print(alignable)
topop = []
for kk in alignable.keys():
if len(alignable[kk])==0:
topop.append(kk)
for ll in topop:
alignable.pop(ll,None)
if seed_g1 not in [x[0] for x in correspondances] and seed_g2 not in [x[1] for x in correspondances]:
try:
correspondances.append([seed_g1,seed_g2])
except:
return correspondances,alignable,False
changed=True
aligned1 = seed_g1
aligned2 = seed_g2
# for k in alignable.keys():
# for j in alignable[k]:
# if j == aligned2 :
# alignable[k].remove(j)
# alignable.pop(aligned1,None)
for t in alignable.keys():
alignable[t],correspondances = match_node_to_node(g1, g2,matches, correspondances, t)
return correspondances,alignable,True
def align_graphs(g1,g2):
"""
takes as input two identical DiGraph objects
:param g1: DiGraph 1
:param g2: DiGraph 2
:param aln : proto-alignment dict. List of dicts
:return: Dictionary of alignment of nodes
"""
correspondances = []
matches = {}
match = DGM(g1,g2)
for xx in g1.nodes():
matches[xx]=[]
for yy in g2.nodes():
if match.syntactic_feasibility(xx,yy) == True:
matches[xx].append(yy)
min_match = 100
key_node = -1
for i in matches.keys():
if (len(matches[i]))<=min_match:
key_node = i
min_match = len(matches[i])
#align g1 on g2
correspondances = []
print(matches)
alignable = od()
for i in g1.nodes():
alignable[i],correspondances = match_node_to_node(g1,g2,matches,correspondances,i)
it = 0
seed = 0
base_alignable = alignable.copy()
base_corr = []
while len(correspondances)<len(g1.nodes()):
if len(correspondances)+ len(alignable)<len(g1.nodes()):
print("NEW SEED")
alignable = od()
correspondances= []
for i in g1.nodes():
alignable[i],correspondances = match_node_to_node(g1,g2,matches,correspondances,i)
seed = seed +1
correspondances,alignable,completed_aln = align_iterative(g1,g2,alignable,matches,[],seed)
it = it+1
if it>80:
raise ValueError('A very specific bad thing happened.')
correspondances,alignable,completed_aln = align_iterative(g1,g2,alignable,matches,correspondances,seed)
print(alignable)
print(correspondances)
while completed_aln==False:
seed = seed +1
correspondances,alignable,completed_aln = align_iterative(g1,g2,base_alignable,matches,correspondances,seed)
print(alignable)
print(correspondances)
return correspondances
"""
root_match = match_node_to_node(g1,g2,matches,key_node)
correspondances.append(root_match[0])
l1 = [i for i in nx.strongly_connected_components(g1)]
l2 = [i for i in nx.strongly_connected_components(g2)]
if len(l1)==1:
while len(correspondances)<len(g2.nodes()):
deadends=len(correspondances)-1
corr,edgematch = add_to_corr(g1,g2,correspondances,matches,deadends)
print(corr,edgematch)
while edgematch==False:
deadends=deadends -1
if (deadends)<0:
exit(0)
else:
corr,edgematch = add_to_corr(g1,g2,correspondances,matches,deadends)
print(deadends,corr,edgematch)
#last1,last2 = -1,-1
#for i in g1.nodes():
## if i not in [x[0] for x in correspondances]:
# last1 = i
# for i in g2.nodes():
# if i not in [x[0] for x in correspondances]:
# last2 = i
# correspondances.append([last1,last2])
return correspondances
"""
def get_seq_len(PDB):
print(PDB)
chain = PDB.split(".")[1]
PDB_name = PDB.split(".")[0].upper() + ".nxpickled"
try:
g = pickle.load(open("../models/all_graphs_pickled/" + PDB_name, "rb"))
chain_nodes = []
for node in g.nodes(data=True):
#print(node)
if node[0][0] == chain:
nodecode = (node[0][1])
chain_nodes.append(int(nodecode))
return len(chain_nodes)
except FileNotFoundError:
print("File not found for ", PDB_name)
return 1000000
finally:
print()
def align_multiple_graphs(g_list):
matches = {}
etalon = g_list[0]
for z in etalon.nodes():
matches[z]=[]
#g_list.remove(g_list[0])
for i in g_list:
corr = align_graphs(etalon,i)
for j in corr:
matches[j[0]].append(j[1])
return matches
motif = pickle.load(open("../all_catalogue/min5_3dmotif_graphs.cPickle",'rb'))
#os.chdir('..')
#os.chdir('Src')