Commit 51e70e5e authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

added rfam ss

parent ba8ecbd6
......@@ -364,7 +364,7 @@ for j in motif:
if not_boring==False:
continue
size = len(list(j.nodes()))
if size<12 or size>25:
if size<6 or size>25:
continue
for k in j.edges():
print(k)
......@@ -398,18 +398,18 @@ for j in motif:
main_graph = i[0][1]
this_pdb = [kk for kk in [zzz[0] for zzz in i[1]]]
pdb_names = set([pdb[0:4] for pdb in this_pdb])
if len(pdb_names)<10:
if len(pdb_names)<20:
continue
for pdb in set(this_pdb):
if "." not in pdb:
continue
shortened_pdb = ".".join([pdb.split(".")[0],pdb.split(".")[1]])
if get_seq_len(pdb) in range(30,300):
if get_seq_len(pdb) in range(50,30000):
if shortened_pdb not in short_PDBs:
short_PDBs.append(shortened_pdb)
else:
print("TOO LONG!")
if len(short_PDBs)<10 or len(short_PDBs)>10000:
if len(short_PDBs)<20 or len(short_PDBs)>10000:
continue
#this_pdb = [kk for kk in [zzz[0][0] for zzz in i['names']]]
positions_of_this_pdb = [list(z.nodes()) for z in i[0]]
......@@ -433,10 +433,10 @@ for j in motif:
print("TERRIBLE")
print(main_graph.nodes)
continue
pickle.dump(full_aln,open("full5_rna3dmotif_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(graphs,open("full5_rna3dmotif_one_of_each_graph.cPickle",'wb'))
pickle.dump(PDBs,open("full5_rna3dmotif_PDB_names.cPickle",'wb'))
pickle.dump(motif_pos,open("full5_rna3dmotif_PDB_positions.cPickle",'wb'))
pickle.dump(full_aln,open("all_full3_rna3dmotif_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(graphs,open("all_full3_rna3dmotif_one_of_each_graph.cPickle",'wb'))
pickle.dump(PDBs,open("all_full3_rna3dmotif_PDB_names.cPickle",'wb'))
pickle.dump(motif_pos,open("all_full3_rna3dmotif_PDB_positions.cPickle",'wb'))
print("NUMBER OF MOTIFS")
......
......@@ -4,26 +4,31 @@ import matplotlib.pyplot as plt
from altair import *
import matplotlib.patches as mpatches
import networkx as nx
ALL_RESULTS = []
excellent = [0, 3, 7, 12, 13, 16, 21, 22, 24, 26, 30, 31, 35, 36, 37, 39, 43, 44, 45, 46, 47, 50, 53, 54, 55, 59, 60, 61, 63, 65, 67, 72, 76, 79, 81, 86, 92, 93, 95, 98, 101, 104, 108, 110, 112, 116, 117, 118, 124, 127]
failed = [8,10,13,16]
'''
seqs = pickle.load(open("../models/carlos_rna3dmotif_one_of_each_graph.cPickle","rb"))
print(len(seqs))
for i in seqs:
print(len(i))
exit()
'''
pretty_bad = [10, 15, 20, 29, 32, 34, 42, 49, 58, 69, 80, 88, 97, 100, 105, 109, 111, 125, 126, 128]
NUL = [8, 10, 13, 16, 15, 20, 42, 80, 97, 100, 109]
failed =pretty_bad
graphs = pickle.load(open("../models/all_full3_rna3dmotif_one_of_each_graph.cPickle","rb"))
total = 0
for i in graphs:
if len(i)>19:
total = total+1
graphs = pickle.load(open("../models/full_rna3dmotif_one_of_each_graph.cPickle","rb"))
print("N TOTAL:",total)
#for i in range(len(graphs)):
for i in failed:
for i in excellent:
labels = {}
elabels = {}
g0 = graphs[i][0]
plt.show()
pos = nx.circular_layout(g0)
nx.draw_networkx_nodes(g0, pos, nodelist=g0.nodes(), node_color='red', node_size=500)
print(g0.nodes(data=True))
print(g0.edges(data=True))
for index,i in enumerate(list(g0.nodes())):
print(g0.nodes(data=True))
labels[i] = list(g0.nodes(data=True))[index][1]["nuc"]
nx.draw_networkx_labels(g0,pos,labels)
......@@ -33,14 +38,17 @@ for i in failed:
nx.draw_networkx_edges(g0,pos,edgelist=g0.edges(),edge_color='green',width=2)
plt.show()
exit()
bayespairing_prediction = []
actual_module = []
moip_motifs = []
to_sort = []
#a = pickle.load(open("../models/rna3dmotif_june_crossval_results.cPickle", "rb"))
a = pickle.load(open("../models/full3_rna3dmotif_june_crossval_results.cPickle", "rb"))
#a = pickle.load(open("../models/long_seqs_full3_rna3dmotif_june_crossval_results.cPickle", "rb"))
a = pickle.load(open("../models/all_full3_rna3dmotif_june_crossval_results.cPickle", "rb"))
zeros = []
weak = []
for motif in a:
motif_scores = []
......@@ -57,12 +65,12 @@ for motif in a:
bayespairing_prediction.append(0.00001)
if len(seq_scores) == 0:
seq_scores = [0]
motif_scores.append(max(max(seq_scores),0.02))
motif_scores.append(max(max(seq_scores),0.00))
if len(motif_scores) > 0:
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean > -1 and len(motif_scores) > 1:
if motif_mean>-1 and len(motif_scores)>19:
moip_motifs.append(motif)
print("M-", motif, "~", str(len(motif_scores)), " : ", str(motif_mean))
to_sort.append((len(motif_scores), motif_mean, motif))
......@@ -80,12 +88,12 @@ for motif in a:
seq_scores.append(score)
if len(seq_scores)==0:
seq_scores = [0]
motif_scores.append(max(max(seq_scores),0.02))
motif_scores.append(max(max(seq_scores),0.00))
if len(motif_scores) > 0 :
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean>-1 and len(motif_scores)>1:
if motif_mean>-1 and len(motif_scores)>19:
moip_motifs.append(motif)
print("M-",motif, "~",str(len(motif_scores)), " : ", str(motif_mean))
to_sort.append((len(motif_scores), motif_mean,motif))
......@@ -103,12 +111,19 @@ for motif in a:
seq_scores.append(score)
if len(seq_scores)==0:
seq_scores = [0]
motif_scores.append(max(max(seq_scores),0.02))
motif_scores.append(max(max(seq_scores),0.00))
if len(motif_scores) > 0 :
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean>-1 and len(motif_scores)>1:
# if motif_mean<=0.02 and len(motif_scores)>19:
# failed.append(motif)
if motif_mean<=0.2 and len(motif_scores)>19:
weak.append(motif)
if motif_mean>0.9:
excellent.append(motif)
if motif_mean>-1 and len(motif_scores)>19:
ALL_RESULTS.append(motif_mean)
moip_motifs.append(motif)
print("M-",motif, "~",str(len(motif_scores)), " : ", str(motif_mean))
to_sort.append((len(motif_scores), motif_mean,motif))
......@@ -145,7 +160,7 @@ ax.set_xlabel("Number of unique structures in test set")
ax.set_ylabel("Mean proportion of correctly predicted base pairs")
#ax.set_title("Mean prediction, only top candidate, leave-one-out CV")
#ax.set_title("Mean prediction score, 16 most populated rna3dmotif modules, leave-one-out CV")
ax.set_title("Mean prediction score, rna3dmotif subset, size 12 to 25, minimum 10 unique structures, leave-one-out CV")
ax.set_title("Mean prediction score, rna3dmotif full dataset, leave-one-out CV")
plt.ylim(0,1)
plt.legend()
plt.show()
......@@ -178,9 +193,31 @@ for motif in a:
print("M-", motif, "~", str(len(motif_scores)), " : ", str(motif_mean))
to_sort.append((len(motif_scores), motif_mean, motif))
plt.scatter(bayespairing_prediction,actual_module, alpha=0.25)
plt.xlim(-1,35)
plt.title("Bayespairing prediction likelihood score and prediction accuracy, very large modules")
signal = 0
no_signal = 0
for i in range(len(bayespairing_prediction)):
if 25<bayespairing_prediction[i]<100:
if actual_module[i]>=0.5:
signal=signal+1
else:
no_signal=no_signal +1
print(signal, no_signal)
print(float(signal)/(signal+no_signal))
print(float(no_signal)/(signal+no_signal))
plt.scatter(bayespairing_prediction,actual_module, alpha=0.15)
plt.xlim(25,40)
plt.title("Bayespairing prediction accuracy for high likelihood scores")
plt.xlabel("Bayespairing prediction likelihood score")
plt.ylabel("Prediction accuracy")
plt.show()
\ No newline at end of file
plt.show()
#print(weak)
#print(failed)
plt.hist(ALL_RESULTS,bins=20)
plt.title("Mean accuracy distribution, best of top 5 candidates, full rna3dmotif, leave-one-out CV")
plt.xlabel("Precision")
plt.ylabel("Counts")
plt.show()
print("EXCELLENT")
print(excellent)
\ No newline at end of file
import pickle
import os
crossval_results = []
for i in range(0,218):
crossval_name = "crossval_"+str(i)
for i in range(0,54):
crossval_name = "../models/all_sequences_full3_rna3dmotif_june_crossval_"+str(i)
a = pickle.load(open(crossval_name,"rb"))
crossval_results.append(a)
pickle.dump(crossval_results,open("longseqs_crossval.cPickle","wb"))
pickle.dump(crossval_results,open("temp_crossval.cPickle","wb"))
......@@ -6,10 +6,10 @@ import BayesPairing
import random
from Bio import SeqIO
from random import shuffle
DATASET_NAME = "full5_rna3dmotif"
graphs = pickle.load(open("../models/full5_rna3dmotif_one_of_each_graph.cPickle", "rb"))
pdbs = pickle.load(open("../models/full5_rna3dmotif_PDB_names.cPickle", "rb"))
pdb_positions = pickle.load(open("../models/full5_rna3dmotif_PDB_positions.cPickle", "rb"))
DATASET_NAME = "all_full3_rna3dmotif"
graphs = pickle.load(open("../models/all_full3_rna3dmotif_one_of_each_graph.cPickle", "rb"))
pdbs = pickle.load(open("../models/all_full3_rna3dmotif_PDB_names.cPickle", "rb"))
pdb_positions = pickle.load(open("../models/all_full3_rna3dmotif_PDB_positions.cPickle", "rb"))
# print(pdbs)
fasta_path = "pdb_seqres.txt"
......@@ -157,7 +157,7 @@ def get_seq_ss(PDBid):
# print(PDB)
# print("../all_graphs_pickled/" + PDB + ".nxpickled")
try:
g = pickle.load(open("all_graphs_pickled/" + PDB + ".nxpickled", "rb"))
g = pickle.load(open("../models/all_graphs_pickled/" + PDB + ".nxpickled", "rb"))
except FileNotFoundError:
print("PDB FILE NOT FOUND")
return ("", 0)
......@@ -335,18 +335,19 @@ def cross_validation(modules_to_test):
done = []
for ind, j in enumerate(pdbs_in_modules[i]):
print(motif_positions[ind])
last = motif_positions[ind][1][-1]
first = motif_positions[ind][1][0]
last = sorted(motif_positions[ind][1])[-1]
first = sorted(motif_positions[ind][1])[0]
#print("PDB POSITIONS")
#print(first, last)
if j[0:4].upper() in PDBlist and j[0:4] not in bugged and j[0:4] not in done:
done.append(j[0:4])
print("CURRENT PDB:",j)
seq, ss = get_seq_ss(j)
print("LENGTH OF SEQUENCE:",len(seq))
pdb_len = len(seq)
print("SEQUENCE :",seq)
if pdb_len in range(0, 300) and "T" not in seq and "-" not in seq:
if pdb_len in range(10, 300) and "T" not in seq and "-" not in seq:
#continue
done.append(j[0:4])
#seq = shuffle_seq(seq)
this_mod_seqs = [((j, seq, ss))]
# else:i
......@@ -360,22 +361,23 @@ def cross_validation(modules_to_test):
for k in scores:
print(scores[k])
crossval_results[i].append(scores[k])
else:
continue
elif pdb_len>300:
#continue
new_seq, new_ss = [], []
offset = 0
if last - first < 400:
helices = BayesPairing.find_loops(ss)[1]
print("HELICES")
print(helices)
for helix in helices:
if first > helix[0] and last < helix[1]:
new_seq = seq[helix[0]:helix[1]]
new_ss = ss[helix[0]:helix[1]]
offset = helix[0]
break
else:
continue
offset = first-100
if last - first < 200:
new_seq = seq[first-100:last+100]
#helices = BayesPairing.find_loops(ss)[1]
#print("HELICES")
#print(helices)
#for helix in helices:
# if first > helix[0] and last < helix[1]:
# new_seq = seq[helix[0]:helix[1]]
# new_ss = ss[helix[0]:helix[1]]
# offset = helix[0]
# break
# else:
# continue
print("NEW SEQ LENGTH")
print(len(new_seq))
if len(new_seq) > 500:
......@@ -385,7 +387,7 @@ def cross_validation(modules_to_test):
# offset = first-split
# new_seq = seq[offset:last+(filler-split)]
# new_ss = ss[first-split:last+(filler-split)]
this_mod_seqs.append((j, shuffle_seq(new_seq), new_ss))
this_mod_seqs.append((j, new_seq, new_ss))
# else:i
# print(j[0:4].upper())
module_seqs[i] = this_mod_seqs
......@@ -399,13 +401,13 @@ def cross_validation(modules_to_test):
print(scores[k])
crossval_results[i].append(scores[k])
cv_fn = "full5_rna3dmotif_june_crossval_" + str(i)
cv_fn = "all_sequences_full3_rna3dmotif_june_crossval_" + str(i)
pickle.dump(crossval_results[i], open(cv_fn, "wb"))
for i in crossval_results:
for j in crossval_results[i]:
print(j)
#print(crossval_results)
pickle.dump(crossval_results, open("full6_rna3dmotif_june_crossval_results.cPickle", "wb"))
pickle.dump(crossval_results, open("all_full3_rna3dmotif_june_crossval_results.cPickle", "wb"))
print('FINAL RESULTS')
for i in crossval_results:
print("RESULTS FOR MODULE :", i)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment