Commit ea1be472 authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

added rfam ss

parent ae0cdc4b
......@@ -239,7 +239,33 @@ def define_acceptable_columns(counts):
acceptable_cols.remove(i)
return acceptable_cols
def find_good_columns(module_columns_by_family, family):
def get_bps_from_ss(ss, columns):
starts = []
ends = []
ss=list(ss)
for i in range(len(list(ss))):
if ss[i]=="(" or ss[i]=="<" or ss[i]=="{":
starts.append(i)
if ss[i]==")" or ss[i]==">" or ss[i]=="}":
ends.append(i)
#struct = [(columns[starts[i]],columns[ends[i]]) for i in range(min(len(starts),len(ends)))]
if min(len(starts),len(ends))==0:
return ()
else:
struct = [(starts[i],ends[i]) for i in range(min(len(starts),len(ends)))]
return struct
def verify_columns_ss(columns,family,module, graphs):
rfam_ss=get_ss_from_rfam_family(family,columns)
rfam_bps = get_bps_from_ss(rfam_ss,columns)
graph_ss=get_ss_from_graph(graphs[module][0])
print("COMPARING :",rfam_bps,graph_ss)
if rfam_ss==graph_ss:
return True
else:
return False
def find_good_columns(module_columns_by_family, family,module,graphs):
good_cols=[]
cols = []
module_length = -1
......@@ -266,9 +292,19 @@ def find_good_columns(module_columns_by_family, family):
good_cols_for_this_node.append(i)
good_cols.append(good_cols_for_this_node)
#print(counts)
return good_cols
def check_columns_ss(good_cols,family,module,graphs):
final_cols = [[] for x in good_cols]
for i in range(len(good_cols[0])):
this_cols = [x[i] for x in good_cols]
print("GETTING SS FOR COLUMNS:", this_cols)
it_fits = verify_columns_ss(this_cols, family, module,graphs)
if it_fits==True:
for col in range(len(final_cols)):
final_cols[col].append(this_cols[col])
good_cols = final_cols
return good_cols
def most_common(lst):
return max(set(lst), key=lst.count)
......@@ -343,11 +379,21 @@ def sanity_check(struct_seqs,exp_seqs):
return -1000
def get_ss_from_rfam_family(family):
def get_ss_from_rfam_family(family,cols):
ss = ""
fname = "RF"+format(family,"05d")+".stockholm.txt"
with open(fname,"r") as f:
lines = f.readlines()
full_ss=lines[-3].split()[-1]
print("GENERATING SS")
for i in cols:
print(full_ss[i])
ss = ss+full_ss[i]
print('SS for columns : ')
print(cols,ss)
return ss
def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addresses, CURRENT_MODULE):
def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addresses, CURRENT_MODULE,graphs):
newfam=True
written = []
seq_ranges = {}
......@@ -426,7 +472,7 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
#print("---------")
#print(module_columns_by_family)
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY)
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY, CURRENT_MODULE,graphs)
if [-1000]==good_cols[0]:
return []
print("GOOD COLUMNS",good_cols)
......@@ -472,9 +518,10 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
start_pos = seq_ranges[current_PDB][0]
this_example_columns.append(gapped_column_from_ungapped_nuc_pos(gapped_seq,ungapped_seq,pos-(start_pos)-alignment_is_right))
module_columns_by_family[CURRENTLY_STUDIED_FAMILY][current_PDB]=(this_example_columns,nucs_from_fred(positions_for_this_pdb))
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY)
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY,CURRENT_MODULE,graphs)
if good_cols==[] or [-1000] in good_cols:
return -1
good_cols = check_columns_ss(good_cols,CURRENTLY_STUDIED_FAMILY,CURRENT_MODULE,graphs)
aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
for PDB in module_columns_by_family[CURRENTLY_STUDIED_FAMILY]:
columns = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB][0]
......@@ -521,8 +568,8 @@ def sanity_check_existing_seqs():
pickle.dump(fixed_seqs,open("test3_ram.cPickle","wb"))
if __name__ == "__main__":
sanity_check_existing_seqs()
"""
#sanity_check_existing_seqs()
rfam = pickle.load(open("RFAM_databse.cPickle","rb"))
rfam_by_PDB = {}
number = 0
......@@ -536,10 +583,10 @@ if __name__ == "__main__":
rfam_by_PDB[PDB_id].append(i)
aligned_modulegraphs = pickle.load(open("rna3dmotif_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("rna3dmotif_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("rna3dmotif_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("rna3dmotif_one_of_each_graph.cPickle",'rb'))
aligned_modulegraphs = pickle.load(open("test_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("test_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("test_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("test_one_of_each_graph.cPickle",'rb'))
rfam_extra_sequences = []
format_aligned_modulegraphs = []
format_PDB_positions = []
......@@ -550,7 +597,8 @@ if __name__ == "__main__":
matches={}
positions = {}
module_PDB_addresses = {}
interesting = [2, 8, 9, 14, 20, 28, 36, 59, 113, 127, 133, 150, 162, 194, 195]
#interesting = [2, 8, 9, 14, 20, 28, 36, 59, 113, 127, 133, 150, 162, 194, 195]
interesting = range(len(graphs))
test_modules = interesting
a = get_ss_from_graph(graphs[2][0])
rfam_module_seqs = {}
......@@ -571,7 +619,7 @@ if __name__ == "__main__":
if matches[module][family_match][0]>10:
#if family_match==2541:
#print("ZIPCODE",module_addresses)
module_seqs = get_module_seqs_for_family(family_match,matches,module_addresses,module)
module_seqs = get_module_seqs_for_family(family_match,matches,module_addresses,module,graphs)
if module_seqs==-1:
print("FAILED TO FIND NEW SEQUENCES FOR FAMILY ",family_match)
continue
......@@ -585,11 +633,15 @@ if __name__ == "__main__":
#rfam_name = "Rfam_module_seqs_"+ str(module) + ".cPickle"
print("OUTPUT")
for i in rfam_extra_sequences:
final_sequences = []
for i in range(len(rfam_extra_sequences)):
print(i)
pickle.dump(rfam_extra_sequences,open("test2_rfam.cPickle",'wb'))
pickle.dump(format_aligned_modulegraphs,open("test2_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(format_graphs,open("test2_one_of_each_graph.cPickle",'wb'))
pickle.dump(format_PDB_names,open("test2_PDB_names.cPickle",'wb'))
pickle.dump(format_PDB_positions,open("test2_PDB_positions.cPickle",'wb'))
"""
new_seq = rule_out_impossible_seqs(graphs[interesting[i]][0],rfam_extra_sequences[i])
final_sequences.append(new_seq)
#pickle.dump(rfam_extra_sequences,open("test3_rfam.cPickle",'wb'))
pickle.dump(final_sequences,open("test4_rfam.cPickle",'wb'))
pickle.dump(format_aligned_modulegraphs,open("test4_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(format_graphs,open("test4_one_of_each_graph.cPickle",'wb'))
pickle.dump(format_PDB_names,open("test4_PDB_names.cPickle",'wb'))
pickle.dump(format_PDB_positions,open("test4_PDB_positions.cPickle",'wb'))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment