Commit f554cf23 authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

Updated the mapping of PDB structures onto Rfam alignments.

parent ceff9cea
......@@ -12,7 +12,7 @@ import math
import make_BN_from_carnaval as makeBN
def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000):
def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000, leave_out_sequence=False, left_out_sequence=""):
pdbs = pickle.load(open("../models/" + dataset + "_PDB_names.cPickle", "rb"))
graphs = pickle.load(open("../models/" + dataset + "_one_of_each_graph.cPickle", "rb"))
return_dict = {}
......@@ -23,11 +23,14 @@ def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000):
this_graph = graphs[mod][0]
#print("Creating Bayesian Network", flush=True)
makeBN.call_makeBN(mod, dataset, left_out)
makeBN.call_makeBN(mod, dataset, left_out, leave_out_seq=leave_out_sequence,left_out_seq=left_out_sequence)
#print("Bayesian Network created", flush=True)
motif = pickle.load(open(str("../models/"+dataset + "_BN_" + str(mod)), "rb"))
pickle_name = dataset + "_samp" + str(mod) + ".cPickle"
print("LEFT OUT", left_out)
if left_out != "NONE":
print("LEFT OUT", left_out, "building new model without")
sample_df, sample_dict, struct_list = bs.setup_sampling(motif, p)
scores, return_struct, max = bs.get_best_struct(motif, struct_list, sm)
pickle.dump(struct_list, open("../models/"+pickle_name, "wb"))
......@@ -107,8 +110,11 @@ def returner(scores, seq, ss="", m=8, n=5, k=1, sscons=2):
nocons_score = testSS.call_rnafold(seq)
cons_score = testSS.call_rnafold(seq, cons_seq)
quotient = (nocons_score / cons_score)
current_score = current_score + float((float(1) / float(k)) * math.log(quotient))
seq_score=current_score
current_score = seq_score + float((float(1) / float(k)) * math.log(quotient))
final_top[str(poz)] = current_score
#if current_score>20:
# print(position_subsets,current_score,nocons_score,cons_score,quotient, seq_score,math.log(quotient))
# end1 = time.time()
# print("FOLDING TIME")
# print(end1-start1)
......
from Bio import SeqIO
import numpy as np
from numpy.random import choice
def get_consensus(module_seqs):
consensus = ""
consensus = []
for i in range(len(module_seqs[0])):
scores = [0,0,0,0]
alph = list("ACGU")
......@@ -17,10 +20,42 @@ def get_consensus(module_seqs):
#print(scores)
#print(scores.index(max(scores)))
#print(np.argmax(scores))
consensus = consensus + alph[scores.index(max(scores))]
tot = sum(scores)
consensus.append([x/tot for x in scores])
return consensus
def rfam_to_module(positions, family_file,output_name):
def rfam_to_module(positions, family_file,output_name, as_list=False, seqs=[]):
alph = ["A","C","G","U"]
if as_list:
module_seqs = []
for seq in seqs:
module_seqs.append("".join(seq))
cons = get_consensus(module_seqs)
fmodule_seqs = []
for s in module_seqs:
this_mod = ""
for col in range(len(s)):
if str(s[col]) != "-":
this_mod = this_mod + str(s[col])
else:
this_mod = this_mod + alph[choice(np.array([0,1,2,3]),p=cons[col])]
fmodule_seqs.append(this_mod)
with open(output_name, "w") as f:
for j in range(len(fmodule_seqs)):
f.write(">module_seq")
f.write(str(j))
f.write("\n")
f.write(fmodule_seqs[j])
f.write("\n")
return
module_seqs = []
modules_handles = []
for record in SeqIO.parse(family_file,"fasta"):
......@@ -40,7 +75,7 @@ def rfam_to_module(positions, family_file,output_name):
if str(s[col]) != "-":
this_mod = this_mod + str(s[col])
else:
this_mod = this_mod + cons[col]
this_mod = this_mod + alph[choice(np.array([0,1,2,3]),p=cons[col])]
fmodule_seqs.append(this_mod)
#print(module_seqs)
......@@ -48,7 +83,7 @@ def rfam_to_module(positions, family_file,output_name):
#print(fmodule_seqs)
with open(output_name,"w")as f:
with open(output_name,"w") as f:
for j in range(len(modules_handles)):
f.write(">")
f.write(modules_handles[j])
......
......@@ -312,6 +312,7 @@ def build_regex(position_subsets,component_distance, iii, convert, positions, mc
#dis = component_distance[junction]
if len(re_call) > 0:
dis = component_distance[junction]
#print(dis)
re_call = re_call + r")([ACGU]{"+str(dis[0]-1)+","+str(dis[1]-1)+"})"
junction=junction+1
......@@ -423,6 +424,16 @@ def adjust_aln(best_struct,positions,better_positions):
new_best_struct = (full_sample,new_nucs_for_each_nodes,best_of_sample)
return new_best_struct
def rotate_list(lst):
    """Rotate a sequence left by one position.

    The first element moves to the end; works for any sliceable
    sequence (list, str, tuple) and returns the same type.
    An empty sequence is returned unchanged.
    """
    head, tail = lst[:1], lst[1:]
    return tail + head
def get_all_rotations(lst):
    """Return every cyclic rotation of *lst*, starting with *lst* itself.

    For a sequence of length n the result holds n entries (the original
    object first, then n-1 successive left rotations).  An empty input
    yields ``[lst]`` so the result is never an empty list.
    """
    rotations = [lst]
    current = lst
    for _ in range(len(lst) - 1):
        # Shift left by one: first element moves to the back.
        current = current[1:] + current[:1]
        rotations.append(current)
    return rotations
def get_position_permutations(position_subsets,component_distance):
final_position_permutations = []
to_permute = []
......@@ -440,7 +451,7 @@ def get_position_permutations(position_subsets,component_distance):
for position in range(len(component_distance)+1):
if position not in set(permuted):
not_permuted.append(position)
permutations = list(itertools.permutations(to_permute))
permutations = get_all_rotations(to_permute)
for perm in permutations:
this_perm = []
perm_counter = 0
......@@ -581,13 +592,13 @@ def seq_to_struct(
h= int(1.5*b)
j = int(1.2*b)
reduced_component_distance.append([a,c])
reduced_component_distance.append([a,b])
halfway_component_distance.append([a,d])
e_distance.append([a,e])
f_distance.append([a,f])
g_distance.append([a,c])
h_distance.append([a,h])
j_distance.append([a,j])
j_distance.append([a,b])
permutations = get_permutations(position_subsets,component_distance,iii)
regex_list = []
for perm in permutations:
......@@ -611,7 +622,7 @@ def seq_to_struct(
regex_list.append([res6,position_subsets,node_dict])
regex_list.append([res7,position_subsets,node_dict])
regex_list.append([res8,position_subsets,node_dict])
#("REGEX:",res)
#print("REGEX:",regex_list)
output = []
final_model = {}
import time
......
......@@ -214,7 +214,7 @@ def plot_module(graphs,name,latex=False):
BP = mpatches.Patch(color="black", label="Backbone")
SP = mpatches.Patch(color="red", label="Stacking")
# plt.legend(handles=[NCP,CP,BP,SP],prop={'size': 16})
#plt.show()
plt.show()
plt.savefig("../Graphs/" + name + "graph" + str(m) + ".png", format="png")
plt.clf()
......
......@@ -190,16 +190,17 @@ def make_graph_from_carnaval(g, aln_dict):
#plt.show()
return motif
def call_makeBN(mod,dataset,left_out):
def call_makeBN(mod,dataset,left_out, leave_out_seq = False, left_out_seq = ""):
current_ID = mod
excluded = left_out
bayesname = dataset+"_BN_"+str(current_ID)
#if excluded != "NONE":
# removecall = "rm " +"../models/"+bayesname
# subprocess.call(removecall, shell=True)
if excluded != "NONE":
removecall = "rm " +"../models/"+bayesname
subprocess.call(removecall, shell=True)
#if not os.path.isfile(bayesname), changed for cross-validation; always remake the model
if not os.path.isfile("../models/"+bayesname):
print("building and sampling bayesian network")
aln_list = pickle.load(open("../models/"+dataset + "_aligned_modulegraphs.cPickle",'rb'))
g_list = pickle.load(open("../models/"+dataset + "_one_of_each_graph.cPickle",'rb'))
pdb_names = pickle.load(open("../models/"+dataset + "_PDB_names.cPickle",'rb'))
......@@ -207,8 +208,9 @@ def call_makeBN(mod,dataset,left_out):
if excluded!="NONE":
excluded_indexes = []
for pdb in range(len(pdb_names[mod])):
if excluded in pdb_names[mod][pdb]:
excluded_indexes.append(pdb)
motif_seq = "".join([x[1]["nuc"] for x in sorted(g_list[mod][pdb].nodes(data=True))])
if excluded in pdb_names[mod][pdb] or (leave_out_seq and motif_seq == left_out_seq):
excluded_indexes.append(pdb)
if 0 in excluded_indexes:
excluded_indexes.remove(0)
g = g_list[current_ID]
......
......@@ -5,6 +5,8 @@ import networkx as nx
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Align.Applications import ClustalwCommandline
from Bio.Align.Applications import ClustalOmegaCommandline
import os
from bs4 import BeautifulSoup
from bs4.element import Comment
......@@ -14,6 +16,9 @@ import time
import pickle
import requests
rfam = pickle.load(open("../models/RFAM_databse.cPickle", "rb"))
def get_from_rfam(fam):
url = "http://rfam.org/family/"+fam+"/alignment/stockholm"
r = requests.get(url)
......@@ -33,61 +38,61 @@ def get_rfam_alignment(family):
def get_PDB_sequence(PDBid):
#print("PDB")
#print(PDBid)
print(PDBid)
PDB, chain = PDBid.split(".")
#print(PDB)
#print("../all_graphs_pickled/" + PDB + ".nxpickled")
# print(PDB)
# print("../all_graphs_pickled/" + PDB + ".nxpickled")
try:
g = pickle.load(open("../models/all_graphs_pickled/" + PDB + ".nxpickled", "rb"))
except FileNotFoundError:
print("PDB FILE NOT FOUND")
return ("",0)
seq=""
nodes= []
return ("", 0)
seq = ""
nodes = []
for node in g.nodes(data=True):
#print(node)
#print(node[0][0],chain)
if node[0][0]==chain:
# print(node)
# print(node[0][0],chain)
if node[0][0] == chain:
nodecode = (node[0][1])
nodes.append((int(nodecode),node[1]["nt"]))
nodes.append((int(nodecode), node[1]["nt"]))
sortednodes = sorted(list(nodes))
nuc_by_node={}
nuc_by_node = {}
missing_nuc = False
#print("NODES")
# print("NODES")
numericals = [x[0] for x in sortednodes]
decalage = 0
if 1 not in numericals:
decalage=decalage+1
sortednodes.append((1,"N"))
decalage = decalage + 1
sortednodes.append((1, "N"))
sortednodes = sorted(sortednodes)
#missing_nuc=True
#decalage = decalage +1
# missing_nuc=True
# decalage = decalage +1
newnodes = []
#for i in sortednodes:
# for i in sortednodes:
# newnodes.append((i[0],i[1]))
#sortednodes = sorted(list(newnodes))
# sortednodes = sorted(list(newnodes))
numericals = [x[0] for x in sortednodes]
#print("MISSING 1", PDBid)
#print(numericals)
# print("MISSING 1", PDBid)
# print(numericals)
#
#sortednodes=sorted(sortednodes)
#numericals = [x[0]-1 for x in sortednodes]
#numericals.insert(0,0)
#else:
#print("NOT MISSING", PDBid)
# sortednodes=sorted(sortednodes)
# numericals = [x[0]-1 for x in sortednodes]
# numericals.insert(0,0)
# else:
# print("NOT MISSING", PDBid)
for i in sortednodes:
nuc_by_node[i[0]]=i[1]
#print(sortednodes)
for i in range(1,int(sortednodes[-1][0])+1):
nuc_by_node[i[0]] = i[1]
# print(sortednodes)
for i in range(1, int(sortednodes[-1][0]) + 1):
if i not in numericals:
"NOT IN NODES"
seq = seq+"-"
seq = seq + "-"
else:
seq = seq + nuc_by_node[i]
ss = g.graph['ss']
#print(seq)
#print("MISSING_NUC",PDBid,missing_nuc)
return(seq,decalage)
print(seq)
# print("MISSING_NUC",PDBid,missing_nuc)
return (seq, 0)
def match_rfam_to_PDB(PDB, family, range="",written=[],newfam=False):
if newfam==True:
......@@ -108,9 +113,9 @@ def match_rfam_to_PDB(PDB, family, range="",written=[],newfam=False):
# print("VALID")
positions = rfam[family][PDB.split(".")[0]][1]
seq_range= [int(i) for i in positions.split(" - ")]
#print(seq_range)
seq=seq[seq_range[0]-1:seq_range[1]-1]
#seq_range= [int(i) for i in positions.split(" - ")]
#print("SEQ RANGE",seq_range)
#seq=seq[seq_range[0]-1:seq_range[1]-1]
#print(seq)
#print(written)
......@@ -133,7 +138,7 @@ def match_rfam_to_PDB(PDB, family, range="",written=[],newfam=False):
# break
#if len(gapped_seq)==0:
# exit("NO MATCH FOUND")
return gapped_seq,written, seq_range,decalage
return gapped_seq,written, 0,0
def get_matches(rfam,rfam_by_PDB, PDBs):
......@@ -168,21 +173,23 @@ def get_matches(rfam,rfam_by_PDB, PDBs):
return matches
def gapped_column_from_ungapped_nuc_pos(gapped,ungapped,ungapped_pos):
# print(gapped)
#print(ungapped)
#print("CONVERTING GAPPED TO UNGAPPED")
#print("POSITION ",ungapped_pos)
gapped_counter=0
ungapped_counter = 0
# --01-2
gapped_counter=-1
ungapped_counter = -1
for i in range(len(gapped)):
if ungapped_counter==ungapped_pos+1:
#print("GAPPED",gapped_counter-1)
return gapped_counter-1
if gapped[i]=="-":
gapped_counter=gapped_counter+1
else:
gapped_counter=gapped_counter+1
ungapped_counter = ungapped_counter+1
if ungapped_counter==ungapped_pos:
#print("GAPPED",gapped_counter-1)
return gapped_counter
return -1000
def pos_from_fred(fred_list):
......@@ -260,7 +267,7 @@ def find_good_columns(module_columns_by_family, family):
cols = []
module_length = -1
for col in module_columns_by_family[family]:
module_length = len(module_columns_by_family[family][col][0])
module_length = len(module_columns_by_family[family][col])
break
# print(module_length)
for node_number in range(module_length):
......@@ -268,7 +275,7 @@ def find_good_columns(module_columns_by_family, family):
counts={}
total = 0
for example in module_columns_by_family[family]:
this_col = module_columns_by_family[family][example][0][node_number]
this_col = module_columns_by_family[family][example][node_number]
#print(this_col)
if this_col in counts:
counts[this_col]= counts[this_col]+1
......@@ -383,6 +390,25 @@ def get_ss_from_rfam_family(family,cols):
print(cols,ss)
return ss
def alignment_column_from_seq_positions(positions_for_this_pdb, seq):
    """Translate ungapped sequence positions into gapped alignment columns.

    *seq* is an aligned (possibly gapped) sequence record; each position in
    *positions_for_this_pdb* is an index into the gap-free sequence, and the
    returned list holds the corresponding column index in the alignment,
    as computed by ``gapped_column_from_ungapped_nuc_pos``.
    """
    print("GETTING ALIGNMENT COLUMNS")
    gapped_seq = seq
    # Strip gap characters to recover the raw nucleotide sequence.
    ungapped_seq = str(seq).replace("-", "")
    print(gapped_seq, ungapped_seq)
    return [
        gapped_column_from_ungapped_nuc_pos(gapped_seq, ungapped_seq, pos)
        for pos in positions_for_this_pdb
    ]
def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addresses):
newfam=True
written = []
......@@ -418,19 +444,21 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
print("sequence file is empty")
return -1
a =ClustalwCommandline("./clustalw2", infile="seq2.fasta", outfile="seq3.clustal")
print(a)
a()
#a =ClustalwCommandline("./clustalw2", infile="seq2.fasta", outfile="seq3.clustal")
#print(a)
#a()
print(module_PDB_addresses)
b =ClustalwCommandline("./clustalw2",profile1=profile,profile2="seq2.fasta",outfile="new_aln3.clustal")
if os.path.isfile("new_aln3.fasta"):
os.remove("new_aln3.fasta")
b =ClustalOmegaCommandline("../src/clustalo",profile1=profile,profile2="seq2.fasta",outfile="new_aln3.fasta")
#./clustalw2 -PROFILE1=RF02540.clustal -SEQUENCES -PROFILE2=seq.clustal -OUTPUT=new_alignment.clustal
print(b)
b()
module_columns_by_family={}
module_columns_by_family[CURRENTLY_STUDIED_FAMILY]={}
aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
appearances_of_this_module_in_this_family = []
for record in aln:
current_PDB= record.id[3:]
......@@ -444,10 +472,12 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
if current_PDB in module_PDB_addresses:
positions_for_this_pdb = module_PDB_addresses[current_PDB]
print("positions",positions_for_this_pdb)
module_columns_by_family[CURRENTLY_STUDIED_FAMILY][current_PDB]=positions_for_this_pdb
columns_for_this_pdb = alignment_column_from_seq_positions(positions_for_this_pdb, record.seq)
print("COLUMNS FOR THIS PDB",columns_for_this_pdb)
module_columns_by_family[CURRENTLY_STUDIED_FAMILY][current_PDB]=columns_for_this_pdb
#print("---------")
#print(module_columns_by_family)
print("Module columns positions by family",module_columns_by_family)
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY)
print('FOUND COLUMNS:',good_cols)
if [-1000]==good_cols[0]:
......@@ -462,10 +492,10 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
#print(columns)
#print(good_cols)
if columns[0] in good_cols[0]:
if columns in good_cols:
#print(sequence)
aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
for record in aln:
#print("PDB :", PDB)
if PDB in record.id and -1000 not in columns:
......@@ -498,9 +528,9 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
good_cols= find_good_columns(module_columns_by_family,CURRENTLY_STUDIED_FAMILY)
if good_cols==[] or [-1000] in good_cols:
return -1
aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
for PDB in module_columns_by_family[CURRENTLY_STUDIED_FAMILY]:
columns = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB][0]
columns = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB]
sequence = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB][1]
experiment_seq = []
#print(columns)
......@@ -543,10 +573,11 @@ def sanity_check_existing_seqs():
print("--------------------")
pickle.dump(fixed_seqs,open("test3_ram.cPickle","wb"))
if __name__ == "__main__":
#sanity_check_existing_seqs()
rfam = pickle.load(open("../models/RFAM_databse.cPickle","rb"))
def PDB_to_Rfam(PDB_code, PDB_positions):
rfam_by_PDB = {}
number = 0
for i in rfam:
......@@ -571,15 +602,18 @@ if __name__ == "__main__":
positions = {}
module_PDB_addresses = {}
rfam_module_seqs = []
print(matches)
#module_PDB = ['3D2V.B.2',"2GDI.Y.1"]
module_PDB = ['3D2V.B.2']
#module_positions= {'3D2V.B' : [6,7,39,40,41,42,43,44,45,70,71,72,73],"2GDI.Y": [14,15,51,52,53,54,55,56,82,83,84,85]}
module_positions= {'3D2V.B' : [6,7,39,40,41,42,43,44,45,70,71,72,73]}
print(rfam_by_PDB)
module_PDB = PDB_code
print(module_PDB,PDB_positions)
module_positions = {}
for i in range(len(module_PDB)):
module_positions[module_PDB[i]] = [x-1 for x in PDB_positions[i]]
print("Rfam by PDB",rfam_by_PDB)
matches= get_matches(rfam,rfam_by_PDB, module_PDB)
print(matches)
print("KEYS",list(matches.keys()))
print("matches",matches)
#print("KEYS",list(matches.keys()))
if os.path.isfile("seq2.fasta"):
os.remove("seq2.fasta")
with open("seq2.fasta", "w") as f:
......@@ -600,3 +634,10 @@ if __name__ == "__main__":
for seq in module_seqs:
rfam_module_seqs.append(seq)
print(rfam_module_seqs)
return(rfam_module_seqs)
if __name__ == "__main__":
    # Smoke test: map one module's PDB chain positions (chain 3D2V.B)
    # into Rfam alignment space and print the recovered module sequences.
    PDB_pos = [6, 7, 39, 40, 41, 42, 43, 44, 45, 70, 71, 72, 73]
    a = PDB_to_Rfam(["3D2V.B"], [PDB_pos])
    print(a)
\ No newline at end of file
......@@ -114,7 +114,12 @@ def create_module(desc,fasta,model_name, PDBs=[]):
graphs_name = "../models/" +model_name+"_one_of_each_graph.cPickle"
aln_name = "../models/"+model_name+"_aligned_modulegraphs.cPickle"
PDB_name = "../models/"+model_name+"_PDB_names.cPickle"
g = make_graph(desc)
if 'desc' in desc.lower():
g = make_graph(desc)
else:
g = pickle.load(open(desc,'rb'))
graph_list = make_new_graph_examples(g, fasta)
if os.path.isfile(graphs_name):
......
......@@ -41,31 +41,22 @@ def get_constraints_from_BN(graph, positions):
visited.append(n)
return constraints
def call_rnafold(seq,cons=""):
def call_rnafold(seq, cons="", just_ss=False):
E = 0
p = 0
Z = 0
"""
with open("seq.txt","w") as f:
f.write(seq)
if cons!="":
f.write("\n")
f.write(cons)
subcall = ""
"""
if cons=="":
if cons == "":
out = run(['RNAfold', '-p', '--noPS'], input=seq, stdout=PIPE, universal_newlines=True)
#subcall = "RNAfold -p < seq.txt > result.txt"
else:
#subcall = "RNAfold -p -C < seq.txt > result.txt"
out = run(['RNAfold', '-p', '--noPS', '-C'], input='{}\n{}'.format(seq, cons), stdout=PIPE, universal_newlines=True)
#subprocess.call(subcall,shell=True)
"""
with open("result.txt", "r") as f:
lines = f.readlines()
"""
out = run(['RNAfold', '-p', '--noPS', '-C'], input='{}\n{}'.format(seq, cons), stdout=PIPE,
universal_newlines=True)
lines = out.stdout.splitlines()
ss = lines[1].split(" ")[0]
if just_ss:
return ss
energy = (lines[1].split(" ")[-1]).strip()
eval = energy[1:]
final_energy = eval[:-1]
......@@ -73,16 +64,17 @@ def call_rnafold(seq,cons=""):
weight = (lines[-1].split(" ")[7]).strip()
p = float(weight[:-1])
#formula : Z = 1/p * exp(E/ RT))
# formula : Z = 1/p * exp(E/ RT))
T = 274.5
R = 0.00198717