Commit 63aa69eb authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron

BayesPairing 1.2 online, see CHANGELOG

parent 098b4048
BAYESPAIRING 1.1
BAYESPAIRING 1.2
- Fixed issue where a candidate could be double-counted in the end-of-run statistics.
- Updated Bayesian Network structure to allow inclusion of all edge types (backbone interactions, secondary structure base pairs and 3D interactions)
- Accelerated the sampling process by basing Bayesian probabilities on each node's immediate parents only, rather than on the full ensemble (see the sketch after this changelog).
- Known issue: the Bayesian sample cannot be updated if one already exists. It must be deleted manually before the command is run for -p to take effect. Will be fixed soon.
(February 27th, 2019) BAYESPAIRING 1.1
- Updated the scoring system to report structure probability out of 25000, keeping results interpretable and scores comparable across datasets with different sample sizes
- Updated Bayesian probability scoring to never score the same structure twice (keeping a smaller dictionary of already scored candidates rather than looking through the large dictionary of samples every time).
- Known issue: the Bayesian sample cannot be updated if one already exists. It must be deleted manually before the command is run for -p to take effect. Will be fixed soon.
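
To illustrate the "parents only" change in 1.2: each node's conditional probability table is conditioned on its immediate parents, so it holds 4^(k+1) entries for k parents instead of growing with the full ancestor set. A minimal sketch (not BayesPairing's own code) using the same pgmpy API the models are built on; the uniform values are placeholders for the alignment-derived frequencies:

from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling

# toy chain 0 -> 1 -> 2 over the four nucleotides A/C/G/U
model = BayesianModel([("0", "1"), ("1", "2")])
root = TabularCPD("0", 4, [[0.25], [0.25], [0.25], [0.25]])
# node "1" conditions on its single parent "0": a 4 x 4 table (16 entries)
mid = TabularCPD("1", 4, [[0.25] * 4] * 4, evidence=["0"], evidence_card=[4])
# node "2" conditions on parent "1" only, not on the ancestor set {0, 1},
# which would require a 4 x 16 table
leaf = TabularCPD("2", 4, [[0.25] * 4] * 4, evidence=["1"], evidence_card=[4])
model.add_cpds(root, mid, leaf)
assert model.check_model()
samples = BayesianModelSampling(model).forward_sample(size=10000)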
@@ -158,6 +158,7 @@ def build_pwm(positions, aln_dict):
:param aln_dict: all alignment information, needed to call get_cons_seq_from_column
:return: dictionary of the nuc statistics for each column.
"""
#print("building position weight matrix")
scores = {}
regex = r""
for i in positions:
@@ -226,7 +227,7 @@ def get_partial_prob_data(aln_dict, node, partners, nuc, more_than_one=False):
for i in aln[example_key].keys():
match = True
for ind,j in enumerate(partners):
#print("ALIGNMENT",aln.keys())
#print("ALIGNMENT",aln)
#print("nuc",nuc,ind)
if aln[j][i] != str(nuc[ind]):
match = False
@@ -253,38 +254,21 @@ def flatten_parents_and(g,node,parents):
if len(anc)==0:
#print("no anc")
clustered_parents.append([int(p)])
#print(clustered_parents)
elif len(anc)==1:
#print("one anc")
# print("one anc")
clustered_parents.append([int(anc[0]),int(p)])
# print(clustered_parents)
else:
#print("mutltiple anc")
uncles_and_aunts= []
added=True
while len(anc)>0 and added==True:
#print("new loop",anc,clustered_parents)
a=anc[0]
imm_par= get_immediate_parents(g, a)
#print("parents of",a,"are ",imm_par)
#print("current_anc", anc)
for par in imm_par:
if par not in anc:
imm_par.remove(par)
if imm_par==[]:
clustered_parents.append([int(p)])
anc.remove(a)
else:
for dad in imm_par:
#print(dad,"immediate parents:",imm_par)
uncles_and_aunts.append(int(dad))
#print("current_anc",anc,"current_unc",uncles_and_aunts)
if str(dad) in anc :
anc.remove(str(dad))
elif int(dad) in anc:
anc.remove(dad)
else:
added=False
uncles_and_aunts.append(p)
clustered_parents.append(uncles_and_aunts)
# print("ancestors",anc)
#imm_par= get_immediate_parents(g, p)
#print("parents of current parent",imm_par)
#exit(0)
anc.append(str(p))
clustered_parents.append(sorted(anc))
#print(clustered_parents)
#print("INITIAL:",parents,"CLUSTERED:",clustered_parents)
return clustered_parents
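
The rewritten body above collapses the old while-loop into one rule: every immediate parent is grouped with all of its own ancestors, sorted. A sketch of that rule using networkx, where nx.ancestors stands in for pgmpy's internal _get_ancestors_of and the node names are illustrative:

import networkx as nx

def flatten_parents_sketch(g, parents):
    # cluster each immediate parent with all of its ancestors
    clustered = []
    for p in parents:
        anc = list(nx.ancestors(g, str(p)))
        clustered.append(sorted(anc + [str(p)]))
    return clustered

g = nx.DiGraph([("1", "2"), ("1", "3"), ("2", "3")])
print(flatten_parents_sketch(g, ["1", "2"]))  # [['1'], ['1', '2']]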
@@ -307,29 +291,26 @@ def get_priors(g, node, seqfreq, priors, aln_dict):
"""
#print("CURRENTLY BUILDING BAYES NET AT NODE :", node)
if node not in priors.keys():
ancestors = g._get_ancestors_of((node))
anc = list(ancestors)
anc.remove(node)
parents = get_immediate_parents(g, node)
#print("ANCESTORS:",anc)
parents = get_immediate_parents(g,node)
#print("parents",parents, flush=True)
# if the node has no parents, the conditional probability distribution is simply a PWM
if len(anc) == 0:
if len(parents) == 0:
values = []
priors[node] = [[seqfreq[node]['A']], [seqfreq[node]['C']], [seqfreq[node]['G']], [seqfreq[node]['U']]]
else:
for j in anc:
for j in parents:
if j not in priors.keys():
priors = get_priors(g, j, seqfreq, priors, aln_dict)
cpd_list = []
# If the node is not yet in the prior dict and only has one parent, the CPD has 16 entries (4 x 4):
if node not in priors.keys() and len(parents) == 1:
for z in parents:
x = priors[z]
for y in range(len(x)):
for parent in parents:
x = priors[parent]
for y in range(4):
nuclist = ['A', 'C', 'G', 'U']
exp_score = get_partial_prob_data(aln_dict, node, [int(z)], nuclist[y % 4])
exp_score = get_partial_prob_data(aln_dict, node, [int(parent)], nuclist[y % 4])
cpd_list.append([float(exp_score['A']) * float(x[y][0])])
cpd_list.append([float(exp_score['C']) * float(x[y][0])])
cpd_list.append([float(exp_score['G']) * float(x[y][0])])
@@ -337,16 +318,16 @@ def get_priors(g, node, seqfreq, priors, aln_dict):
priors[node] = cpd_list
# If there is more than one parent, we solve recursively
elif node not in priors.keys() and len(parents) > 1:
elif node not in priors.keys() and len(parents)> 1:
cpd_list = []
cpd_list_with_node_data = []
prob_parents = []
for i in flatten_parents_and(g,node,parents):
#print("flattened parents:",i)
for j in i:
temp = [0,1,2,3]
for i in parents:
temp = [0,1,2,3]
prob_parents.append(temp)
prob_parents.append(temp)
# print("EXAMPLE ARRAY PROB PARENTS", flush=True)
#print(prob_parents)
tuple_iter = itertools.product(*prob_parents)
#print("TUPLES", prob_parents)
@@ -355,21 +336,23 @@ def get_priors(g, node, seqfreq, priors, aln_dict):
tuples = [item for item in tuple_iter]
#print(tuples)
# We generated the format of the list, now we fill it
all_parents = flatten_parents_and(g,str(node),parents)
#all_parents = flatten_parents_and(g,str(node),parents)
#print("SIZE OF TUPLE SPACE",len(tuples), flush=True)
for j in tuples:
combined_prob = 1
combined_nuc = ""
nuclist = ['A', 'C', 'G', 'U']
# print(j)
for i in range(len(j)):
#print(i,j)
parents = parents_in_single_list(all_parents)
combined_prob = combined_prob * priors[str(parents[i])][j[i]][0]
combined_nuc = combined_nuc + nuclist[j[i] % 4]
#print("NUMBER OF COMBINED NUCLEOTIDES:",len(combined_nuc))
#print(combined_nuc, combined_prob)
exp_score = get_partial_prob_data(aln_dict, node, all_parents, combined_nuc,more_than_one=True)
#if (tuples.index(j))%10000==0:
#print("step",tuples.index(j),"of",len(tuples), flush=True)
#print("NUMBER OF COMBINED NUCLEOTIDES:",len(combined_nuc))
#print(combined_nuc, combined_prob)
exp_score = get_partial_prob_data(aln_dict, node, [int(x) for x in parents], combined_nuc)
#print("current_probs:",exp_score)
cpd_list_with_node_data.append([float(exp_score['A'])])
cpd_list_with_node_data.append([float(exp_score['C'])])
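
The multi-parent branch above enumerates every joint assignment of parent nucleotides with itertools.product and queries the alignment once per assignment; it also accumulates the parents' marginals into combined_prob. A stripped-down sketch of the enumeration alone, where cond_freq is a hypothetical stand-in for get_partial_prob_data:

import itertools

NUCS = "ACGU"

def joint_parent_columns(n_parents, cond_freq):
    # one block of four child probabilities per joint parent assignment
    columns = []
    for combo in itertools.product(range(4), repeat=n_parents):
        nucs = "".join(NUCS[i] for i in combo)  # e.g. "AG" for two parents
        scores = cond_freq(nucs)                # child frequencies given the parents
        for n in NUCS:
            columns.append([scores[n]])
    return columns  # 4 ** (n_parents + 1) singleton rows

uniform = lambda nucs: {n: 0.25 for n in NUCS}
print(len(joint_parent_columns(2, uniform)))  # 64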
@@ -389,6 +372,10 @@ def add_cpd(g, i, priors, cpds):
:param cpds: list of node CPDs, updated recursively
:return: updated graph, updated list of CPD
"""
#print("ADDING CPD OF LENGTH",len(priors[i]), flush=True)
#print("parents added",cpds)
parents_card = []
pars = get_immediate_parents(g, i)
if len(pars) == 0:
@@ -400,7 +387,7 @@ def add_cpd(g, i, priors, cpds):
if j not in cpds:
g, cpds = add_cpd(g, j, priors, cpds)
for j in pars:
parents_card.append(len(priors[j]))
parents_card.append(4)
# add the CPD to the model
# print(i,len(priors[i]),pars,parents_card)
@@ -463,6 +450,7 @@ def check_sum_1(probs):
def build_BN(g, seqfreq, aln_dict):
#print("building BN")
"""
Builds the Bayes net using the functions defined above
:param g: Bayes net object
@@ -470,27 +458,50 @@ def build_BN(g, seqfreq, aln_dict):
:param aln_dict: all the information about the alignment
:return: the BayesianModel object
"""
motif = g
labels = {}
pos = nx.circular_layout(motif)
nodes = nx.draw_networkx_nodes(motif, pos, nodelist=motif.nodes(), node_color='lightgrey', node_size=500, linewidth=2,alpha=0.99)
for index, i in enumerate(list(motif.nodes(data=True))):
labels[i[0]] = i[0]
nx.draw_networkx_labels(motif, pos, labels, font_size=10)
nx.draw_networkx_edges(motif, pos, edgelist=motif.edges(), edge_color='purple', width=2, arrowsize=25)
plt.axis("off")
plt.savefig("graphBN.png",format = "png")
cpds = []
priors = {}
# first, get the prior information for each node
#print("ALL NODES:", g.nodes())
for i in g.nodes():
ancestors = g._get_ancestors_of((i))
anc = list(ancestors)
anc.remove(i)
#print("currently getting priors of node",i, flush=True)
ancestors = get_immediate_parents(g,i)
parents = list(ancestors)
#print("there are",len(parents),"parents", flush=True)
priors = get_priors(g, i, seqfreq, priors, aln_dict)
# verifies the priors are in the correct format
for i in priors.keys():
priors[i] = check_sum_1(priors[i])
#print("ADDING CPDs TO MODEL", flush=True)
# Adds the CPDs to the model
for i in g.nodes():
for i in sorted(list(g.nodes()),key=float):
#print("NODE:",i, flush=True)
if i not in cpds:
g, cpds = add_cpd(g, i, priors, cpds)
probas = (g.get_cpds())
probas = g.get_cpds()
#for i in g.get_cpds():
# print(i)
##g_copy= nx.DiGraph()
#g_copy.add_nodes_from(g.nodes())
#g_copy.add_edges_from(g.edges())
bayes_dict = {}
for ind,node in enumerate(sorted(list(g.nodes()),key=float)):
bayes_dict[node] = probas[ind].values
import pickle
pickle.dump(g,open("bayes_net_hairpin.cPickle","wb"))
return g
#
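
check_sum_1 (its body is not shown in this diff) matters because pgmpy rejects a TabularCPD whose columns do not sum to 1, and the smoothed alignment frequencies rarely do. A hedged guess at the normalization it must perform, over the singleton-row format priors uses (each consecutive A/C/G/U block is one CPD column):

def normalize_blocks(rows):
    # rescale each consecutive block of four [p] rows to sum to 1;
    # the pseudocounts guarantee every block total is positive
    out = []
    for start in range(0, len(rows), 4):
        block = rows[start:start + 4]
        total = sum(r[0] for r in block)
        out.extend([[r[0] / total] for r in block])
    return out

print(normalize_blocks([[0.2], [0.2], [0.2], [0.2]]))  # [[0.25], [0.25], [0.25], [0.25]]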
@@ -2,26 +2,29 @@ __author__ = 'Roman'
import bayes_to_seqpy as bs
import os
import pickle
import subprocess
#import subprocess
import testSS
import heapq
from operator import itemgetter
import ast
import math
import time
#import time
import make_BN_from_carnaval as makeBN
def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=25000):
def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000):
pdbs = pickle.load(open("../models/" + dataset + "_PDB_names.cPickle", "rb"))
graphs = pickle.load(open("../models/" + dataset + "_one_of_each_graph.cPickle", "rb"))
return_dict = {}
scores = {}
MODEL_PATH = "../models/"
for mod in modules:
# print("building Bayes Net for module : "+str(mod))
print("building Bayes Net for module : "+str(mod), flush=True)
this_graph = graphs[mod][0]
print("Creating Bayesian Network", flush=True)
makeBN.call_makeBN(mod, dataset, left_out)
print("Bayesian Network created", flush=True)
motif = pickle.load(open(str("../models/"+dataset + "_BN_" + str(mod)), "rb"))
pickle_name = dataset + "_samp" + str(mod) + ".cPickle"
if left_out != "NONE":
@@ -31,25 +34,27 @@ def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=25000):
pickle.dump((scores, return_struct, max), open("../models/best" + pickle_name , "wb"))
struct_list = pickle.load(open("../models/"+pickle_name, "rb"))
best_struct = pickle.load(open("../models/best" + pickle_name, "rb"))
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc, p)
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc)
scores[mod] = this_score
return_dict[mod] = [this_score, this_graph]
else:
if not (os.path.isfile("../models/"+pickle_name)) or not (os.path.isfile("../models/"+"best" + pickle_name)):
print("building and sampling Bayes Net for module : " + str(mod))
print("building and sampling Bayes Net for module : " + str(mod), flush=True)
sample_df, sample_dict, struct_list = bs.setup_sampling(motif, p)
print("obtaining best structure", flush=True)
scores, return_struct, max = bs.get_best_struct(motif, struct_list, sm)
pickle.dump(struct_list, open("../models/"+pickle_name, "wb"))
pickle.dump((scores, return_struct, max), open("../models/best" + pickle_name, "wb"))
struct_list = pickle.load(open("../models/"+pickle_name, "rb"))
best_struct = pickle.load(open("../models/best" + pickle_name, "rb"))
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc, p)
print("parsing sequence", flush=True)
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc)
scores[mod] = this_score
return_dict[mod] = [this_score, this_graph]
else:
struct_list = pickle.load(open("../models/"+pickle_name, "rb"))
best_struct = pickle.load(open("../models/"+"best" + pickle_name, "rb"))
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc, p)
this_score, time = bs.seq_to_struct(seq, motif, struct_list, best_struct, graphs[mod], ss, mc)
scores[mod] = this_score
return_dict[mod] = [this_score, this_graph]
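
The branching above is a cache-or-compute pattern: per-module sampling results are pickled and reused on later runs, which is also why the CHANGELOG's known issue exists (an existing pickle is never invalidated, so -p is ignored). A generic sketch of the pattern, with an illustrative path argument:

import os
import pickle

def cached(path, compute):
    # reuse the pickled result if present; otherwise compute and store it
    if os.path.isfile(path):
        with open(path, "rb") as fh:
            return pickle.load(fh)
    result = compute()
    with open(path, "wb") as fh:
        pickle.dump(result, fh)
    return result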
@@ -192,6 +197,7 @@ def returner(scores, seq, ss="", m=8, n=5, k=1, sscons=2):
maxs.append(returner)
return maxs
if __name__ == "__main__":
modules_to_parse = [216]
# modules_to_test =list(set(sorted([191, 194, 198, 216])))
__author__ = 'Roman'
import itertools
import matplotlib.pyplot as plt
import networkx as nx
import pgmpy
from pgmpy.models import BayesianModel
def get_cons_seq_from_column(col, aln_dict):
"""
Takes as input the full alignment information, and the column of interest
"""
aln, addresses, seqs = aln_dict
all_nucs = []
# print(aln.keys())
tot = len(aln[col].values())
for i in aln[col].values():
all_nucs.append(i)
# print(all_nucs)
scores_dict = {}
for i in "ACGU":
z = all_nucs.count(i)
but = all_nucs.count('-')  # gap count; gaps are excluded from the effective total
scores_dict[i] = float(z + 0.1) / float(tot + 0.4 - but)  # 0.1 pseudocount per nucleotide
return scores_dict
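
The 0.1 pseudocount keeps unobserved nucleotides at a small nonzero probability, and gaps are excluded from the effective total. A worked example on a made-up column of eight sequences:

nucs = list("AAAAAGG-")  # 5 A, 2 G, 1 gap
tot, gaps = len(nucs), nucs.count("-")
p_A = (nucs.count("A") + 0.1) / (tot + 0.4 - gaps)  # 5.1 / 7.4 ~ 0.689
p_U = (nucs.count("U") + 0.1) / (tot + 0.4 - gaps)  # 0.1 / 7.4 ~ 0.014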
def get_key_interaction_nodes(cols, n, type, addresses):
"""
Get the important nodes, that have a frequency over some threshold, that will be used to build the bayes net,
and outputs them in the format of a list
"""
nodes = []
if type == '3d':
threshold = 0.15
else:
threshold = 0.35
for i in range(len(cols)):
if cols[i][1] != None:
if (int(cols[i][1]) / float(n)) > threshold:
nodes.append(addresses[i])
return nodes
def build_graph(cols_ss, cols_3d, nodesSS, nodes3d, details, aln_dict, motif_zone):
"""
Builds a graph with all the nucleotides and nodes that are close to important 3D nodes
:param cols_ss: all most common secondary structure interactions
:param cols_3d: all most common 3D interactions
:param nodesSS: Important secondary structure nodes
:param nodes3d: Important 3D structure nodes
:param details: Leontis-Westhof details for the 3D interactions
:param aln_dict: all alignment information
:param motif_zone: exact alignment positions of the motifs if given (nodes)
:return: the BayesianModel for the motif
"""
motif = BayesianModel()
edge_3d = []
edge_ss = []
allnodes = []
keynodes = []
nearkeynodes = []
backbone = []
# nodes that have a significant 3D interaction inside the motif region are keynodes
for i in nodes3d:
for j in cols_3d:
if j[0] == i and int(i) in motif_zone:
if j[0] == i and str(i) not in keynodes:
keynodes.append(str(i))
if j[0] == i and i not in allnodes:
allnodes.append(str(i))
for zz in motif_zone:
if str(zz) not in keynodes:
keynodes.append(str(zz))
for j in keynodes:
if j not in nearkeynodes:
nearkeynodes.append(str(j))
# add secondary structure interactions only if they are between two nodes in the immediate vicinity of a keynode
for i in range(len(cols_ss)):
if cols_ss[i][0] != None:
#print(str(motif_zone[i]), cols_ss[i][0])
if (str(motif_zone[i]) in nearkeynodes and str(cols_ss[i][0]) in nearkeynodes) and (
str(motif_zone[i]), str(cols_ss[i][0])) not in edge_ss and (
str(cols_ss[i][0]), str(motif_zone[i])) not in edge_ss:
if i < cols_ss[i][0]:
edge_ss.append((str(motif_zone[i]), str(int(cols_ss[i][0]))))
else:
edge_ss.append((str(int(cols_ss[i][0])), str(motif_zone[i])))
for i in keynodes:
allnodes.append(i)
for i in nearkeynodes:
if i not in allnodes:
allnodes.append(i)
finalnodes = sorted(allnodes)
weightss = []
for i in finalnodes:
weightss.append(1)
motif.add_nodes_from(finalnodes, weightss)
# add secondary structure nodes
motif.add_edges_from(edge_ss)
new_edgess = []
for i in edge_ss:
new_edgess.append((str(i[0]), str(i[1])))
# add 3D interaction nodes
for i in range(len(cols_3d)):
if cols_3d[i][0] != None:
if (str(motif_zone[i]) in keynodes and str(cols_3d[i][0]) in keynodes) and motif_zone[i] != cols_3d[i][
0] and (str(motif_zone[i]), str(cols_3d[i][0])) not in edge_3d and (
str(cols_3d[i][0]), str(motif_zone[i])) not in edge_3d:
if i < cols_3d[i][0]:
edge_3d.append((str(motif_zone[i]), str(cols_3d[i][0])))
else:
edge_3d.append((str(cols_3d[i][0]), str(motif_zone[i])))
for i in edge_3d:
i = (str(i[0]), str(i[1]))
motif.add_edges_from(edge_3d)
othernodes = []
for node in motif.nodes():
if node not in keynodes:
othernodes.append(node)
# builds a secondary graph that copies the bayes net, but with added backbone.
plotter = nx.Graph()
plotter.add_nodes_from(motif.nodes())
plotter.add_edges_from(motif.edges())
sorted_nodes = (sorted(plotter.nodes()))
for i in range(1, len(sorted_nodes)):
plotter.add_edge(sorted_nodes[i - 1], sorted_nodes[i])
backbone.append((sorted_nodes[i - 1], sorted_nodes[i]))
pos = nx.spring_layout(plotter)
nx.draw_networkx_nodes(plotter, pos, node_color='g', node_size=400)
nx.draw_networkx_nodes(plotter, pos, nodelist=othernodes, node_color='r', node_size=400)
nx.draw_networkx_edges(plotter, pos, edgelist=edge_ss, width=1.6, edge_color='r')
nx.draw_networkx_edges(plotter, pos, edgelist=edge_3d, width=2.0, edge_color='g')
nx.draw_networkx_edges(plotter, pos, edgelist=backbone, width=0.2, edge_color='b')
labels = {}
for i in motif.nodes():
labels[i] = str(int(i) % (motif_zone[0]) + 1)
nx.draw_networkx_labels(motif, pos, labels, font_size=12)
plt.show()
return motif
def build_pwm(positions, aln_dict):
"""
This function gets the nucleotide statistics for each column of the alignment and returns them as a dictionary
:param positions: the positions of the alignment, input as a list
:param aln_dict: all alignment information, needed to call get_cons_seq_from_column
:return: dictionary of the nuc statistics for each column.
"""
scores = {}
regex = r""
for i in positions:
scores[i] = get_cons_seq_from_column(int(i), aln_dict)
return scores
def get_immediate_parents(g, node):
"""
helper function that returns a list of the parents of a node
:param g: graph of the motif
:param node: the node we want parents for
:return: a list of parents
"""
parents = []
for i in g.edges():
if i[1] == node:
parents.append(i[0])
if node in parents:
parents.remove(node)
return parents
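
A quick usage check on a toy model, assuming get_immediate_parents above is in scope; node names are illustrative:

from pgmpy.models import BayesianModel

toy = BayesianModel([("1", "3"), ("2", "3")])
print(get_immediate_parents(toy, "3"))  # ['1', '2']
print(get_immediate_parents(toy, "1"))  # []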
def get_partial_prob_data(aln_dict, node, partners, nuc, more_than_one=False):
#print("GETTING PARTIAL PROB OF",node)
"""
Extract nucleotide statistics conditioned on partner columns (compensatory-mutation data).
:param aln_dict: all nucleotide information
:param node: node of interest
:param partners: interacting node of interest
:param nuc: nucleotide of the partner column
:return: nucleotide scores computed only from sequences carrying the nucleotide nuc
at the partner position
"""
#print(node,partners,nuc)
if more_than_one:
oneD_partners=[]
for i in partners:
for j in i:
oneD_partners.append(int(j))
else:
oneD_partners=partners
aln, addresses, seqs = aln_dict
#print("ALL PARTNERS IN ONE LIST:",oneD_partners)
if len(oneD_partners) == 1:
partner = int(oneD_partners[0])
example_key = list(aln.keys())[0]
nucs = []
scores_dict = {}
for i in aln[example_key].keys():
if aln[partner][i] == str(nuc):
nucs.append(aln[int(node)][i])
for i in "ACGU":
z = nucs.count(i)
gaps = nucs.count('-')
scores_dict[i] = float(z + 1) / float(len(nucs) - gaps + 4)
else:
#print("DONE")
partners=oneD_partners
#print("partners",partners)
nucs = []
scores_dict = {}
example_key = list(aln.keys())[0]
for i in aln[example_key].keys():
match = True
for ind,j in enumerate(partners):
#print("ALIGNMENT",aln.keys())
#print("nuc",nuc,ind)
if aln[j][i] != str(nuc[ind]):
match = False
if match == True:
nucs.append(aln[int(node)][i])
#print("NUCLEOTIDES OBSERVED",nucs)
for i in "ACGU":
z = nucs.count(i)
gaps = nucs.count('-')
scores_dict[i] = float(z + 0.1) / float(len(nucs) - gaps + 0.4)
return scores_dict
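
For intuition, the single-partner branch on a toy two-column alignment (data made up): when the partner column shows G, the node column shows C three times out of three, so C dominates after the +1/+4 smoothing. Assumes get_partial_prob_data above is in scope:

aln = {
    3: {0: "C", 1: "C", 2: "C", 3: "A"},  # node column
    7: {0: "G", 1: "G", 2: "G", 3: "U"},  # partner column
}
aln_dict = (aln, {}, {})  # addresses and seqs are unused on this path
print(get_partial_prob_data(aln_dict, 3, [7], "G"))
# {'A': 0.14, 'C': 0.57, 'G': 0.14, 'U': 0.14}  (each value is (count + 1) / 7)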
def flatten_parents_and(g,node,parents):
if str(node) in parents:
parents.remove(node)
#print("FLATTENING PARENTS of",node)
clustered_parents= []
for p in parents:
anc = list(g._get_ancestors_of(str(p)))
anc.remove(str(p))
#print("ANCESTRY OF PARENT:",p,"IS :",anc)
#anc.remove(node)
if len(anc)==0:
#print("no anc")
clustered_parents.append([int(p)])
elif len(anc)==1:
#print("one anc")
clustered_parents.append([int(anc[0]),int(p)])
else:
#print("mutltiple anc")
uncles_and_aunts= []
added=True
while len(anc)>0 and added==True: