Commit a2345fb6 authored by Roman SARRAZIN GENDRON's avatar Roman SARRAZIN GENDRON
Browse files

Merge branch 'master' of jwgitlab.cs.mcgill.ca:sarrazin/rnabayespairing2

parents 101e49eb fd70e074
......@@ -21,7 +21,7 @@ def get_consensus(module_seqs):
#print(scores.index(max(scores)))
#print(np.argmax(scores))
tot = max(sum(scores),4)
consensus.append([x/(tot+0.25) for x in scores])
consensus.append([x/tot for x in scores])
return consensus
def rfam_to_module(positions=[], family_file="",output_name="module_seqs.fasta", as_list=False, seqs=[]):
......@@ -68,7 +68,7 @@ def rfam_to_module(positions=[], family_file="",output_name="module_seqs.fasta",
this_mod = this_mod + str(fullseq[col-1])
module_seqs.append(this_mod)
cons = get_consensus(module_seqs)
fmodule_seqs = []
for s in module_seqs:
this_mod = ""
......
......@@ -349,5 +349,28 @@ def call_makeBN(mod,dataset,left_out, leave_out_seq = False, left_out_seq = "",
BN.from_alignment_dataset(dataset, mod, [g], alignment, test_seqs, [], Lambda, Theta)
return BN
# Temporary placeholder function for CV for building a Bayesian Network
def make_BN(module, dataset, graphs, motif_sequences, Lambda=0.35, Theta=1):
    """Build a Bayesian network for one module from its motif sequences.

    :param module: key/index of the module in ``graphs``
    :param dataset: dataset identifier, forwarded to ``from_alignment_dataset``
    :param graphs: mapping from module to a list of graphs; only the first
        graph of the module is used here
    :param motif_sequences: motif sequences used to build the alignment
    :param Lambda: weight parameter forwarded to ``from_alignment_dataset``
    :param Theta: weight parameter forwarded to ``from_alignment_dataset``
    :return: the constructed Bayesian network object
    """
    # Test sequences should be tuple of seq and positions, seems like it is outdated/not used
    test_seqs = []
    # Build the BN with the motif sequences
    g = graphs[module][0]
    # identity column mapping over the graph's sorted nodes
    # NOTE(review): presumably maps graph node -> alignment column; confirm
    # against get_alignment's expectations
    aln = {}
    for n in sorted(list(g.nodes())):
        aln[n] = n
    alignment = get_alignment(g, aln, motif_sequences)
    motif = make_graph_from_carnaval(g, alignment)
    pwm = BN_builder.build_pwm(sorted(list(motif.nodes())), alignment)
    BN = BN_builder.build_BN(motif, pwm, alignment)
    # PDBs array is also empty, outdated?
    BN.from_alignment_dataset(dataset, module, [g], alignment, test_seqs, [], Lambda, Theta)
    return BN
if __name__ == "__main__":
    # This module is a library; it is driven by parse_sequences.py.
    print("Please run this from parse_sequences.py")
......@@ -23,15 +23,15 @@ def get_dependencies(module, map_index, out=None):
G = nx.Graph()
G.add_nodes_from(range(len(module.nodes)))
G.add_edges_from([(map_index[i], map_index[j]) for i,j in module.edges()])
td = treedecomp.makeTD_nx(G)
td = treedecomp.TreeDecompositionFactory().create(len(G.nodes), G.edges)
if out:
td.writeTD(out)
res = {}
for idx in td.toposorted_bag_indices():
children = td.adj[idx]
x = td.bags[idx]
x = td.get_bags()[idx]
for idy in children:
y = td.bags[idy]
y = td.get_bags()[idy]
key = td.diff_set(x,y)[0]
value = td.sep_set(x,y)
res[inv_map[key]] = [inv_map[x] for x in value]
......
#!/usr/bin/env python3
#
# InfraRed --- A generic engine for Boltzmann sampling over constraint networks
# (C) Sebastian Will, 2018
#
# This file is part of the InfraRed source code.
#
# InfraRed provides a generic framework for tree decomposition-based
# Boltzmann sampling over constraint networks
#
## @file
##
## Storing and manipulating tree decompositions
##
## Supports computation by delegate
##
## * generate tree decompositions using libhtd (via python wrapper) or calling TDlib
##
## * plot dependency graphs and tree decompositions using dot
##
## NOTE: TDlib writes files to disk, currently it is *not* safe to use
## in distributed computations. In contrast, libhtd (default) does not
## write to disk at all
##
## For using TDlib, it has to be in the Java class path (environment
## variable CLASSPATH). The default tree decomposition by libhtd needs
## to find the shared library libhtd.so; this could require to set
## LD_LIBRARY_PATH manually.
import os
import subprocess
import re
from math import sqrt,ceil
from networkx.algorithms.approximation.treewidth import treewidth_min_degree
## @brief Class to hold a tree decomposition
##
## contains
##
## * bags a list of 0-based indexed lists of vertices of the decomposed graph (or "variables")
##
## * edges a list of edges; an edge is a pair of bag indices (bag1, bag2)
##
## * adj adjacency lists (as defined by edges)
##
## edges are directed, an edge from bag x to y is represented by (x,y)
class TreeDecomp:
    ## @brief construct from bags and edges
    ##
    ## @param bags lists of indices of the variables (vertices) in the
    ## bag
    ##
    ## @param edges list of edges between the bags; edges are
    ## specified as pairs (x,y), where x is the index of the parent
    ## bag and y, of the child bag (0-based indices according to
    ## bags).
    ##
    ## Edges must be directed from root(s) to leaves. Typically the
    ## bags and edges result from calling a tree decomposition
    ## algorithm. The supported algorithms return correctly oriented
    ## edges.
    def __init__(self, bags, edges):
        # copy bags
        self.bags = list(bags)
        # ensure that all edges are pairs
        assert all(len(x) == 2 for x in edges)
        # copy edges; normalize every edge to a tuple so that
        # membership tests and removals behave consistently
        self.edges = [(x[0], x[1]) for x in edges]
        self.update_adjacency_lists()

    ## @brief Compute adjacency list representation
    ##
    ## @param n number of nodes
    ## @param edges list of edges
    ## @returns dict mapping each bag index to the list of its children
    @staticmethod
    def adjacency_lists(n, edges):
        adj = {i: [] for i in range(n)}
        for (i, j) in edges:
            adj[i].append(j)
        return adj

    ## @brief Update the adjacency representation
    ##
    ## Updates the adjacency representation in adj according to the
    ## number of bags and edges
    def update_adjacency_lists(self):
        self.adj = self.adjacency_lists(len(self.bags), self.edges)

    ## @brief Topological sort of bags
    ##
    ## @returns sorted list of bag indices (parents before children)
    def toposorted_bag_indices(self):
        n = len(self.bags)
        visited = set()
        # 'order' collects bags in DFS post-order; reversing it yields a
        # topological order (renamed from 'sorted', which shadowed the builtin)
        order = []

        def visit(i):
            visited.add(i)
            for j in self.adj[i]:
                if j not in visited:
                    visit(j)
            order.append(i)

        for i in range(n):
            if i not in visited:
                visit(i)
        return order[::-1]

    ## @brief Difference set
    ##
    ## @param xs first list
    ## @param ys second list
    ##
    ## @return ys setminus xs
    ##
    ## For bags xs and ys, this computes the introduced variable
    ## indices when going from parent xs to child ys.
    @staticmethod
    def diff_set(xs, ys):
        return [y for y in ys if y not in xs]

    ## @brief Separator set
    ##
    ## @param xs first list
    ## @param ys second list
    ##
    ## @return overlap of xs and ys
    ##
    ## For bags xs and ys, this computes the 'separator' of xs and ys,
    ## i.e. the common variables.
    @staticmethod
    def sep_set(xs, ys):
        return [y for y in ys if y in xs]

    ## @brief Get tree width (largest bag size minus one)
    def treewidth(self):
        return max([len(bag) for bag in self.bags]) - 1

    ## @brief Write tree decomposition in dot format
    ## @param out output file handle
    def writeTD(self, out):
        def baglabel(bag):
            # lay the bag's variables out in a near-square grid,
            # joined by dot's "\n" line separator
            if len(bag) == 0:
                return ""
            lwidth = ceil(sqrt(len(bag)))
            lnum = ceil(len(bag) / lwidth)
            xs = [str(i) for i in sorted(bag)]
            lines = list()
            for i in range(0, lnum):
                lines.append(" ".join(xs[i * lwidth:(i + 1) * lwidth]))
            return "\\n".join(lines)

        out.write("digraph G {\n\n")
        for bagid, bag in enumerate(self.bags):
            label = baglabel(bag)
            out.write("\tbag{} [label=\"{}\"]\n".format(bagid + 1, label))
        out.write("\n\n")
        for (x, y) in self.edges:
            # label each edge with the variables introduced along it
            edgelabel = " ".join([str(x) for x in self.diff_set(self.bags[x], self.bags[y])])
            out.write("\tbag{} -> bag{} [label=\"{}\"]\n".format(x + 1, y + 1, edgelabel))
        out.write("\n}\n")

    ## @brief Guarantee a certain maximal diff set size
    ##
    ## @param maxdiffsize maximum size of diff sets after transformation
    ##
    ## @see diff_set()
    ##
    ## @pre the tree is connected
    ##
    ## Expands the tree by inserting a minimum number of in-between
    ## bags whereever the diff set size exceeds maxdiffsize. This can
    ## limit the complexity of certain algorithm based on the tree (in
    ## particular sampling in Infrared).
    ##
    def expand_treedecomposition(self, maxdiffsize=1):
        def chunks(l, n):
            """Yield successive n-sized chunks from l."""
            for i in range(0, len(l), n):
                yield l[i:i + n]

        n = len(self.bags)
        root = self.toposorted_bag_indices()[0]
        next_bag_idx = n

        ## perform depth first traversal
        visited = set()
        stack = [(None, root)]  # (parent bag index, bag index)
        while stack:
            (u, v) = stack.pop()
            if v in visited:
                continue
            visited.add(v)
            # push children on stack
            for w in self.adj[v]:
                stack.append((v, w))
            if u is not None:
                # determine diff set
                diff = self.diff_set(self.bags[u], self.bags[v])
                if len(diff) > maxdiffsize:
                    sep = self.sep_set(self.bags[u], self.bags[v])
                    if (u, v) in self.edges:
                        self.edges.remove((u, v))
                    if (v, u) in self.edges:
                        self.edges.remove((v, u))
                    last_bag_idx = u
                    # grow intermediate bags, each introducing at most
                    # maxdiffsize variables; the final diff variable is
                    # introduced by the closing edge to v itself
                    newbag = sep
                    for ext in chunks(diff[:-1], maxdiffsize):
                        newbag.extend(ext)
                        self.bags.append(newbag[:])
                        # append tuples (not lists) to keep the edge
                        # representation consistent with __init__
                        self.edges.append((last_bag_idx, next_bag_idx))
                        last_bag_idx = next_bag_idx
                        next_bag_idx += 1
                    self.edges.append((last_bag_idx, v))
        self.update_adjacency_lists()
# ##########################################################
# writing graphs to files in different formats
#
## @brief Plot graph
##
## @param graphfile file of graph in dot format
##
## The graph is plotted and written to a pdf file by calling
## graphviz's dot tool on the dot file.
def dotfile_to_pdf(graphfile):
    # escape the dot in the regex: only a literal ".dot" suffix is
    # replaced (the old pattern ".dot$" matched any char before "dot")
    outfile = re.sub(r"\.dot$", ".pdf", graphfile)
    subprocess.check_output(["dot", "-Tpdf", "-o", outfile, graphfile])
## @brief Write graph in dgf format
##
## @param out output file handle
## @param num_nodes number of nodes
## @param edges the edges of the graph
##
## Emits the "p tw <nodes> <edges>" header followed by one 1-based
## "e u v" line per edge, in sorted edge order.
def write_dgf(out, num_nodes, edges):
    out.write("p tw {} {}\n".format(num_nodes, len(edges)))
    out.writelines("e {} {}\n".format(u + 1, v + 1) for (u, v) in sorted(edges))
## @brief Write graph in dot format
##
## @param out output file handle
## @param num_nodes number of nodes
## @param edges the edges of the graph
##
## Writes an undirected graphviz graph with one labeled node per
## vertex and one "--" line per edge.
def write_dot(out, num_nodes, edges):
    lines = ["graph G{\n\n"]
    lines.extend("\tnode{idx} [label=\"{idx}\"]\n".format(idx=v) for v in range(num_nodes))
    lines.extend("\tnode{} -- node{}\n".format(u, v) for (u, v) in edges)
    lines.append("\n}\n")
    out.writelines(lines)
# ##########################################################
# Interface TDlib
#
# specific functions to call TDlib's tree decomposition tool
# and parse the result
#
## @brief Parse tree decomposition as written by TDlib
##
## @param tdfh file handle to tree decomposition in dot format
## @param num_nodes number of nodes of the decomposed graph
## @returns tree decomposition (TreeDecomp)
##
## Assume file is in td format as written by the tool TDlib.
def parseTD_TDlib(tdfh, num_nodes):
    bags = list()
    edges = list()
    for line in tdfh:
        # catch empty bags written as "New_Vertex" by TDlib
        if re.search("New_Vertex", line):
            bags.append([])
            continue
        m = re.search(r"bag(\d+).+label=\"(.+)\"", line)
        if m:
            bagid = int(m.group(1))
            label = m.group(2)
            # raw string pattern (was '\d+', an invalid-escape warning)
            labels = re.findall(r'\d+', label)
            labels = [int(label) for label in labels]
            if bagid != len(bags) + 1:
                raise IOError("Bag indices in td file must be consecutive (at bag {})!".format(bagid))
            bags.append(labels)
        else:
            m = re.search(r"bag(\d+) -- bag(\d+)", line)
            if m:
                edges.append((int(m.group(1)),
                              int(m.group(2))))

    # shift TDlib's 1-based labels to the 0-based indices used here;
    # applies to both bag contents and edge endpoints
    def dec(xs):
        return [[x - 1 for x in ys] for ys in xs]

    bags = dec(bags)
    edges = dec(edges)
    # add singleton bags for any node that appears in no bag, so the
    # decomposition covers all num_nodes variables
    present = set()
    for b in bags:
        for x in b:
            present.add(x)
    for i in range(num_nodes):
        if i not in present:
            bags.append([i])
    return TreeDecomp(bags, edges)
## @brief Obtain tree decomposition using networkx
##
## @param G networkx graph to decompose
## @param maxdiffsize maximal diff set size after expansion
## @returns expanded TreeDecomp
def makeTD_nx(G, maxdiffsize=1):
    # treewidth_min_degree returns (width, tree of frozenset bags);
    # the width is implied by the bags, so it is discarded here
    _, tree = treewidth_min_degree(G)
    bags = list(map(list, tree.nodes))
    # map each bag back to its index once, instead of an O(n) list.index
    # lookup per edge endpoint
    index_of = {frozenset(bag): idx for idx, bag in enumerate(bags)}
    edges = [(index_of[frozenset(i)], index_of[frozenset(j)]) for i, j in tree.edges]
    td = TreeDecomp(bags, edges)
    td.expand_treedecomposition(maxdiffsize)
    return td
# End of Interface tree decomposition libraries
# ----------------------------------------------------------
if __name__ == "__main__":
    # library module: nothing to do when executed directly
    pass
import pickle
import random
import BayesPairing
import make_BN_from_carnaval
import parse_sequences
import collections
import argparse
import testSS
def computeCountAndLists(s):
    """
    Constructs nucleotide dictionary, dinucleotide dictionary and list of dinucleotides
    :param s: Nucleotide sequence String consisting of A,C,G,U or .
    :return: Tuple of nucleotide counts, dinucleotide counts and dict of dinucleotides in List form
    """
    # WARNING: Use of function count(s,'UU') returns 1 on word UUU
    # since it apparently counts only nonoverlapping words UU
    # For this reason, we work with the indices.
    alphabet = ["A", "C", "G", "U", "."]
    s = s.upper()
    # List maps each nucleotide to the ordered successors observed after it
    List = {a: [] for a in alphabet}
    nuclCnt = {a: 0 for a in alphabet}
    dinuclCnt = {a: {b: 0 for b in alphabet} for a in alphabet}
    # count the first character, then walk every adjacent pair
    nuclCnt[s[0]] = 1
    nuclTotal = 1
    dinuclTotal = 0
    for x, y in zip(s, s[1:]):
        List[x].append(y)
        nuclCnt[y] += 1
        nuclTotal += 1
        dinuclCnt[x][y] += 1
        dinuclTotal += 1
    # sanity: every character and every adjacent pair was counted once
    assert nuclTotal == len(s)
    assert dinuclTotal == len(s) - 1
    return nuclCnt, dinuclCnt, List
def chooseEdge(x, dinuclCnt):
    """
    Chooses an adjacent nucleotide from the given dinucleotide count Dictionary and subtracts 1 from the count
    :param x: Nucleotide A,C,G,U
    :param dinuclCnt: Dict of dinucleotide counts
    :return: the chosen successor nucleotide ('A', 'C', 'G' or 'U')
    """
    row = dinuclCnt[x]
    # sample a successor proportionally to the remaining dinucleotide counts
    z = random.random()
    denom = row['A'] + row['C'] + row['G'] + row['U']
    cumulative = 0
    for y in ('A', 'C', 'G'):
        cumulative += row[y]
        if z < float(cumulative) / float(denom):
            row[y] -= 1
            return y
    # remaining probability mass belongs to 'U'
    row['U'] -= 1
    return 'U'
def connectedToLast(edgeList, nuclList, lastCh):
    """Return 1 if every nucleotide in nuclList (other than lastCh) can
    reach lastCh within two propagation rounds over edgeList, else 0."""
    reach = {x: 0 for x in nuclList}
    # mark direct predecessors of lastCh
    for a, b in edgeList:
        if b == lastCh:
            reach[a] = 1
    # propagate reachability backwards along edges (two rounds suffice
    # for the at-most-4-node nucleotide graph)
    for _ in range(2):
        for a, b in edgeList:
            if reach[b] == 1:
                reach[a] = 1
    for x in nuclList:
        if x != lastCh and reach[x] == 0:
            return 0
    return 1
def eulerian(s):
    """
    Draw one random "last edge" per nucleotide and test whether the
    resulting assignment keeps the dinucleotide graph connected to the
    final character of s.
    :param s: nucleotide sequence string
    :return: (ok flag, chosen edge list, nucleotides present, last char)
    """
    nuclCnt, dinuclCnt, List = computeCountAndLists(s)
    # nucleotides actually appearing in s
    nuclList = [x for x in ["A", "C", "G", "U"] if x in s]
    # numInList[x] = number of dinucleotides beginning with x
    numInList = {x: sum(dinuclCnt[x][y] for y in nuclList) for x in nuclList}
    firstCh = s[0]  # start with first letter of s
    lastCh = s[-1]
    # pick a random terminal edge for every nucleotide except the last one
    edgeList = [[x, chooseEdge(x, dinuclCnt)] for x in nuclList if x != lastCh]
    ok = connectedToLast(edgeList, nuclList, lastCh)
    return ok, edgeList, nuclList, lastCh
def shuffleEdgeList(L):
    """Shuffle L in place (Fisher-Yates driven by random.random) and return it."""
    barrier = len(L)
    for _ in range(len(L) - 1):
        # pick a random slot below the barrier and swap it to the back
        z = int(random.random() * barrier)
        L[z], L[barrier - 1] = L[barrier - 1], L[z]
        barrier -= 1
    return L
def dinuclShuffle(s):
    """Return a shuffle of s that preserves its dinucleotide counts
    (Altschul-Erikson style Eulerian-path construction)."""
    if len(s) == 0:
        return ""
    # retry until a connected last-edge assignment is drawn
    ok = 0
    while not ok:
        ok, edgeList, nuclList, lastCh = eulerian(s)
    nuclCnt, dinuclCnt, List = computeCountAndLists(s)
    # remove the chosen terminal edges, shuffle each successor list,
    # then re-append the terminal edges at the end
    for x, y in edgeList:
        List[x].remove(y)
    for x in nuclList:
        shuffleEdgeList(List[x])
    for x, y in edgeList:
        List[x].append(y)
    # walk the Eulerian path, consuming successors front-to-back
    path = [s[0]]
    prevCh = s[0]
    for _ in range(len(s) - 2):
        ch = List[prevCh].pop(0)
        path.append(ch)
        prevCh = ch
    path.append(s[-1])
    return ''.join(path)
def shuffle_seq(seq):
    """
    Return a shuffled version of seq.

    Prefers the dinucleotide-preserving shuffle; if that raises for any
    reason, falls back to a plain uniform shuffle of the characters.

    :param seq: nucleotide sequence string
    :return: shuffled sequence string
    """
    try:
        return dinuclShuffle(seq)
    except Exception:
        # narrowed from a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit; best-effort uniform fallback
        chars = list(seq)
        random.shuffle(chars)
        return "".join(chars)
# Functions above to shuffle - TODO: Look over and understand code
# Helper functions
def convert_seq(seq, aiming_for):
"""
Function to convert sequence to ungapped sequence and corresponding positions for module