############################################################################### #Copyright (c) 2015, Olivier Tremblay Savard, Vladimir Reinharz # # & Jerome Waldispuhl # #All rights reserved. # # # #Redistribution and use in source and binary forms, with or without # #modification, are permitted provided that the following conditions are met: # #* Redistributions of source code must retain the above copyright # #notice, this list of conditions and the following disclaimer. # #* Redistributions in binary form must reproduce the above copyright # #notice, this list of conditions and the following disclaimer in the # #documentation and/or other materials provided with the distribution. # #* Neither the name of the nor the # #names of its contributors may be used to endorse or promote products # #derived from this software without specific prior written permission. # # # #THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # #AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # #IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # #ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY # #DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # #(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # #LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # #ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # #(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS# #SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE # ############################################################################### ##################################################################################### # Ancestral RNA sequence reconstruction using sequence and 2D structure information # # Contains also the regular Fitch and Sankoff algorithms. ##################################################################################### from enum import Enum from collections import defaultdict from collections import OrderedDict from collections import deque from pprint import pprint import sys from Bio import Phylo from io import StringIO import time #For RNAfold stuff import os import re from math import exp from subprocess import check_output from tempfile import NamedTemporaryFile as NTF #------------------------------------------------------------------------------------------------------------------------------------------------------ ### ### Class MutatedSequence: contains a sequence and has methods to find all mutated sequences that can be reached from this sequence ### class MutatedSequence: #Constructor def __init__(self, sequence = None, leftChild = None, rightChild = None, cost = None): self.sequence = sequence #the sequence representing this object (string) self.leftChild = leftChild #the left child from which this sequence was obtained by a mutation process (bottom-up) self.rightChild = rightChild #the right child from which this sequence was obtained by a mutation process (bottom-up) self.cost = cost #the cost of childSequence + the cost of going from childSequence to this sequence ### ### Accessors ### #Returns a set of sequences (strings) that can be reached by doing a maximum of nbMuts mutations from this MutatedSequence. #isLeftChild is true if we're looking for sequences on the left branch of the SpeciesTree, false if it's for the right branch def getSetReachableMutSeqsStrings(self, nbMuts): #print(self.sequence) setMutSeqsStrings = set([self.sequence]) #a set of strings, self.sequence represents the sequence obtained after 0 mutations for x in range(nbMuts): #repeating this nbMuts times newListMut = [] #temporary list for mutSeqString in setMutSeqsStrings: #print(mutSeqString) for pos in range(len(mutSeqString)): #going through all the positions nucAtPos = SpeciesNode.nucToNucDict[mutSeqString[pos]] #translating just in case for nuc in SpeciesNode.NUCLEOS: if nucAtPos != nuc: #not changing a nucleotide for the same nucleotide newSequence = mutSeqString[:pos] + nuc + mutSeqString[pos+1:] #print("Before mutation, sequence was = " + mutSeqString + " and new sequence = " + newSequence) newListMut.append(newSequence) setMutSeqsStrings.update(newListMut) #adding the whole list to the set return setMutSeqsStrings #the set of sequences (strings) that can be reached by a max of nbMut mutations from this MutatedSequence Object #Returns the cost of going from the sequence represented by this object, to the mutSeq parameter (string) #We don't count the cost of the whole sequence, just the effect of the modified nucleotides, and we add that to the cost of this object #currentStruct and otherStruct are lists of paired positions, T is the T parameter coming from the SpeciesNode def getCostOfMutSeq(self, mutSeq, currentStruct, otherStruct, T): subMatrix = SpeciesNode.SANKOFF_MATRIX bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE if len(mutSeq) != len(self.sequence): print("Big problem, sequences are not the same length.") print("mutSeq: " + mutSeq + " and self.sequence: " + self.sequence) listMutatedPos = [] cost = self.cost #initializing with the cost of this MutatedSequence for pos in range(len(mutSeq)): if mutSeq[pos] != self.sequence[pos]: listMutatedPos.append(pos) #keeping track of the mutated positions #print("TEST positional argument: " + self.sequence[pos] + " " + mutSeq[pos]) cost += subMatrix[SpeciesNode.nucleoToIndexDict[self.sequence[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[pos]]] #substitutions are not multiplied by T #basepair cost, currentStruct posPair = currentStruct[pos] if posPair != -1 and posPair not in listMutatedPos: #only if it is paired and if we didn't already consider this position in the cost because it was mutated too bpCostAfter = bpMatrix[SpeciesNode.nucleoToIndexDict[mutSeq[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[posPair]]] #we just want the bpCost of the positions in mutSeq cost += T * bpCostAfter #basepair cost, otherStruct posPair = otherStruct[pos] if posPair != -1 and posPair not in listMutatedPos: #only if it is paired in otherStruct and the position wasn't already considered in the cost bpCostAfter = bpMatrix[SpeciesNode.nucleoToIndexDict[mutSeq[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[posPair]]] #we just want the bpCost of the positions in mutSeq cost += (1-T) * bpCostAfter return cost #we return cost, which corresponds to the transition cost #Clone def clone(): return MutatedSequence(self.sequence, self.leftChild, self.rightChild, self.cost) ### ### Mutators ### #------------------------------------------------------------------------------------------------------------------------------------------------------ ### ### Class SpeciesTree: information on the whole species tree, methods to reconstruct ancestral sequences. ### class SpeciesTree: #Class variables ALGO = Enum('ALGO', 'Fitch Sankoff SeqAndStruct TwoStructsApprox Enumerate') #an Enum representing the different scoring functions for the ancestral sequence reconstruction NB_MUTATIONS = 1 #the maximum number of mutations to consider for the enumerate algorithm ### ### Static methods ### #To compare 2 trees: "simulTree" is the simulated tree and "inferredTree" is the tree containing the inferred ancestral sequences. #Obviously, the 2 trees are required to have the exact same topology and node ids #This returns the total number of errors followed by the total number of nucleotides in the simulated tree (for all sequences in the tree) in a list. def compareTrees(simulTree, inferredTree, verbose=False): if verbose: print("\nCOMPARING TREES") queue = deque() queue.appendleft(simulTree.root) queue.appendleft(inferredTree.root) #putting the nodes from each tree one after the other in the same queue, yolo ### THE STATS ### #All variables are lists of ints, representing, in this order: [family1, family2, all] #For the complete tree: totalNumberOfNucleos = [0, 0, 0] #Will count the total number of nucleotides in the simulated tree (for all sequences of all families) totalNumberOfErrors = [0, 0, 0] #Will count the total number of errors relative to the simulated sequences in all the tree totalNumberOfOptimalSeqs = [0, 0, 0] #Will count the total number of optimal sequences for all the nodes in the species tree; this will allow us to compute the average nb of errors averageNumberOfErrors = [0, 0, 0] #Will count the average number of errors over all the optimal solutions; will be divided by totalNumberOfOptimalSeqs #For the positions with/without structure totalNumberOfNucleos_withStruct = [0, 0, 0] #Will count the total number of nucleotides that are part of a structure totalNumberOfErrors_withStruct = [0, 0, 0] #Will count the total number of errors relative to the simulated sequences in all the tree at structured positions totalNumberOfNucleos_withoutStruct = [0, 0, 0] #Will count the total number of nucleotides that are not part of a structure totalNumberOfErrors_withoutStruct = [0, 0, 0] #Will count the total number of errors relative to the simulated sequences in all the tree at unstructured positions averageNumberOfErrors_withStruct = [0, 0, 0] #Will count the average number of errors over all the optimal solutions in all structured positions; divided by totalNumberOfOptimalSeqs averageNumberOfErrors_withoutStruct = [0, 0, 0] #Will count the average number of errors over all the optimal solutions in all unstructured positions; divided by totalNumberOfOptimalSeqs numberOfNodesInTheTree = [0, 0, 0] #Will count the number of nodes for family 1, family 2, and the 2 together (result should be [x, x, 2x]) #For the root only totalNumberOfErrors_root = [0, 0, 0] totalNumberOfOptimalSeqs_root = [0, 0, 0] #With the new findSuperAncestorAndUpdateMatrix method, we should have the same number for fam 0 and fam 1 averageNumberOfErrors_root = [0, 0, 0] averageNumberOfErrors_root_withoutStruct = [0, 0, 0] averageNumberOfErrors_root_withStruct = [0, 0, 0] ################# while queue: #while the queue is not empty theNodeSimul = queue.pop() theNodeInfer = queue.pop() #Making sure the ids are identical if theNodeSimul.id != theNodeInfer.id: print("WARNING: Node ids are different. I did not expect that...") if verbose: print("- Node: " + str(theNodeSimul.id)) #Comparing the sequences for familyIndex in range(simulTree.getNbStructs()): #simulSeq = theNodeSimul.getSequence(familyIndex, 0) #there is only one sequence for each family in the simulated tree simulSeq = theNodeSimul.getOnlySequence(familyIndex) #NEW: updated to deal with sets minNbErrors = float("inf") minNbErrorsWithStruct = float("inf") minNbErrorsWithoutStruct = float("inf") totalNumberOfOptimalSeqs[familyIndex] += theNodeInfer.getNbSeqs(familyIndex) #adding the number of seqs for the current familyIndex totalNumberOfOptimalSeqs[2] += theNodeInfer.getNbSeqs(familyIndex) #adding the number of seqs for all (family1 and family2) numberOfNodesInTheTree[familyIndex] += 1 numberOfNodesInTheTree[2] += 1 #Counting the number of positions with structure and without structure (only once for each family for each node) for pos in range(len(simulSeq)): pairedPos = theNodeInfer.getPairedPos(familyIndex, pos) #theNodeInfer and theNodeSimul should have the same structs and same seq length if pairedPos == -1: #unpaired totalNumberOfNucleos_withoutStruct[familyIndex] += 1 #Those numbers will have to be divided by the totalNumberOfOptimalSeqs totalNumberOfNucleos_withoutStruct[2] += 1 else: #paired totalNumberOfNucleos_withStruct[familyIndex] += 1 totalNumberOfNucleos_withStruct[2] += 1 for seq in theNodeInfer.getSequencesListsSet(familyIndex): #seq = theNodeInfer.getSequence(familyIndex, seqIndex) #NEW: now seq is in the loop if len(simulSeq) != len(seq): print("Big problem: sequences are not the same length...") else: nbErrors = 0 nbErrorsWithStruct = 0 nbErrorsWithoutStruct = 0 for pos in range(len(seq)): pairedPos = theNodeInfer.getPairedPos(familyIndex, pos) if SpeciesNode.nucToNucDict[simulSeq[pos]] != SpeciesNode.nucToNucDict[seq[pos]]: #using the translation, just in case nbErrors += 1 averageNumberOfErrors[familyIndex] += 1 #adding all the errors, we'll divide it by totalNumberOfOptimalSeqs at the end averageNumberOfErrors[2] += 1 #for all if pairedPos == -1: #unpaired nbErrorsWithoutStruct += 1 averageNumberOfErrors_withoutStruct[familyIndex] += 1 averageNumberOfErrors_withoutStruct[2] += 1 else: #paired nbErrorsWithStruct += 1 averageNumberOfErrors_withStruct[familyIndex] += 1 averageNumberOfErrors_withStruct[2] += 1 if nbErrors < minNbErrors: minNbErrors = nbErrors minNbErrorsWithStruct = nbErrorsWithStruct minNbErrorsWithoutStruct = nbErrorsWithoutStruct if verbose: #Printing a report print("Family : " + str(familyIndex) + " --> minNbErrors = " + str(minNbErrors) + " (" + str(minNbErrors/len(simulSeq) * 100) + " %)") #updating stats for the whole tree totalNumberOfErrors[familyIndex] += minNbErrors totalNumberOfErrors[2] += minNbErrors #updating all in the list totalNumberOfNucleos[familyIndex] += len(simulSeq) totalNumberOfNucleos[2] += len(simulSeq) totalNumberOfErrors_withoutStruct[familyIndex] += minNbErrorsWithoutStruct totalNumberOfErrors_withoutStruct[2] += minNbErrorsWithoutStruct totalNumberOfErrors_withStruct[familyIndex] += minNbErrorsWithStruct totalNumberOfErrors_withStruct[2] += minNbErrorsWithStruct if not theNodeSimul.isLeaf(): #only checking one, both of them should be leaves at the same time anyway queue.appendleft(theNodeSimul.leftChild) queue.appendleft(theNodeInfer.leftChild) queue.appendleft(theNodeSimul.rightChild) queue.appendleft(theNodeInfer.rightChild) if theNodeInfer.isRoot(): #We save the stats for the root only for index in range(len(totalNumberOfErrors)): totalNumberOfErrors_root[index] = totalNumberOfErrors[index] totalNumberOfOptimalSeqs_root[index] = totalNumberOfOptimalSeqs[index] averageNumberOfErrors_root[index] = averageNumberOfErrors[index] / totalNumberOfOptimalSeqs_root[index] #We calculate the averages immediately averageNumberOfErrors_root_withoutStruct[index] = averageNumberOfErrors_withoutStruct[index] / totalNumberOfOptimalSeqs_root[index] averageNumberOfErrors_root_withStruct[index] = averageNumberOfErrors_withStruct[index] / totalNumberOfOptimalSeqs_root[index] for x in range(3): #Calculating the averages -> numbers will be for one sequence only (average nb errors for 1 optimal sequence in the tree) averageNumberOfErrors[x] /= totalNumberOfOptimalSeqs[x] averageNumberOfErrors_withStruct[x] /= totalNumberOfOptimalSeqs[x] averageNumberOfErrors_withoutStruct[x] /= totalNumberOfOptimalSeqs[x] return totalNumberOfErrors, totalNumberOfNucleos, totalNumberOfOptimalSeqs, averageNumberOfErrors, totalNumberOfNucleos_withStruct, totalNumberOfErrors_withStruct, totalNumberOfNucleos_withoutStruct, totalNumberOfErrors_withoutStruct, averageNumberOfErrors_withStruct, averageNumberOfErrors_withoutStruct, numberOfNodesInTheTree, totalNumberOfErrors_root, totalNumberOfOptimalSeqs_root, averageNumberOfErrors_root, averageNumberOfErrors_root_withoutStruct, averageNumberOfErrors_root_withStruct #Constructor def __init__(self, structuresList, root = None): ### Data attributes self.nbStructs = len(structuresList) #the number of different RNA families (with different structures) self.structuresList = structuresList #a list a all the structures (one per family). A structure is represented by a list(int) of size sequenceLength which #gives for each position the position of the nucleo it is paired with (or -1 if unpaired) self.sequenceLength = len(structuresList[0]) #the length of every sequence (right now we have the same length for family 1 and family 2) self.root = root #The root of the tree (SpeciesNode) ##useless##self.leavesList = leavesList #a list of all the leaves of the tree (list of SpeciesNode) self.structuresList = structuresList #a list of structures -> one structure is an array of size seqLength giving the position of the other base or -1 self.maxDepth = -1 #the maximum depth of the tree, will be updated after the call to setDepths ### ### Accessors ### def getNbStructs(self): return self.nbStructs def getSeqLength(self): return self.sequenceLength def getRoot(self): return self.root def isRoot(self, node): if node == self.root: return True return False def isLeaf(self, node): if node.leftChild == None and node.rightChild == None: return True return False #Returns the list of paired positions for familyIndex def getStruct(self, familyIndex): return self.structuresList[familyIndex] #To print the whole tree in level order. #printCosts is a boolean indicating if we want to print the cost of every nucleotide for every position. def printTree(self, printCosts = False): queue = deque() queue.append(self.root) while queue: #while the queue is not empty theNode = queue.pop() theNode.printNode(printCosts) if not theNode.isLeaf(): queue.appendleft(theNode.leftChild) queue.appendleft(theNode.rightChild) #To print everything below the given node (including the node) def printSubtree(self, node): if node == None: return node.printNode() self.printSubtree(node.leftChild) self.printSubtree(node.rightChild) #To create a clone of the tree for which we remove the sequences at the ancestral nodes (to create a copy of the simulated tree, without the answers (ie anc. sequences)) def cloneWithoutAncSeqs(self): treeClone = SpeciesTree(self.structuresList) #creating the new tree with the same structures and root -> not using a copy of structuresList, but shouldn't be a problem treeClone.root = self.root.clone(treeClone) #setting the root self.cloneWithoutAncSeqsRec(self.root, treeClone.root, treeClone) #calling the recursive method return treeClone #Recursive method to clone the tree #internalNode and internalNodeClone correspond to the same node def cloneWithoutAncSeqsRec(self, internalNode, internalNodeClone, treeClone): if internalNode.isLeaf(): return leftChildClone = internalNode.leftChild.clone(treeClone) rightChildClone = internalNode.rightChild.clone(treeClone) internalNodeClone.leftChild = leftChildClone internalNodeClone.rightChild = rightChildClone leftChildClone.parent = internalNodeClone rightChildClone.parent = internalNodeClone #Recursive calls self.cloneWithoutAncSeqsRec(internalNode.leftChild, leftChildClone, treeClone) self.cloneWithoutAncSeqsRec(internalNode.rightChild, rightChildClone, treeClone) ### ### Mutators ### #Method to set the depth of every node in the tree (root has depth 0) def setDepths(self): depth = 0 self.setDepthsRec(self.root, depth) #call to the recursive method #Recursive method to set the depths of the nodes. #Parameter 'node' is the node to update and 'depth' is the depth it has. def setDepthsRec(self, node, depth): node.depth = depth if depth > self.maxDepth: self.maxDepth = depth if not node.isLeaf(): self.setDepthsRec(node.leftChild, depth+1) self.setDepthsRec(node.rightChild, depth+1) #General method to do the reconstruction; calls the bottom-up step and then the top-down step def reconstructAncestralSeqs(self, algo): if algo == SpeciesTree.ALGO.Enumerate: for familyIndex in range(self.getNbStructs()): self.enumerateMutatedSequences(self.root, SpeciesTree.NB_MUTATIONS, familyIndex) for familyIndex in range(self.getNbStructs()): #test #for x in range(len(self.root.listMutatedSequences[familyIndex])): # print(str(x) + ": " + self.root.listMutatedSequences[familyIndex][x].sequence + " cost = " + str(self.root.listMutatedSequences[familyIndex][x].cost)) minCost = self.root.listMutatedSequences[familyIndex][0].cost #Since listMutatedSequences should be sorted, the best cost is at [0] index = 0 while self.root.listMutatedSequences[familyIndex][index].cost == minCost: #this will consider all the optimal sequences self.selectMutatedSequences(familyIndex, self.root, self.root.listMutatedSequences[familyIndex][index]) index += 1 else: self.calculateCosts(algo) self.selectNucleos(algo) #The bottom-up step for the Enumerate algorithm, uses a post-order traversal def enumerateMutatedSequences(self, node, nbMuts, familyIndex): if node.isLeaf(): node.addToListMutatedSequences(familyIndex, MutatedSequence(node.getOnlySequence(familyIndex), None, None, 0)) #Creating the only MutatedSequence object for the leaf else: #internal node self.enumerateMutatedSequences(node.leftChild, nbMuts, familyIndex) self.enumerateMutatedSequences(node.rightChild, nbMuts, familyIndex) node.enumerateMutSeqs(familyIndex, nbMuts) #The top-down step for the Enumerate algorithm. optimalMutSeq is the MutatedSequence object that is on the optimal path from the root to the 'node' parameter. def selectMutatedSequences(self, familyIndex, node, optimalMutSeq): if not node.isLeaf(): #otherwise, there is nothing to do node.addSequence(familyIndex, optimalMutSeq.sequence) #adding the sequence as an optimal ancestral sequence self.selectMutatedSequences(familyIndex, node.leftChild, optimalMutSeq.leftChild) self.selectMutatedSequences(familyIndex, node.rightChild, optimalMutSeq.rightChild) #recursive calls #The bottom-up step, calculating the costs using the algo specified by the parameter 'algo' def calculateCosts(self, algo): self.root.calculateCosts(algo) #The top-down step, called after calculateCosts; this method selects the nucleotide of minimum cost for each position and builds the ancestral sequences def selectNucleos(self, algo): self.findSuperAncestorAndUpdateMatrix(algo) #We start by choosing the nucleotides of minimum cost in the root. WARNING: For now, we only build 1 of all the possible optimal sequences for familyIndex in range(self.getNbStructs()): #One optimal sequence sequenceList = [[" " for x in range(self.getSeqLength())]] #A list of optimal sequences (each sequence is a list of chars here) for pos in range(self.getSeqLength()): #print(pos) #print(len(sequenceList)) minCost = float("inf") nucleoOfMinCost = [] #a list of all optimal nucleos for the position for key in self.root.posToNucleoCosts[familyIndex][pos]: #Finding all nucleotides of minimum cost cost = self.root.getCostOf(familyIndex, pos, key) if cost == minCost: #print("appending " + key) nucleoOfMinCost.append(key) #appending to the list if cost < minCost: #print("Updating minCost: minCost = " + str(cost) + " and nucleo = " + nucleo) minCost = cost nucleoOfMinCost = [key] #starting a new list if len(nucleoOfMinCost[0]) == 1: SpeciesNode.addNucleoToSequencesList(sequenceList, pos, nucleoOfMinCost) elif len(nucleoOfMinCost[0]) == 2: #base pair position posPair = self.root.getPairedPos(familyIndex, pos) if pos < posPair: #otherwise, it has been updated already #print("pos = " + str(pos) + " and posPair = " + str(posPair) + " and seqLength = " + str(self.getSeqLength()) + " and len(sequence) = " + str(len(sequence))) SpeciesNode.addNucleoToSequencesList(sequenceList, pos, [nuc[0] for nuc in nucleoOfMinCost]) SpeciesNode.addNucleoToSequencesList(sequenceList, posPair, [nuc[1] for nuc in nucleoOfMinCost]) for seq in sequenceList: #adding all the sequences #print(seq) self.root.addSequence(familyIndex, "".join(seq)) #NEW: Keeping only the intersection of family 0 and family 1 at the root. We want only optimal sequences that were found for both (or more) structures. #intersection = self.root.getSequencesListsSet(0) & self.root.getSequencesListsSet(1) #initializing with the first 2 structures (there are at least 2) #for familyIndex in range(2, self.getNbStructs()): #starting at 2 # intersection &= self.root.getSequencesListsSet(familyIndex) #if len(intersection) == 0: # print("HUGE PROBLEM: intersection of optimal sequences at the root is empty...") # sys.exit() #for familyIndex in range(self.getNbStructs()): # self.root.sequencesLists[familyIndex] = intersection #putting only the intersection set for all structures at the root #Now that the root sequences have been found, we call the SpeciesNode selectNucleos method on the root to select the nucleos for its two children nodes self.root.selectNucleos(algo) #To calculate the matrix costs, using Fitch, for the super ancestor (ancestor of family 1 and family 2 of the root). #Then, this method will update the costMatrix of the root with this super ancestor matrix for both families (same matrix copied for both) def findSuperAncestorAndUpdateMatrix(self, algo): #First, we need to create a fake tree of 3 nodes, so that we can call computeFitch normally fakeSpeciesTree = SpeciesTree([[-1 for i in range(self.getSeqLength())]]) #fake tree with no structure, we don't need it. We need only one list, so that it will be like there is only one family. The length of the structureList is important, as it defines the length of the sequences in the tree. superAncNode = SpeciesNode(None, None, None, fakeSpeciesTree) #corresponds to the super ancestor fakeSpeciesTree.root = superAncNode leftChild = SpeciesNode(superAncNode, None, None, fakeSpeciesTree) #corresponds to Family 0 of the root rightChild = SpeciesNode(superAncNode, None, None, fakeSpeciesTree) #corresponds to Family 1 of the root superAncNode.leftChild = leftChild superAncNode.rightChild = rightChild #We need to check if we are dealing with di-nucleotides (which is the case for algos 3 and 4) if (algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox): for familyIndex in range(self.root.getNbStructs()): for pos in range(self.getSeqLength()): posPair = self.root.getPairedPos(familyIndex, pos) if posPair != -1: #we are dealing with di-nucleotides in the costMatrix dictBestScoreForFirstNucleo = {} #Will store the lowest cost for A,C,G and U over all the di-nucleotide score (looking only at first position) for dinucleo in SpeciesNode.DINUCLEOS: dinucleoCost = self.root.getCostOf(familyIndex, pos, dinucleo) correspondingNucleo = dinucleo[0] #first nucleo of the dinucleo is the one corresponding to the current position if correspondingNucleo not in dictBestScoreForFirstNucleo or dinucleoCost < dictBestScoreForFirstNucleo[correspondingNucleo]: dictBestScoreForFirstNucleo[correspondingNucleo] = dinucleoCost for nucleo, minCost in dictBestScoreForFirstNucleo.items(): #We update the costMatrix, adding new key-values for single nucleotides self.root.updatePosToNucleoCosts(familyIndex, pos, nucleo, minCost) leftChild.posToNucleoCosts[0] = self.root.posToNucleoCosts[0] #Here, we map the costMatrix of family 0 to the leftChild of the fakeTree rightChild.posToNucleoCosts[0] = self.root.posToNucleoCosts[1] #and the costMatrix of family 1 to the rightChild of the fakeTree for pos in range(fakeSpeciesTree.getSeqLength()): #for each position superAncNode.computeFitch(0, pos) #calling computeFitch for every position, in family 0 only (there is only one family in the fakeTree) ###TEST #for pos in range(self.getSeqLength()): # costString = "Pos " + str(pos) + ": " # for key in sorted(superAncNode.posToNucleoCosts[0][pos]): # costString += key + ": " + str(superAncNode.posToNucleoCosts[0][pos][key]) + " " # print(costString) #sys.exit() #Now the tricky part: we replace the cost matrices of the root (for fam 0 and fam 1) with the matrix of the superAncestor... and we hope for the best. self.root.posToNucleoCosts[0] = superAncNode.posToNucleoCosts[0] self.root.posToNucleoCosts[1] = superAncNode.posToNucleoCosts[0] #This is just a copy of the reference, should not be a problem though #------------------------------------------------------------------------------------------------------------------------------------------- ### ### Class SpeciesNode: information on a node of the SpeciesTree and methods to calculate scores ### class SpeciesNode: BP_WEIGHT = 0.001 #The weight that will be used to multiply the values in the BASEPAIR_MATRIX_SIMPLE #BP_WEIGHT = 0.05 #That was the best so far (61 and 59 total error for 3 and 4 resp.) NUCLEOS = ['A', 'C', 'G', 'U'] #Just a list of the nucleotides, will help with for loops DINUCLEOS = ['AA', 'AC', 'AG', 'AU', 'CA', 'CC', 'CG', 'CU', 'GA', 'GC', 'GG', 'GU', 'UA', 'UC', 'UG', 'UU'] FITCH_MATRIX = [[0, 1, 1, 1], [1, 0, 1, 1], [1, 1, 0, 1], [1, 1, 1, 0]] SANKOFF_MATRIX = [[0, 2, 1, 2], [2, 0, 2, 1], [1, 2, 0, 2], [2, 1, 2, 0]] #BASEPAIR_MATRIX_SIMPLE = [[5, 5, 5, 1], [5, 5, 0, 5], [5, 0, 4, 3], [1, 5, 4, 5]] #not bad, but 3 does better than 4 BASEPAIR_MATRIX_SIMPLE = [[3, 3, 3, 1], [3, 3, 0, 3], [3, 0, 3, 2], [1, 3, 2, 3]] #this one is not bad at all #BASEPAIR_MATRIX_SIMPLE = [[2.5, 2.5, 2.5, 0], [2.5, 2.5, 0, 2.5], [2.5, 0, 2.5, 2.5], [0, 2.5, 2.5, 2.5]] #first matrix for i in range(len(BASEPAIR_MATRIX_SIMPLE)): for j in range(len(BASEPAIR_MATRIX_SIMPLE[i])): BASEPAIR_MATRIX_SIMPLE[i][j] *= BP_WEIGHT #print(BASEPAIR_MATRIX_SIMPLE) #sys.exit() LIST_MUT_SEQS_MAX_SIZE = 1000 #maximum size of listMutatedSequences -> we want to order the list and keep the 1000 with lowest cost (most promising) lastId = 1 #Class variable; incrementing every time we call the constructor nucleoToIndexDict = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'T': 3, 't': 3, 'U': 3, 'u': 3} #mapping a nucleotide to an index, starting from 0 nucToNucDict = {'A': 'A', 'a': 'A', 'C': 'C', 'c': 'C', 'G': 'G', 'g': 'G', 'T': 'U', 't': 'U', 'U': 'U', 'u': 'U'} #translating nucleotides so that we deal only with A,C,G,U ### ### static methods ### #To return the index of the nucleo, using nucleoToIndexDict (shorter to write) def nucToInd(nucleo): return SpeciesNode.nucleoToIndexDict[nucleo] def addNucleoToSequencesList(sequencesList, position, listOfNucsToAdd): #print("Call to addNucleoToSequencesList") #print("sequencesList = " + str(sequencesList) + " position = " + str(position) + " and listOfNucsToAdd = " + str(listOfNucsToAdd)) #New: to protect from memory explosion if len(sequencesList) > 2500000: print("Too many optimal sequences.") sys.exit() setOfNucsToAdd = set() for nuc in listOfNucsToAdd: #getting rid of duplicates (possibly coming from di-nucleotides) with a set setOfNucsToAdd.add(nuc) listOfNucsToAdd = list(setOfNucsToAdd) #rewriting the list #print("--- after elimination of duplicates, listOfNucsToAdd = " + str(listOfNucsToAdd)) initialNbSeqs = len(sequencesList) for x in range(len(setOfNucsToAdd)-1): #no need to duplicate sequences if there is only one nucleo to add, otherwise we need one additional copy for every additional nucleo to add for y in range(initialNbSeqs): copy = sequencesList[y][:] sequencesList.append(copy) #print("Appending copy!!!!!!!!!!!!!!!") for x in range(len(sequencesList)): nucIndex = x // initialNbSeqs #integer division to find the index of the nucleo to add in the sequences sequencesList[x][position] = listOfNucsToAdd[nucIndex] #print("At the end, sequencesList = " + str(sequencesList)) #Constructor def __init__(self, parent, leftChild, rightChild, speciesTree, id = None): ### Data attributes self.parent = parent #the parent node in the tree (root has parent = None) self.leftChild = leftChild #the leftChild in the tree (leaves have leftChild = None) self.rightChild = rightChild #the rightChild in the tree (leaves have rightChild = None) self.speciesTree = speciesTree #a reference to the SpeciesTree that contains this node self.posToNucleoCosts = [[defaultdict(lambda: float("inf")) for i in range(speciesTree.getSeqLength())] for j in range(self.getNbStructs())] #For each family (nbStructs), we have one list of all positions, and at each position a dict mapping a nucleotide (or a dinucleotide) to its cost self.sequencesLists = [set() for i in range(speciesTree.getNbStructs())] #One *set* (*new, before it was a list) of optimal sequences (strings) for every family if id is None: self.id = SpeciesNode.lastId SpeciesNode.lastId += 1 else: self.id = id self.depth = -1 #intializing to -1, don't forget to call setDepths() on the tree after it is contructed self.listMutatedSequences = [[] for i in range(speciesTree.getNbStructs())] #list of most promising MutatedSequence objects (only one sequence for leaves) for each family ### ### Accessors ### def isRoot(self): return self.speciesTree.isRoot(self) def isLeaf(self): return self.speciesTree.isLeaf(self) def getSeqLength(self): return self.speciesTree.getSeqLength() #Warning: right now, we assume that all sequences have the same length def getNbStructs(self): return self.speciesTree.getNbStructs() def getPairedPos(self, familyIndex, position): return self.speciesTree.structuresList[familyIndex][position] def getNbSeqs(self, familyIndex): return len(self.sequencesLists[familyIndex]) #Returns the list of paired positions for familyIndex def getStruct(self, familyIndex): return self.speciesTree.getStruct(familyIndex) #Returns the sequence at seqIndex for the family at familyIndex ####DEPRECATED; see getOnlySequence(self, familyIndex) #def getSequence(self, familyIndex, seqIndex): #return self.sequencesLists[familyIndex][seqIndex] ##We changed it to a set now, so we can't access with an index #Returns the only sequence in the set for the specified familyIndex. There should be only one. def getOnlySequence(self, familyIndex): desiredList = list(self.sequencesLists[familyIndex]) return desiredList[0] #Returns the set for the specified familyIndex def getSequencesListsSet(self, familyIndex): return self.sequencesLists[familyIndex] #For a specified familyIndex, returns the number of optimal sequences (for a leaf, there should be only the input sequence, so just 1) def getNbOfSeqs(self, familyIndex): return len(self.sequencesLists[familyIndex]) #To get a cost inside posToNucleoCosts for a specified familyIndex, position and nucleotide (char) def getCostOf(self, familyIndex, position, nucleotide): return self.posToNucleoCosts[familyIndex][position][nucleotide] #To return the index of the nucleo, using nucleoToIndexDict (shorter to write) def nucToInd(self, nucleo): return SpeciesNode.nucleoToIndexDict[nucleo] #To return a translation of the nucleotide, using nucToNucDict def translateNuc(self, nucleo): return SpeciesNode.nucToNucDict[nucleo] #To return the value of the parameter T depending on depth of the node def getT(self): #using a simple linear function now; depth 0 returns 50% (0.5) slope = (1 - 0.5) / self.speciesTree.maxDepth #range is from 50% (for the root) to 100% (for the leaves) #--> 100% will never be returned, because this method will be called on the parent node return 0.5 + self.depth * slope #To get the listMutatedSequences for a familyIndex def getListMutSeqs(self, familyIndex): return self.listMutatedSequences[familyIndex] #Prints a description of the node def printNode(self, printCosts): parentIdString = "None" if not self.isRoot(): parentIdString = str(self.parent.id) lcIdString = "None" rcIdString = "None" if not self.isLeaf(): lcIdString = str(self.leftChild.id) rcIdString = str(self.rightChild.id) print("\n--- Id = " + str(self.id) + " Parent = " + parentIdString + " LeftChild = " + lcIdString + " RightChild = " + rcIdString + " Depth = " + str(self.depth) + " T = " + str(self.getT())) for familyIndex in range(self.getNbStructs()): print(". Family " + str(familyIndex) + ":") for seq in self.sequencesLists[familyIndex]: print(seq) #pprint(self.posToNucleoCosts[familyIndex], width = 500) #The costs might not be printed in order, we might have to sort the keys if printCosts: for pos in range(self.getSeqLength()): costString = "Pos " + str(pos) + ": " for key in sorted(self.posToNucleoCosts[familyIndex][pos]): costString += key + ": " + str(self.posToNucleoCosts[familyIndex][pos][key]) + " " print(costString) #To return a clone of this node, without the sequences unless it's a leaf node. Parent and child nodes are not set here. def clone(self, clonedSpeciesTree): cloneNode = SpeciesNode(None, None, None, clonedSpeciesTree, self.id) #keeping the same ids if self.isLeaf(): for familyIndex in range(self.getNbStructs()): for seq in self.getSequencesListsSet(familyIndex): cloneNode.addSequence(familyIndex, seq) #copying the sequences, only for the leaves return cloneNode ### ### Mutators ### #To add a MutatedSequence object to listMutatedSequences def addToListMutatedSequences(self, familyIndex, mutSeq): self.listMutatedSequences[familyIndex].append(mutSeq) #To sort listMutatedSequences and keep only the most promising ones (we keep only LIST_MUT_SEQS_MAX_SIZE objects in the list) def shortenListMutatedSequences(self, familyIndex): self.getListMutSeqs(familyIndex).sort(key = lambda x: x.cost) #Sorting in place using the costs self.listMutatedSequences[familyIndex] = self.getListMutSeqs(familyIndex)[:SpeciesNode.LIST_MUT_SEQS_MAX_SIZE] #test #print("- Node: " + str(self.id) + " Length of shortened list : " + str(len(self.getListMutSeqs(familyIndex))) + " and first element cost = " + str(self.listMutatedSequences[familyIndex][0].cost)) #To add a sequence to sequencesList at a certain familyIndex #Now checks if the sequence is already present; the method adds the sequence only if it is different def addSequence(self, familyIndex, sequence): if familyIndex >= self.speciesTree.getNbStructs(): print("familyIndex : " + str(familyIndex) + " doesn't exist! Sequence cannot be added.") return if sequence not in self.sequencesLists[familyIndex]: #self.sequencesLists[familyIndex].append(sequence) self.sequencesLists[familyIndex].add(sequence) #NEW: this is a set now, not a list #print("******************* inside addSequence, appending : " + sequence + " for node " + str(self.id) + " and fam " + str(familyIndex)) #To update posToNucleoCosts for a specified familyIndex, a specified position and a specified nucleotide (char) #If addToValue is True, then we add theCost to the one that is there, otherwise we set the value def updatePosToNucleoCosts(self, familyIndex, position, nucleotide, theCost, addToValue = False): if addToValue: self.posToNucleoCosts[familyIndex][position][nucleotide] += theCost else: self.posToNucleoCosts[familyIndex][position][nucleotide] = theCost #Recursive method to calculate the costs using the algo defined by the parameter 'algo' (Bottom-up step) def calculateCosts(self, algo): if self.isLeaf(): #for the leaves, the costs are either 0 or infinity (infinity is the default cost) for familyIndex in range(self.getNbStructs()): #for each family if self.isLeaf() and self.getNbOfSeqs(familyIndex) != 1: print("---> Huge problem, leaf node doesn't have exactly 1 sequence for family " + str(familyIndex)) #sequence = self.getSequence(familyIndex, 0) #There should be only one sequence, at index 0 sequence = self.getOnlySequence(familyIndex) #NEW: updated to deal with sets for pos in range(self.getSeqLength()): #for each position nucleo = sequence[pos] posPair = self.getPairedPos(familyIndex, pos) if (algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox) and posPair != -1: #If we have to consider the secondary structure pairedNuc = sequence[posPair] self.updatePosToNucleoCosts(familyIndex, pos, nucleo+pairedNuc, 0) #We put the pair of nucleos as the key else: #if the algo doesn't consider the 2D struct or if this position is not paired self.updatePosToNucleoCosts(familyIndex, pos, nucleo, 0) #updating only the cost of the nucleotide that is in the sequence at position pos else: #if the current node is an internal node #postorder self.leftChild.calculateCosts(algo) #Calculate for left child first self.rightChild.calculateCosts(algo) #then for the right child #When the costs have been calculated for both children nodes, calculate for the current node for familyIndex in range(self.getNbStructs()): #for each family for pos in range(self.getSeqLength()): #for each position if algo == SpeciesTree.ALGO.Fitch: self.computeFitch(familyIndex, pos) elif algo == SpeciesTree.ALGO.Sankoff: self.computeSankoff(familyIndex, pos) elif algo == SpeciesTree.ALGO.SeqAndStruct: self.computeSeqAndStruct(familyIndex, pos) elif algo == SpeciesTree.ALGO.TwoStructsApprox: self.computeSeqAndStruct_2structsApprox(familyIndex, pos) #The top-down step, called after calculateCosts; this method selects the nucleotide of minimum cost for each position and builds the ancestral sequences #This method is called on a node for which we already have selected the optimal sequences, and we find the optimal sequences for its children depending #on that sequence def selectNucleos(self, algo): if self.leftChild.isLeaf() and self.rightChild.isLeaf(): #we go through the code if only one of the children is a leaf return for familyIndex in range(self.getNbStructs()): #print("selectNucleos: working on familyIndex # " + str(familyIndex)) #print("There are " + str(self.getNbSeqs(familyIndex)) + " optimal sequences at the parent node") for parentSequence in self.getSequencesListsSet(familyIndex): #for every optimal sequence #if seqIndex % 100 == 0: #print("### selectNucleos: working on sequence # " + str(seqIndex)) sequenceLeftList = [[" " for x in range(self.getSeqLength())]] #using list representation of strings here, so we can modify easily characters sequenceRightList = [[" " for x in range(self.getSeqLength())]] #parentSequence = self.getSequence(familyIndex, seqIndex) #now going through all the optimal sequences --> now parentSequence is in the loop for pos in range(self.getSeqLength()): #print("selectNucleos: working on position # " + str(pos)) nucleo = parentSequence[pos] if algo == SpeciesTree.ALGO.Fitch or algo == SpeciesTree.ALGO.Sankoff: if algo == SpeciesTree.ALGO.Fitch: leftAndRightChildNucleos = self.computeFitch(familyIndex, pos, nucleo) elif algo == SpeciesTree.ALGO.Sankoff: leftAndRightChildNucleos = self.computeSankoff(familyIndex, pos, nucleo) #print(leftAndRightChildNucleos) SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, leftAndRightChildNucleos[0]) SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, leftAndRightChildNucleos[1]) elif algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox: posPair = self.getPairedPos(familyIndex, pos) if algo == SpeciesTree.ALGO.SeqAndStruct: method = self.computeSeqAndStruct else: method = self.computeSeqAndStruct_2structsApprox if posPair == -1: #unpaired leftAndRightChildNucleos = method(familyIndex, pos, nucleo) SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, leftAndRightChildNucleos[0]) SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, leftAndRightChildNucleos[1]) elif pos < posPair: #paired in the 2D structure, and we did not consider the pos already leftAndRightChildNucleos = method(familyIndex, pos, nucleo + parentSequence[posPair]) #print(nucleo + parentSequence[posPair]) #print("leftAndRightChildNucleos = " + str(leftAndRightChildNucleos)) SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, [nuc[0] for nuc in leftAndRightChildNucleos[0]]) #A list of di-nucleotides is returned: SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, [nuc[0] for nuc in leftAndRightChildNucleos[1]]) # first position in the list corresponds to the SpeciesNode.addNucleoToSequencesList(sequenceLeftList, posPair, [nuc[1] for nuc in leftAndRightChildNucleos[0]]) # left child and second position SpeciesNode.addNucleoToSequencesList(sequenceRightList, posPair, [nuc[1] for nuc in leftAndRightChildNucleos[1]]) # corresponds to right child else: continue #If one of the children nodes is a leaf, we don't add the sequence for this node if not self.leftChild.isLeaf(): for seq in sequenceLeftList: self.leftChild.addSequence(familyIndex, "".join(seq)) #using join to transform the sequence back into a string if not self.rightChild.isLeaf(): for seq in sequenceRightList: self.rightChild.addSequence(familyIndex, "".join(seq)) #using join to transform the sequence back into a string #Calling on the children nodes self.leftChild.selectNucleos(algo) self.rightChild.selectNucleos(algo) ### The different algorithms #The Fitch algorithm to get the costs for one position #If chosenNucleo is specified, then the method will return a list of all left and right children nucleotides that gave rise to this nucleotide in the current node (for top-down step) def computeFitch(self, familyIndex, position, chosenNucleo = None, costMatrix = None): if costMatrix == None: #we use Fitch costMatrix = SpeciesNode.FITCH_MATRIX for nucleo in SpeciesNode.NUCLEOS: #for every nucleotide, we want to find the minimum cost #print("chosenNucleo = " + str(chosenNucleo) + " and nucleo = " + nucleo) if chosenNucleo is not None and nucleo != chosenNucleo: #only for top-down step; we want to do calculation only for specified nucleo continue minLeft = float("inf") #Initializing minRight = float("inf") minNucLeft = [] #for top-down step only -> now a list of all optimal choices minNucRight = [] #for top-down step only -> now a list of all optimal choices for nucleoChild in SpeciesNode.NUCLEOS: #for every possible nucleotide in the left and right children costLeft = self.leftChild.getCostOf(familyIndex, position, nucleoChild) + costMatrix[self.nucToInd(nucleoChild)][self.nucToInd(nucleo)] if costLeft == minLeft: minNucLeft.append(nucleoChild) #appending elif costLeft < minLeft: minLeft = costLeft minNucLeft = [nucleoChild] #starting a new list costRight = self.rightChild.getCostOf(familyIndex, position, nucleoChild) + costMatrix[self.nucToInd(nucleoChild)][self.nucToInd(nucleo)] if costRight == minRight: minNucRight.append(nucleoChild) #appending elif costRight < minRight: minRight = costRight minNucRight = [nucleoChild] #starting a new list if chosenNucleo is not None: #print("HOLA!!!") #print("\n##IN FITCH## node id = " + str(self.id) + " position = " + str(position) + " chosenNucleo = " + chosenNucleo) #print(minNucLeft) #print(minNucRight) return [minNucLeft, minNucRight] #returning a list containing the list of optimal nucleos for the left child followed by the list of nucleos for the right child self.updatePosToNucleoCosts(familyIndex, position, nucleo, (minLeft + minRight)) #updating the minimum cost for nucleotide nucleo #The Sankoff algorithm, similar to Fitch, but with a different substitution matrix #If chosenNucleo is specified, then the method will return the left and right children nucleotides that gave rise to this nucleotide in the current node (for top-down step) def computeSankoff(self, familyIndex, position, chosenNucleo = None): #The Sankoff cost matrix: costMatrix = SpeciesNode.SANKOFF_MATRIX #Just calling the Fitch method with the Sankoff cost matrix return self.computeFitch(familyIndex, position, chosenNucleo, costMatrix) #The Sankoff algorithm for nucleotide substitutions + the 2D structure change cost (considering only 1 structure for each sequence here) #chosenNucleo here can be a list of 2 nucleos (1st corresponds to position, 2nd corresponds to posPair) def computeSeqAndStruct(self, familyIndex, position, chosenNucleo = None): costMatrix = SpeciesNode.SANKOFF_MATRIX #Using the Sankoff matrix for nucleotide substitution costs bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE minNucsList = [] #A list that will contain in order: left and right minimum di-nucleotides posPair = self.getPairedPos(familyIndex, position) #Checking if position is paired with another position posPair in the 2D structure if posPair == -1: #unpaired in the 2D structure return self.computeSankoff(familyIndex, position, chosenNucleo) #in this case chosenNucleo should be only one nucleo (not a list) elif chosenNucleo is None and self.getCostOf(familyIndex, position, 'AA') < float("inf"): #if the cost has been calculated already because of the base pair #print("Position already considered because of the base pair and chosenNucleo = " + str(chosenNucleo)) return else: #position not considered yet for nucleosBP in SpeciesNode.DINUCLEOS: #for every possible di-nucleotide if chosenNucleo is not None and nucleosBP != chosenNucleo: continue ### Calculating the minimum total cost to get nucleosBP ### minLeft = float("inf") #Initializing minRight = float("inf") minNucleosLeft = [] #for top-down step only -> now a list of all optimal choices minNucleosRight = [] #for top-down step only -> now a list of all optimal choices for nucleosChild in SpeciesNode.DINUCLEOS: #for every possible nucleotide base pair in the left and right children costLeft = self.leftChild.getCostOf(familyIndex, position, nucleosChild) + costMatrix[self.nucToInd(nucleosChild[0])][self.nucToInd(nucleosBP[0])] + costMatrix[self.nucToInd(nucleosChild[1])][self.nucToInd(nucleosBP[1])] + bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])] #adding up the cost of both substitutions and bp cost if costLeft == minLeft: minNucleosLeft.append(nucleosChild) #appending elif costLeft < minLeft: minLeft = costLeft minNucleosLeft = [nucleosChild] #starting a new list costRight = self.rightChild.getCostOf(familyIndex, position, nucleosChild) + costMatrix[self.nucToInd(nucleosChild[0])][self.nucToInd(nucleosBP[0])] + costMatrix[self.nucToInd(nucleosChild[1])][self.nucToInd(nucleosBP[1])] + bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])] #adding up the cost of both substitutions and bp cost if costRight == minRight: minNucleosRight.append(nucleosChild) #appending if costRight < minRight: minRight = costRight minNucleosRight = [nucleosChild] #starting a new list if chosenNucleo is not None: return [minNucleosLeft, minNucleosRight] seqAndStructTotalCost = minLeft + minRight #+ bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])] --> before I added the bp cost at the end only (no bueno) #Adding the cost in posToNucleoCosts for the two positions, with the di-nucleotide as the key in the dict (first nucleo is the nucleo of the position # and second nucleo is always the nucleo at the base pair position) self.updatePosToNucleoCosts(familyIndex, position, nucleosBP, seqAndStructTotalCost) self.updatePosToNucleoCosts(familyIndex, posPair, nucleosBP[1] + nucleosBP[0], seqAndStructTotalCost) #reverse order of the di-nucleotides #The Sankoff algorithm for nucleotide substitutions + the 2D structure change cost (considering the 2 structures, but not the full cycles) #chosenNucleo here can be a list of 2 nucleos (1st corresponds to position, 2nd corresponds to posPair) def computeSeqAndStruct_2structsApprox(self, familyIndex, position, chosenNucleo = None): costMatrix = SpeciesNode.SANKOFF_MATRIX #Using the Sankoff matrix for nucleotide substitution costs bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE minNucsList = [] #A list that will contain in order: left and right minimum di-nucleotides posPair = self.getPairedPos(familyIndex, position) #Checking if position is paired with another position posPair in the 2D structure if posPair == -1: #unpaired in the 2D structure nucleosOrDinucleos = SpeciesNode.NUCLEOS #will be used in the for loop else: nucleosOrDinucleos = SpeciesNode.DINUCLEOS if chosenNucleo is None and self.getCostOf(familyIndex, position, 'AA') < float("inf"): #if the cost has been calculated already because of the base pair #print("Position already considered because of the base pair and chosenNucleo = " + str(chosenNucleo)) return for nucleo_s in nucleosOrDinucleos: #for every possible nucleo or dinucleo (dependending of the paired state) (called nucleo_s because it's either 1 or 2 nucleo(s)) if chosenNucleo is not None and nucleo_s != chosenNucleo: continue ### Calculating the minimum total cost to get nucleo_s ### minLeft = float("inf") #Initializing minRight = float("inf") minNucleosLeft = [] #for top-down step only -> now a list of all optimal choices minNucleosRight = [] #for top-down step only -> now a list of all optimal choices for nucleo_sChild in nucleosOrDinucleos: #for every possible nucleotide or nucleotide base pair in the left and right children costLeft = self.leftChild.find2StructsApproxCost(familyIndex, position, posPair, nucleo_sChild, nucleo_s) if costLeft == minLeft: minNucleosLeft.append(nucleo_sChild) #appending elif costLeft < minLeft: minLeft = costLeft minNucleosLeft = [nucleo_sChild] #starting a new list costRight = self.rightChild.find2StructsApproxCost(familyIndex, position, posPair, nucleo_sChild, nucleo_s) if costRight == minRight: minNucleosRight.append(nucleo_sChild) #appending if costRight < minRight: minRight = costRight minNucleosRight = [nucleo_sChild] #starting a new list if chosenNucleo is not None: return [minNucleosLeft, minNucleosRight] seqAndStructTotalCost = minLeft + minRight #Adding the cost in posToNucleoCosts for the two positions, with the di-nucleotide as the key in the dict (first nucleo is the nucleo of the position # and second nucleo is always the nucleo at the base pair position) # If this position is not paired, we do only the first update self.updatePosToNucleoCosts(familyIndex, position, nucleo_s, seqAndStructTotalCost) if posPair != -1: self.updatePosToNucleoCosts(familyIndex, posPair, nucleo_s[1] + nucleo_s[0], seqAndStructTotalCost) #reverse order of the di-nucleotides #Calculating the cost of going from nucleo_sChild to nucleo_s; **this method is called on the child node**. #Here we consider the 2 structures in the cost, but not cycles. def find2StructsApproxCost(self, familyIndex, position, posPair, nucleo_sChild, nucleo_s): subMatrix = SpeciesNode.SANKOFF_MATRIX bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE #for now, using a very simple matrix T = self.parent.getT() #getting the time parameter ###T = 0.5 ###THIS IS JUST A TEST!! costCurrentStruct = self.getCostOf(familyIndex, position, nucleo_sChild) #intializing with the cost of nucleo_sChild of this node (which is a child node) costOtherStruct = 0 #First dealing with the current structure if posPair == -1: #unpaired in current struct costCurrentStruct += subMatrix[self.nucToInd(nucleo_sChild)][self.nucToInd(nucleo_s)] else: #paired costCurrentStruct += subMatrix[self.nucToInd(nucleo_sChild[0])][self.nucToInd(nucleo_s[0])] + subMatrix[self.nucToInd(nucleo_sChild[1])][self.nucToInd(nucleo_s[1])] + T * bpMatrix[self.nucToInd(nucleo_s[0])][self.nucToInd(nucleo_s[1])] #T multiplies the bpCost ONLY #Second, we deal with the base pairs of the second structure directly connected to position and posPair if familyIndex == 0: otherFamily = 1 else: otherFamily = 0 #Whether position is paired or unpaired, we always have to check for a paired position for 'position' in the other structure otherFamilyPosPair = self.getPairedPos(otherFamily, position) if otherFamilyPosPair != -1: #otherwise, if == -1, there is nothing to calculate (costOtherStruct stays the same) costSum = 0 for nuc in SpeciesNode.NUCLEOS: #we're not setting the nucleo at position: otherFamilyPosPair, so we take into account all possible nucleotides #not counting substitutions for otherFamilyPosPair for now: #costSum += bpMatrix[self.nucToInd(nucleo_sChild[0])][self.nucToInd(nuc)] #we can always access nucleo_sChild[0], whether 'position' is paired or not #October 22nd 2015: Pretty sure that the line above had a bug. We need to look at all possible basepairs with the proposed nucleotide, not the child one costSum += bpMatrix[self.nucToInd(nucleo_s[0])][self.nucToInd(nuc)] #we can always access nucleo_sChild[0], whether 'position' is paired or not costOtherStruct += costSum / 4 #adding the average cost for the 4 different nucleos possible if posPair != -1: #paired in the current struct, so we have to consider 'posPair' in the other structure #checking for posPair otherFamilyPosPair = self.getPairedPos(otherFamily, posPair) if otherFamilyPosPair != -1: #otherwise, if == -1, there is nothing to calculate (costOtherStruct stays the same) costSum = 0 for nuc in SpeciesNode.NUCLEOS: #we're not setting the nucleo at position: otherFamilyPosPair, so we take into account all possible nucleotides #not counting substitutions for otherFamilyPosPair for now: #costSum += bpMatrix[self.nucToInd(nucleo_sChild[1])][self.nucToInd(nuc)] #here it is safe to access nucleo_sChild[1] #October 22nd 2015: Pretty sure that the line above had a bug. We need to look at all possible basepairs with the proposed nucleotide, not the child one costSum += bpMatrix[self.nucToInd(nucleo_s[1])][self.nucToInd(nuc)] #here it is safe to access nucleo_s[1] costOtherStruct += costSum / 4 #adding the average cost for the 4 different nucleos possible return costCurrentStruct + (1-T) * costOtherStruct #(1-T) multiplies costOtherStruct, only because it contains only bp costs, as of now #The exact method, using MutatedSequence objects, to enumerate all possibles mutated sequences for each internal node and their transition cost #This method is called on every internal node, nbMut is the maximum number of mutations allowed on every branch def enumerateMutSeqs(self, familyIndex, nbMut): leftMutSeqsList = self.leftChild.getListMutSeqs(familyIndex) rightMutSeqsList = self.rightChild.getListMutSeqs(familyIndex) setMutSeqsLeftBranch = set() for mutSeq in leftMutSeqsList: #working on the left branch setMutSeqsLeftBranch.update(mutSeq.getSetReachableMutSeqsStrings(nbMut)) setMutSeqsRightBranch = set() for mutSeq in rightMutSeqsList: #working on the right branch setMutSeqsRightBranch.update(mutSeq.getSetReachableMutSeqsStrings(nbMut)) intersectionSet = setMutSeqsLeftBranch & setMutSeqsRightBranch #finding the intersection #test# #print("Intersection of node " + str(self.id) + " : " + str(intersectionSet)) if len(intersectionSet) == 0: print("ERROR: Can't reconstruct the history with the Enumerate algorithm because the number of mutations allowed per branch is too low for this tree.") sys.exit() #Calculating the costs (left + right branches) for each sequence (string) in the intersection set T = self.getT() currentStruct = self.getStruct(familyIndex) if familyIndex == 0: #considering there are only 2 families otherFamilyIndex = 1 else: otherFamilyIndex = 0 otherStruct = self.getStruct(otherFamilyIndex) for seqString in intersectionSet: minCostLeft = float("inf") #minimum cost on the left branch minMutSeqLeft = None #considering only one possible optimal child for now for mutSeq in leftMutSeqsList: cost = mutSeq.getCostOfMutSeq(seqString, currentStruct, otherStruct, T) if cost < minCostLeft: minCostLeft = cost minMutSeqLeft = mutSeq minCostRight = float("inf") #minimum cost on the right branch minMutSeqRight = None #considering only one possible optimal child for now for mutSeq in rightMutSeqsList: cost = mutSeq.getCostOfMutSeq(seqString, currentStruct, otherStruct, T) if cost < minCostRight: minCostRight = cost minMutSeqRight = mutSeq #Once we have the minimums on each branch, we can create the MutatedSequence object for the internal node intNodeMutSeq = MutatedSequence(seqString, minMutSeqLeft, minMutSeqRight, minCostLeft + minCostRight) self.addToListMutatedSequences(familyIndex, intNodeMutSeq) #adding the object to the list #print("For node " + str(self.id) + ", adding mutseq: " + intNodeMutSeq.sequence + "(" + "cost=" + str(intNodeMutSeq.cost)) self.shortenListMutatedSequences(familyIndex) #keeping only the best candidates #----------------------------------------------------------------------------------------------------------------------------------------- ############### ## Main code ## ############### ### ### Methods ### #RNAfold stuff RNAFOLD = 'RNAfold' R = 0.0019872041 T = 310.15 def weight_in_ensemble(seq, ss): def frequency_ensemble(weight, en_ensemble): return exp((-en_ensemble+weight)/(R*T)) def energy_freq(out_rnafold): out_rnafold = out_rnafold.split(b'\n') en = out_rnafold[2] en = float(en[en.rfind(b'(') + 1:en.rfind(b')')]) en_ensemble = float(re.match(b'[^\d]+(-\d+\.\d+)[^\d]+', out_rnafold[3]).groups()[0]) return en, en_ensemble tmp_file = NTF(dir='.', delete=False) tmp_file.write(bytes('>\n%s' % seq, 'UTF-8')) tmp_file.close() try: mfe = check_output([RNAFOLD, '--noPS', '-p0'], stdin=open(tmp_file.name, "r")) except: print("WTF") print(seq) print(ss) sys.exit(1) finally: os.remove(tmp_file.name) tmp_file = NTF(dir='.', delete=False) tmp_file.write(bytes('>\n%s\n%s' % (seq, ss.replace('.', 'x')), 'UTF-8')) tmp_file.close() try: const = check_output([RNAFOLD, '--noPS', '-p0', '-C'], stdin=open(tmp_file.name, "r")) except: print("WTF") print(seq) print(ss) sys.exit(1) finally: os.remove(tmp_file.name) _, mfe_ensemble = energy_freq(mfe) constr_en, _ = energy_freq(const) return frequency_ensemble(mfe_ensemble, constr_en) #To create a SpeciesTree, for testing purposes def createATree(): #speciesTree = SpeciesTree(["(())", "()()"]) #root = SpeciesNode(None, None, None, speciesTree) #speciesTree.root = root #root.leftChild = SpeciesNode(root, None, None, speciesTree) #root.leftChild.addSequence(0, "ACGT") #root.leftChild.addSequence(1, "ATCG") #root.rightChild = SpeciesNode(root, None, None, speciesTree) #root.rightChild.addSequence(0, "AATT") #root.rightChild.addSequence(1, "ATCC") #speciesTree = SpeciesTree(["(())", "()()"]) speciesTree = SpeciesTree([[3, 2, 1, 0], [1, 0, 3, 2]]) listNodes = [] for x in range(7): listNodes.append(SpeciesNode(None, None, None, speciesTree)) speciesTree.root = listNodes[0] listNodes[0].addSequence(0, "GAUU") listNodes[0].addSequence(1, "CGCG") listNodes[0].leftChild = listNodes[1] listNodes[0].rightChild = listNodes[2] listNodes[1].parent = listNodes[0] listNodes[2].parent = listNodes[0] listNodes[1].addSequence(0, "GAUU") listNodes[1].addSequence(1, "CGCG") listNodes[2].addSequence(0, "GAUU") listNodes[2].addSequence(1, "CGCG") listNodes[1].leftChild = listNodes[3] listNodes[1].rightChild = listNodes[4] listNodes[3].parent = listNodes[1] listNodes[4].parent = listNodes[1] listNodes[3].addSequence(0, "AAUU") listNodes[3].addSequence(1, "CGCG") listNodes[4].addSequence(0, "AAAU") listNodes[4].addSequence(1, "CGCG") listNodes[2].leftChild = listNodes[5] listNodes[2].rightChild = listNodes[6] listNodes[5].parent = listNodes[2] listNodes[6].parent = listNodes[2] listNodes[5].addSequence(0, "UGUU") listNodes[5].addSequence(1, "CCCG") listNodes[6].addSequence(0, "GAUU") listNodes[6].addSequence(1, "CGCG") return speciesTree #To read a string representing a newick tree with multiple sequences separated by a hyphen (-) and return a SpeciesTree. #Each sequence represents a family (the sequences must be in the same order as the order of the structures in structureList) def getSpeciesTreeFromNewick(newickString, structuresList): phyloTree = Phylo.read(StringIO(newickString), "newick") #print(phyloTree) #print(phyloTree.root) #Reinitializing SpeciesNode.lastId SpeciesNode.lastId = 1 speciesTree = SpeciesTree(structuresList) speciesTree.root = SpeciesNode(None, None, None, speciesTree) #Calling the recursive method on the root getSpeciesTreeFromNewickRec(speciesTree.root, phyloTree.root, speciesTree) return speciesTree #Recursive method to build the SpeciesTree from the newick tree def getSpeciesTreeFromNewickRec(speciesNodeInternal, phyloNodeInternal, speciesTree): #First adding the sequences in the SpeciesNodeInternal sequences = phyloNodeInternal.name.split("-") for familyIndex in range(len(sequences)): speciesNodeInternal.addSequence(familyIndex, sequences[familyIndex]) #Then creating the SpeciesNode objects representing the children nodes counter = 0 for child in phyloNodeInternal: counter += 1 if counter == 1: leftSpeciesNode = SpeciesNode(speciesNodeInternal, None, None, speciesTree) #Specifying the parent in the constructor speciesNodeInternal.leftChild = leftSpeciesNode #Creating the parent-child link phyloLeftChild = child elif counter == 2: rightSpeciesNode = SpeciesNode(speciesNodeInternal, None, None, speciesTree) #Specifying the parent in the constructor speciesNodeInternal.rightChild = rightSpeciesNode #Creating the parent-child link phyloRightChild = child else: #More than 2 children??!! print("---> BIG PROBLEM: newick tree is not binary.") if counter == 2: #Should be 0 or 2. If counter == 0, it means that we're working on a leaf, and there is nothing more to do #Calling the recursive method on the left and right children nodes getSpeciesTreeFromNewickRec(leftSpeciesNode, phyloLeftChild, speciesTree) getSpeciesTreeFromNewickRec(rightSpeciesNode, phyloRightChild, speciesTree) #Reading a file containing simulated trees and returns a list of SpeciesTree objects def readFile(filename): file = open(filename, 'r') structuresRead = False structuresParenthesisList = [] #contains the parenthesis representations of the structures structuresPairedPosList = [] #The list of pairedPos lists representing the structures speciesTreesList = [] #the list of SpeciesTree objects that will be returned for line in file: if line[0] == "#": #reading a tree index, we don't really need to store this information structuresRead = True #we're done reading structures if not structuresPairedPosList: for structure in structuresParenthesisList: structuresPairedPosList.append(parenthesisToPairedPos(structure)) #print(structuresPairedPosList) elif not structuresRead: structuresParenthesisList.append(line.strip()) else: #reading a newick string speciesTreesList.append(getSpeciesTreeFromNewick(line.strip(), structuresPairedPosList)) return speciesTreesList #To translate a parenthesis representation of a structure into a list of paired positions def parenthesisToPairedPos(parenthesisString): pairedPosList = [-1] * len(parenthesisString) #initializing the list with -1 everywhere openParenthesisStack = deque() #stack that will be used to put the positions of open parentheses, will be poped when we read a closing parenthesis for pos in range(len(parenthesisString)): char = parenthesisString[pos] if char == '(': openParenthesisStack.append(pos) elif char == ')': posPair = openParenthesisStack.pop() pairedPosList[pos] = posPair pairedPosList[posPair] = pos return pairedPosList ### ### Main ### if __name__ == '__main__': #treeSolutions = createATree() #treeSolutions.printTree() #tree = treeSolutions.cloneWithoutAncSeqs() #tree.setDepths() #important to set the depths of every node #tree.printTree() #print("\nAfter computeFitch:\n") #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Fitch) #tree.printTree() #print("\nAfter computeSankoff:\n") #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Sankoff) #tree.printTree() #print("\nAfter computeSeqAndStruct:\n") #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.SeqAndStruct) #tree.printTree() #print("\nAfter TwoStructsApprox:\n") #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.TwoStructsApprox) #tree.printTree() #print("\nAfter Enumerate:\n") #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Enumerate) #tree.printTree() #SpeciesTree.compareTrees(treeSolutions, tree) ###Testing newick tree #newickSpeciesTreeTest = getSpeciesTreeFromNewick("((4.AAUU-CGCG,5.AAAU-CGCG)2.GAUU-CGCG,(6.UGUU-CCCG,7.GAUU-CGCG)3.GAUU-CGCG)1.GAUU-CGCG;", [[3, 2, 1, 0], [1, 0, 3, 2]]) #newickSpeciesTreeTest.printTree() #newickTreeNoSolutions = newickSpeciesTreeTest.cloneWithoutAncSeqs() #newickTreeNoSolutions.setDepths() #don't forget to call setDepths before reconstructing the ancestral sequences #newickTreeNoSolutions.printTree() ###Test read file #print("\nTEST READ FILE\n") #speciesTreesList = readFile("simulatedTrees.txt") #################### ## The real thing ## #################### if len(sys.argv) < 3: print("\n-USAGE: python fitchAndCompany.py algoNumber simulFile [maxNbMutations] [maxListSize]") print("\nalgoNumber: 1=Fitch 2=Sankoff 3=Sankoff + one structure 4=Sankoff + two structures 5=Enumerate") print("simulFile: the file containing the 2 structures and a list of simulated trees in Newick format.") print("maxNbMutations: optional argument for Enumerate only. The maximum number of mutations that can be inferred on one branch. (DEFAULT=1)") print("maxListSize: optional argument for Enumerate only. The maximum number best candidate sequences kept at each internal node during the bottom-up step. (DEFAULT=1000)") sys.exit() algoNumber = int(sys.argv[1]) if algoNumber == 1: algo = SpeciesTree.ALGO.Fitch elif algoNumber == 2: algo = SpeciesTree.ALGO.Sankoff elif algoNumber == 3: algo = SpeciesTree.ALGO.SeqAndStruct elif algoNumber == 4: algo = SpeciesTree.ALGO.TwoStructsApprox elif algoNumber == 5: algo = SpeciesTree.ALGO.Enumerate else: print("\nError: invalid algoNumber.") sys.exit() startTime = time.time() #to measure time of computation speciesTreesList = readFile(sys.argv[2]) #reading the simulFile argument if len(sys.argv) >= 4: #maxNbMutations specified SpeciesTree.NB_MUTATIONS = int(sys.argv[3]) if len(sys.argv) >= 5: #maxListSize specified (note that to specify a maxListSize, you need to specify maxNbMutations before) SpeciesNode.LIST_MUT_SEQS_MAX_SIZE = int(sys.argv[4]) counter = 1 for simulTree in speciesTreesList: print("\nTree # " + str(counter)) #simulTree.printTree() #sys.exit() inferenceTree = simulTree.cloneWithoutAncSeqs() inferenceTree.setDepths() #necessary inferenceTree.reconstructAncestralSeqs(algo) inferenceTree.printTree() #--> useful to look at the solutions for every node of the tree totalNumberOfErrors, totalNumberOfNucleos, totalNumberOfOptimalSeqs, averageNumberOfErrors, totalNumberOfNucleos_withStruct, totalNumberOfErrors_withStruct, totalNumberOfNucleos_withoutStruct, totalNumberOfErrors_withoutStruct, averageNumberOfErrors_withStruct, averageNumberOfErrors_withoutStruct, numberOfNodesInTheTree, totalNumberOfErrors_root, totalNumberOfOptimalSeqs_root, averageNumberOfErrors_root, averageNumberOfErrors_root_withoutStruct, averageNumberOfErrors_root_withStruct = SpeciesTree.compareTrees(simulTree, inferenceTree) for i in range(len(totalNumberOfErrors)): if i == 0: print("\nFAMILY 0 ONLY:\n") elif i == 1: print("\nFAMILY 1 ONLY:\n") elif i == 2: print("\nALL FAMILIES (STRUCTS):\n") ###TOTAL print("Total: " + str(totalNumberOfErrors[i]) + " errors / " + str(totalNumberOfNucleos[i]) + " nucleotides (" + str(totalNumberOfErrors[i]/totalNumberOfNucleos[i]*100) + " % error)") print("Total for unstructured positions: " + str(totalNumberOfErrors_withoutStruct[i]) + " errors / " + str(totalNumberOfNucleos_withoutStruct[i]) + " nucleotides (" + str(totalNumberOfErrors_withoutStruct[i]/totalNumberOfNucleos_withoutStruct[i]*100) + " % error)") print("Total for structured positions: " + str(totalNumberOfErrors_withStruct[i]) + " errors / " + str(totalNumberOfNucleos_withStruct[i]) + " nucleotides (" + str(totalNumberOfErrors_withStruct[i]/totalNumberOfNucleos_withStruct[i]*100) + " % error)") ###AVERAGE print("Average (over " + str(totalNumberOfOptimalSeqs[i]) + " optimal sequences): " + str(averageNumberOfErrors[i]) + " aver. errors per optimal sequence of length = " + str(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i]) + " nucleotides (" + str(averageNumberOfErrors[i]/(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i])*100) + " % error)") print("Average for unstructured positions: " + str(averageNumberOfErrors_withoutStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_withoutStruct[i]/(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)") print("Average for structured positions: " + str(averageNumberOfErrors_withStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_withStruct[i]/(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)") ###ROOT print("-For the root only:") print("Total: " + str(totalNumberOfErrors_root[i]) + " errors") print("Average (over " + str(totalNumberOfOptimalSeqs_root[i]) + " optimal sequences): " + str(averageNumberOfErrors_root[i]) + " aver. errors per optimal sequence of length = " + str(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i]) + " nucleotides (" + str(averageNumberOfErrors_root[i]/(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i])*100) + " % error)") print("Average for unstructured positions: " + str(averageNumberOfErrors_root_withoutStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_root_withoutStruct[i]/(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)") print("Average for structured positions: " + str(averageNumberOfErrors_root_withStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_root_withStruct[i]/(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)") elapsedTime = time.time() - startTime print("\n### Computation took " + str(elapsedTime) + " seconds.")