fitchAndCompany.py 84.4 KB
Newer Older
Vladimir Reinharz's avatar
v1.0  
Vladimir Reinharz committed
1
###############################################################################
Vladimir Reinharz's avatar
typo    
Vladimir Reinharz committed
2
#Copyright (c) 2015, Olivier Tremblay Savard, Vladimir Reinharz               #
Vladimir Reinharz's avatar
v1.0  
Vladimir Reinharz committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
#                                                  & Jerome Waldispuhl        #
#All rights reserved.                                                         #
#                                                                             #
#Redistribution and use in source and binary forms, with or without           #
#modification, are permitted provided that the following conditions are met:  #
#* Redistributions of source code must retain the above copyright             #
#notice, this list of conditions and the following disclaimer.                #
#* Redistributions in binary form must reproduce the above copyright          #
#notice, this list of conditions and the following disclaimer in the          #
#documentation and/or other materials provided with the distribution.         #
#* Neither the name of the <organization> nor the                             #
#names of its contributors may be used to endorse or promote products         #
#derived from this software without specific prior written permission.        #
#                                                                             #
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  #
#AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE    #
#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE   #
#ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY       #
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES   #
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; #
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND  #
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT   #
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS#
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE                  #
###############################################################################

#####################################################################################
# Ancestral RNA sequence reconstruction using sequence and 2D structure information
#
# Contains also the regular Fitch and Sankoff algorithms.
#####################################################################################


from enum import Enum
from collections import defaultdict
from collections import OrderedDict
from collections import deque
from pprint import pprint
import sys

from Bio import Phylo
from io import StringIO

import time

#For RNAfold stuff
import os
import re
from math import exp
from subprocess import check_output
from tempfile import NamedTemporaryFile as NTF


#------------------------------------------------------------------------------------------------------------------------------------------------------

###
### Class MutatedSequence: contains a sequence and has methods to find all mutated sequences that can be reached from this sequence
###
class MutatedSequence:

    #Constructor
    def __init__(self, sequence = None, leftChild = None, rightChild = None, cost = None):

        self.sequence = sequence  #the sequence representing this object (string)
        self.leftChild = leftChild  #the left child from which this sequence was obtained by a mutation process (bottom-up)
        self.rightChild = rightChild  #the right child from which this sequence was obtained by a mutation process (bottom-up)
        self.cost = cost  #the cost of childSequence + the cost of going from childSequence to this sequence


    ###
    ### Accessors
    ###

    #Returns a set of sequences (strings) that can be reached by doing a maximum of nbMuts mutations from this MutatedSequence.
    #isLeftChild is true if we're looking for sequences on the left branch of the SpeciesTree, false if it's for the right branch
    def getSetReachableMutSeqsStrings(self, nbMuts):

        #print(self.sequence)
        setMutSeqsStrings = set([self.sequence])  #a set of strings, self.sequence represents the sequence obtained after 0 mutations

        for x in range(nbMuts):  #repeating this nbMuts times

            newListMut = []  #temporary list
            
            for mutSeqString in setMutSeqsStrings:

                #print(mutSeqString)

                for pos in range(len(mutSeqString)):  #going through all the positions

                    nucAtPos = SpeciesNode.nucToNucDict[mutSeqString[pos]]  #translating just in case
                    for nuc in SpeciesNode.NUCLEOS:
                        if nucAtPos != nuc:  #not changing a nucleotide for the same nucleotide
                            newSequence = mutSeqString[:pos] + nuc + mutSeqString[pos+1:]
                            #print("Before mutation, sequence was = " + mutSeqString + "  and new sequence = " + newSequence)
                            newListMut.append(newSequence)
            setMutSeqsStrings.update(newListMut)  #adding the whole list to the set

        return setMutSeqsStrings  #the set of sequences (strings) that can be reached by a max of nbMut mutations from this MutatedSequence Object


    #Returns the cost of going from the sequence represented by this object, to the mutSeq parameter (string)
    #We don't count the cost of the whole sequence, just the effect of the modified nucleotides, and we add that to the cost of this object
    #currentStruct and otherStruct are lists of paired positions, T is the T parameter coming from the SpeciesNode
    def getCostOfMutSeq(self, mutSeq, currentStruct, otherStruct, T):

        subMatrix = SpeciesNode.SANKOFF_MATRIX
        bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE

        if len(mutSeq) != len(self.sequence):
            print("Big problem, sequences are not the same length.")
            print("mutSeq: " + mutSeq + " and self.sequence: " + self.sequence)

        listMutatedPos = []
        cost = self.cost  #initializing with the cost of this MutatedSequence
        for pos in range(len(mutSeq)):

            if mutSeq[pos] != self.sequence[pos]:
                listMutatedPos.append(pos)  #keeping track of the mutated positions

                #print("TEST positional argument: " + self.sequence[pos] + "   " + mutSeq[pos])
                cost += subMatrix[SpeciesNode.nucleoToIndexDict[self.sequence[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[pos]]]  #substitutions are not multiplied by T

                #basepair cost, currentStruct
                posPair = currentStruct[pos]

                if posPair != -1 and posPair not in listMutatedPos:  #only if it is paired and if we didn't already consider this position in the cost because it was mutated too
                    bpCostAfter = bpMatrix[SpeciesNode.nucleoToIndexDict[mutSeq[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[posPair]]] #we just want the bpCost of the positions in mutSeq

                    cost += T * bpCostAfter

                #basepair cost, otherStruct
                posPair = otherStruct[pos]

                if posPair != -1 and posPair not in listMutatedPos:  #only if it is paired in otherStruct and the position wasn't already considered in the cost
                    bpCostAfter = bpMatrix[SpeciesNode.nucleoToIndexDict[mutSeq[pos]]][SpeciesNode.nucleoToIndexDict[mutSeq[posPair]]] #we just want the bpCost of the positions in mutSeq

                    cost += (1-T) * bpCostAfter

        return cost  #we return cost, which corresponds to the transition cost

                    
    #Clone
    def clone():

        return MutatedSequence(self.sequence, self.leftChild, self.rightChild, self.cost)

    ###
    ### Mutators
    ###
        

#------------------------------------------------------------------------------------------------------------------------------------------------------

###
### Class SpeciesTree: information on the whole species tree, methods to reconstruct ancestral sequences.
###
class SpeciesTree:

    #Class variables
    ALGO = Enum('ALGO', 'Fitch Sankoff SeqAndStruct TwoStructsApprox Enumerate')  #an Enum representing the different scoring functions for the ancestral sequence reconstruction
    NB_MUTATIONS = 1  #the maximum number of mutations to consider for the enumerate algorithm

    ###
    ### Static methods
    ###

    #To compare 2 trees: "simulTree" is the simulated tree and "inferredTree" is the tree containing the inferred ancestral sequences.
    #Obviously, the 2 trees are required to have the exact same topology and node ids
    #This returns the total number of errors followed by the total number of nucleotides in the simulated tree (for all sequences in the tree) in a list.
    def compareTrees(simulTree, inferredTree, verbose=False):

        if verbose:
            print("\nCOMPARING TREES")
        queue = deque()
        queue.appendleft(simulTree.root)
        queue.appendleft(inferredTree.root)  #putting the nodes from each tree one after the other in the same queue, yolo

        ### THE STATS ###

        #All variables are lists of ints, representing, in this order: [family1, family2, all]
        
        #For the complete tree:
        totalNumberOfNucleos = [0, 0, 0]  #Will count the total number of nucleotides in the simulated tree (for all sequences of all families)
        totalNumberOfErrors = [0, 0, 0]  #Will count the total number of errors relative to the simulated sequences in all the tree

        totalNumberOfOptimalSeqs = [0, 0, 0] #Will count the total number of optimal sequences for all the nodes in the species tree; this will allow us to compute the average nb of errors
        averageNumberOfErrors = [0, 0, 0]  #Will count the average number of errors over all the optimal solutions; will be divided by totalNumberOfOptimalSeqs

        #For the positions with/without structure
        totalNumberOfNucleos_withStruct = [0, 0, 0]  #Will count the total number of nucleotides that are part of a structure
        totalNumberOfErrors_withStruct = [0, 0, 0]  #Will count the total number of errors relative to the simulated sequences in all the tree at structured positions

        totalNumberOfNucleos_withoutStruct = [0, 0, 0]  #Will count the total number of nucleotides that are not part of a structure
        totalNumberOfErrors_withoutStruct = [0, 0, 0]  #Will count the total number of errors relative to the simulated sequences in all the tree at unstructured positions

        averageNumberOfErrors_withStruct = [0, 0, 0]  #Will count the average number of errors over all the optimal solutions in all structured positions; divided by totalNumberOfOptimalSeqs
        averageNumberOfErrors_withoutStruct = [0, 0, 0]  #Will count the average number of errors over all the optimal solutions in all unstructured positions; divided by totalNumberOfOptimalSeqs
        numberOfNodesInTheTree = [0, 0, 0]  #Will count the number of nodes for family 1, family 2, and the 2 together (result should be [x, x, 2x])

        #For the root only
        totalNumberOfErrors_root = [0, 0, 0]
        totalNumberOfOptimalSeqs_root = [0, 0, 0]  #With the new findSuperAncestorAndUpdateMatrix method, we should have the same number for fam 0 and fam 1
        averageNumberOfErrors_root = [0, 0, 0]
        averageNumberOfErrors_root_withoutStruct = [0, 0, 0]
        averageNumberOfErrors_root_withStruct = [0, 0, 0]
        
        #################

        while queue:  #while the queue is not empty

            theNodeSimul = queue.pop()
            theNodeInfer = queue.pop()

            #Making sure the ids are identical
            if theNodeSimul.id != theNodeInfer.id:
                print("WARNING: Node ids are different. I did not expect that...")

            if verbose:
                print("- Node: " + str(theNodeSimul.id))
            
            #Comparing the sequences
            for familyIndex in range(simulTree.getNbStructs()):

                #simulSeq = theNodeSimul.getSequence(familyIndex, 0)  #there is only one sequence for each family in the simulated tree
                simulSeq = theNodeSimul.getOnlySequence(familyIndex)  #NEW: updated to deal with sets
                
                minNbErrors = float("inf")
                minNbErrorsWithStruct = float("inf")
                minNbErrorsWithoutStruct = float("inf")

                totalNumberOfOptimalSeqs[familyIndex] += theNodeInfer.getNbSeqs(familyIndex)  #adding the number of seqs for the current familyIndex
                totalNumberOfOptimalSeqs[2] += theNodeInfer.getNbSeqs(familyIndex)  #adding the number of seqs for all (family1 and family2)
                numberOfNodesInTheTree[familyIndex] += 1
                numberOfNodesInTheTree[2] += 1

                #Counting the number of positions with structure and without structure (only once for each family for each node)
                for pos in range(len(simulSeq)):

                    pairedPos = theNodeInfer.getPairedPos(familyIndex, pos)  #theNodeInfer and theNodeSimul should have the same structs and same seq length
                    if pairedPos == -1:  #unpaired
                        totalNumberOfNucleos_withoutStruct[familyIndex] += 1  #Those numbers will have to be divided by the totalNumberOfOptimalSeqs
                        totalNumberOfNucleos_withoutStruct[2] += 1

                    else:  #paired
                        totalNumberOfNucleos_withStruct[familyIndex] += 1
                        totalNumberOfNucleos_withStruct[2] += 1

                for seq in theNodeInfer.getSequencesListsSet(familyIndex):

                    #seq = theNodeInfer.getSequence(familyIndex, seqIndex)  #NEW: now seq is in the loop

                    if len(simulSeq) != len(seq):
                        print("Big problem: sequences are not the same length...")

                    else:
                        nbErrors = 0
                        nbErrorsWithStruct = 0
                        nbErrorsWithoutStruct = 0
                        for pos in range(len(seq)):

                            pairedPos = theNodeInfer.getPairedPos(familyIndex, pos)

                            if SpeciesNode.nucToNucDict[simulSeq[pos]] != SpeciesNode.nucToNucDict[seq[pos]]:  #using the translation, just in case
                                nbErrors += 1
                                averageNumberOfErrors[familyIndex] += 1  #adding all the errors, we'll divide it by totalNumberOfOptimalSeqs at the end
                                averageNumberOfErrors[2] += 1  #for all

                                if pairedPos == -1:  #unpaired
                                    nbErrorsWithoutStruct += 1
                                    averageNumberOfErrors_withoutStruct[familyIndex] += 1
                                    averageNumberOfErrors_withoutStruct[2] += 1

                                else:  #paired
                                    nbErrorsWithStruct += 1
                                    averageNumberOfErrors_withStruct[familyIndex] += 1
                                    averageNumberOfErrors_withStruct[2] += 1

                        if nbErrors < minNbErrors:
                            minNbErrors = nbErrors
                            minNbErrorsWithStruct = nbErrorsWithStruct
                            minNbErrorsWithoutStruct = nbErrorsWithoutStruct

                if verbose:
                    #Printing a report
                    print("Family : " + str(familyIndex) + " --> minNbErrors = " + str(minNbErrors) + " (" + str(minNbErrors/len(simulSeq) * 100) + " %)")

                #updating stats for the whole tree
                totalNumberOfErrors[familyIndex] += minNbErrors
                totalNumberOfErrors[2] += minNbErrors  #updating all in the list
                totalNumberOfNucleos[familyIndex] += len(simulSeq)
                totalNumberOfNucleos[2] += len(simulSeq)

                totalNumberOfErrors_withoutStruct[familyIndex] += minNbErrorsWithoutStruct
                totalNumberOfErrors_withoutStruct[2] += minNbErrorsWithoutStruct

                totalNumberOfErrors_withStruct[familyIndex] += minNbErrorsWithStruct
                totalNumberOfErrors_withStruct[2] += minNbErrorsWithStruct
            
            if not theNodeSimul.isLeaf():  #only checking one, both of them should be leaves at the same time anyway
                queue.appendleft(theNodeSimul.leftChild)
                queue.appendleft(theNodeInfer.leftChild)
                queue.appendleft(theNodeSimul.rightChild)
                queue.appendleft(theNodeInfer.rightChild)

                if theNodeInfer.isRoot(): #We save the stats for the root only
                    for index in range(len(totalNumberOfErrors)):
                        totalNumberOfErrors_root[index] = totalNumberOfErrors[index]
                        totalNumberOfOptimalSeqs_root[index] = totalNumberOfOptimalSeqs[index]
                        averageNumberOfErrors_root[index] = averageNumberOfErrors[index] / totalNumberOfOptimalSeqs_root[index]  #We calculate the averages immediately
                        averageNumberOfErrors_root_withoutStruct[index] = averageNumberOfErrors_withoutStruct[index] / totalNumberOfOptimalSeqs_root[index]
                        averageNumberOfErrors_root_withStruct[index] = averageNumberOfErrors_withStruct[index] / totalNumberOfOptimalSeqs_root[index]

        for x in range(3):  #Calculating the averages -> numbers will be for one sequence only (average nb errors for 1 optimal sequence in the tree)
            averageNumberOfErrors[x] /= totalNumberOfOptimalSeqs[x]
            averageNumberOfErrors_withStruct[x] /= totalNumberOfOptimalSeqs[x]
            averageNumberOfErrors_withoutStruct[x] /= totalNumberOfOptimalSeqs[x]

        return totalNumberOfErrors, totalNumberOfNucleos, totalNumberOfOptimalSeqs, averageNumberOfErrors, totalNumberOfNucleos_withStruct, totalNumberOfErrors_withStruct, totalNumberOfNucleos_withoutStruct, totalNumberOfErrors_withoutStruct, averageNumberOfErrors_withStruct, averageNumberOfErrors_withoutStruct, numberOfNodesInTheTree, totalNumberOfErrors_root, totalNumberOfOptimalSeqs_root, averageNumberOfErrors_root, averageNumberOfErrors_root_withoutStruct, averageNumberOfErrors_root_withStruct
                
    
    #Constructor
    def __init__(self, structuresList, root = None):

        ### Data attributes
        self.nbStructs = len(structuresList)  #the number of different RNA families (with different structures)
        self.structuresList = structuresList  #a list a all the structures (one per family). A structure is represented by a list(int) of size sequenceLength which
                                            #gives for each position the position of the nucleo it is paired with (or -1 if unpaired)
        self.sequenceLength = len(structuresList[0])  #the length of every sequence (right now we have the same length for family 1 and family 2)
        self.root = root  #The root of the tree (SpeciesNode)
        ##useless##self.leavesList = leavesList  #a list of all the leaves of the tree (list of SpeciesNode)
        self.structuresList = structuresList  #a list of structures -> one structure is an array of size seqLength giving the position of the other base or -1
        self.maxDepth = -1  #the maximum depth of the tree, will be updated after the call to setDepths

        
    ###
    ### Accessors
    ###

    def getNbStructs(self):
        return self.nbStructs

    
    def getSeqLength(self):
        return self.sequenceLength

    
    def getRoot(self):
        return self.root


    def isRoot(self, node):
        if node == self.root:
            return True

        return False


    def isLeaf(self, node):
        if node.leftChild == None and node.rightChild == None:
            return True

        return False


    #Returns the list of paired positions for familyIndex
    def getStruct(self, familyIndex):
        return self.structuresList[familyIndex]


    #To print the whole tree in level order.
    #printCosts is a boolean indicating if we want to print the cost of every nucleotide for every position.
    def printTree(self, printCosts = False):
        queue = deque()
        queue.append(self.root)

        while queue:  #while the queue is not empty

            theNode = queue.pop()
            theNode.printNode(printCosts)
            if not theNode.isLeaf():
                queue.appendleft(theNode.leftChild)
                queue.appendleft(theNode.rightChild)

        
    #To print everything below the given node (including the node)
    def printSubtree(self, node):

        if node == None:
            return
        
        node.printNode()
        self.printSubtree(node.leftChild)
        self.printSubtree(node.rightChild)


    #To create a clone of the tree for which we remove the sequences at the ancestral nodes (to create a copy of the simulated tree, without the answers (ie anc. sequences))
    def cloneWithoutAncSeqs(self):

        treeClone = SpeciesTree(self.structuresList)  #creating the new tree with the same structures and root -> not using a copy of structuresList, but shouldn't be a problem
        treeClone.root = self.root.clone(treeClone)  #setting the root

        self.cloneWithoutAncSeqsRec(self.root, treeClone.root, treeClone)  #calling the recursive method

        return treeClone

                
    #Recursive method to clone the tree
    #internalNode and internalNodeClone correspond to the same node
    def cloneWithoutAncSeqsRec(self, internalNode, internalNodeClone, treeClone):

        if internalNode.isLeaf():
            return
        
        leftChildClone = internalNode.leftChild.clone(treeClone)
        rightChildClone = internalNode.rightChild.clone(treeClone)

        internalNodeClone.leftChild = leftChildClone
        internalNodeClone.rightChild = rightChildClone

        leftChildClone.parent = internalNodeClone
        rightChildClone.parent = internalNodeClone

        #Recursive calls
        self.cloneWithoutAncSeqsRec(internalNode.leftChild, leftChildClone, treeClone)
        self.cloneWithoutAncSeqsRec(internalNode.rightChild, rightChildClone, treeClone)
        

    ###
    ### Mutators
    ###

    #Method to set the depth of every node in the tree (root has depth 0)
    def setDepths(self):

        depth = 0
        self.setDepthsRec(self.root, depth)  #call to the recursive method

    #Recursive method to set the depths of the nodes.
    #Parameter 'node' is the node to update and 'depth' is the depth it has.
    def setDepthsRec(self, node, depth):

        node.depth = depth
        if depth > self.maxDepth:
            self.maxDepth = depth
        if not node.isLeaf():
            self.setDepthsRec(node.leftChild, depth+1)
            self.setDepthsRec(node.rightChild, depth+1)
        


    #General method to do the reconstruction; calls the bottom-up step and then the top-down step
    def reconstructAncestralSeqs(self, algo):

        if algo == SpeciesTree.ALGO.Enumerate:
            for familyIndex in range(self.getNbStructs()):
                self.enumerateMutatedSequences(self.root, SpeciesTree.NB_MUTATIONS, familyIndex)
            for familyIndex in range(self.getNbStructs()):
                #test
                #for x in range(len(self.root.listMutatedSequences[familyIndex])):
                #    print(str(x) + ": " + self.root.listMutatedSequences[familyIndex][x].sequence + "  cost = " + str(self.root.listMutatedSequences[familyIndex][x].cost))

                minCost = self.root.listMutatedSequences[familyIndex][0].cost  #Since listMutatedSequences should be sorted, the best cost is at [0]
                index = 0
                while self.root.listMutatedSequences[familyIndex][index].cost == minCost:  #this will consider all the optimal sequences
                    self.selectMutatedSequences(familyIndex, self.root, self.root.listMutatedSequences[familyIndex][index])
                    index += 1
            
        else:
            self.calculateCosts(algo)
            self.selectNucleos(algo)


    #The bottom-up step for the Enumerate algorithm, uses a post-order traversal
    def enumerateMutatedSequences(self, node, nbMuts, familyIndex):

        if node.isLeaf():
            node.addToListMutatedSequences(familyIndex, MutatedSequence(node.getOnlySequence(familyIndex), None, None, 0))  #Creating the only MutatedSequence object for the leaf

        else:  #internal node
            self.enumerateMutatedSequences(node.leftChild, nbMuts, familyIndex)
            self.enumerateMutatedSequences(node.rightChild, nbMuts, familyIndex)

            node.enumerateMutSeqs(familyIndex, nbMuts)


    #The top-down step for the Enumerate algorithm. optimalMutSeq is the MutatedSequence object that is on the optimal path from the root to the 'node' parameter.
    def selectMutatedSequences(self, familyIndex, node, optimalMutSeq):

        if not node.isLeaf():  #otherwise, there is nothing to do

            node.addSequence(familyIndex, optimalMutSeq.sequence)  #adding the sequence as an optimal ancestral sequence

            self.selectMutatedSequences(familyIndex, node.leftChild, optimalMutSeq.leftChild)
            self.selectMutatedSequences(familyIndex, node.rightChild, optimalMutSeq.rightChild)  #recursive calls

            
    #The bottom-up step, calculating the costs using the algo specified by the parameter 'algo'
    def calculateCosts(self, algo):
        self.root.calculateCosts(algo)


    #The top-down step, called after calculateCosts; this method selects the nucleotide of minimum cost for each position and builds the ancestral sequences
    def selectNucleos(self, algo):

        self.findSuperAncestorAndUpdateMatrix(algo)

        #We start by choosing the nucleotides of minimum cost in the root. WARNING: For now, we only build 1 of all the possible optimal sequences
        for familyIndex in range(self.getNbStructs()):

            #One optimal sequence
            sequenceList = [[" " for x in range(self.getSeqLength())]]  #A list of optimal sequences (each sequence is a list of chars here)
            
            for pos in range(self.getSeqLength()):

                #print(pos)
                #print(len(sequenceList))

                minCost = float("inf")
                nucleoOfMinCost = []  #a list of all optimal nucleos for the position
                
                for key in self.root.posToNucleoCosts[familyIndex][pos]:  #Finding all nucleotides of minimum cost

                    cost = self.root.getCostOf(familyIndex, pos, key)
                    if cost == minCost:
                        #print("appending " + key)
                        nucleoOfMinCost.append(key)  #appending to the list
                    if cost < minCost:
                        #print("Updating minCost: minCost = " + str(cost) + " and nucleo = " + nucleo)
                        minCost = cost
                        nucleoOfMinCost = [key]  #starting a new list

                if len(nucleoOfMinCost[0]) == 1:
                    SpeciesNode.addNucleoToSequencesList(sequenceList, pos, nucleoOfMinCost)
                elif len(nucleoOfMinCost[0]) == 2:  #base pair position
                    posPair = self.root.getPairedPos(familyIndex, pos)
                    if pos < posPair:  #otherwise, it has been updated already
                        #print("pos = " + str(pos) + "  and posPair = " + str(posPair) + " and seqLength = " + str(self.getSeqLength()) + " and len(sequence) = " + str(len(sequence)))
                        SpeciesNode.addNucleoToSequencesList(sequenceList, pos, [nuc[0] for nuc in nucleoOfMinCost])
                        SpeciesNode.addNucleoToSequencesList(sequenceList, posPair, [nuc[1] for nuc in nucleoOfMinCost])

            for seq in sequenceList:  #adding all the sequences
                #print(seq)
                self.root.addSequence(familyIndex, "".join(seq))

        #NEW: Keeping only the intersection of family 0 and family 1 at the root. We want only optimal sequences that were found for both (or more) structures.
        #intersection = self.root.getSequencesListsSet(0) & self.root.getSequencesListsSet(1)  #initializing with the first 2 structures (there are at least 2)
        #for familyIndex in range(2, self.getNbStructs()):  #starting at 2
        #    intersection &= self.root.getSequencesListsSet(familyIndex)

        #if len(intersection) == 0:
        #    print("HUGE PROBLEM: intersection of optimal sequences at the root is empty...")
          #  sys.exit()

        #for familyIndex in range(self.getNbStructs()):
         #   self.root.sequencesLists[familyIndex] = intersection  #putting only the intersection set for all structures at the root 
         
        #Now that the root sequences have been found, we call the SpeciesNode selectNucleos method on the root to select the nucleos for its two children nodes
        self.root.selectNucleos(algo)


    #To calculate the matrix costs, using Fitch, for the super ancestor (ancestor of family 1 and family 2 of the root).
    #Then, this method will update the costMatrix of the root with this super ancestor matrix for both families (same matrix copied for both)
    def findSuperAncestorAndUpdateMatrix(self, algo):

        #First, we need to create a fake tree of 3 nodes, so that we can call computeFitch normally
        fakeSpeciesTree = SpeciesTree([[-1 for i in range(self.getSeqLength())]])  #fake tree with no structure, we don't need it. We need only one list, so that it will be like there is only one family. The length of the structureList is important, as it defines the length of the sequences in the tree.
        superAncNode = SpeciesNode(None, None, None, fakeSpeciesTree)  #corresponds to the super ancestor
        fakeSpeciesTree.root = superAncNode
        leftChild = SpeciesNode(superAncNode, None, None, fakeSpeciesTree)  #corresponds to Family 0 of the root
        rightChild = SpeciesNode(superAncNode, None, None, fakeSpeciesTree)  #corresponds to Family 1 of the root
        superAncNode.leftChild = leftChild
        superAncNode.rightChild = rightChild

        #We need to check if we are dealing with di-nucleotides (which is the case for algos 3 and 4)
        if (algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox):

            for familyIndex in range(self.root.getNbStructs()):
                for pos in range(self.getSeqLength()):
                    posPair = self.root.getPairedPos(familyIndex, pos)

                    if posPair != -1:  #we are dealing with di-nucleotides in the costMatrix

                        dictBestScoreForFirstNucleo = {}  #Will store the lowest cost for A,C,G and U over all the di-nucleotide score (looking only at first position)
                        for dinucleo in SpeciesNode.DINUCLEOS:

                            dinucleoCost = self.root.getCostOf(familyIndex, pos, dinucleo)
                            correspondingNucleo = dinucleo[0]  #first nucleo of the dinucleo is the one corresponding to the current position

                            if correspondingNucleo not in dictBestScoreForFirstNucleo or dinucleoCost < dictBestScoreForFirstNucleo[correspondingNucleo]:
                                dictBestScoreForFirstNucleo[correspondingNucleo] = dinucleoCost

                        for nucleo, minCost in dictBestScoreForFirstNucleo.items():  #We update the costMatrix, adding new key-values for single nucleotides
                            self.root.updatePosToNucleoCosts(familyIndex, pos, nucleo, minCost)     

        
        
        leftChild.posToNucleoCosts[0] = self.root.posToNucleoCosts[0]  #Here, we map the costMatrix of family 0 to the leftChild of the fakeTree
        rightChild.posToNucleoCosts[0] = self.root.posToNucleoCosts[1]  #and the costMatrix of family 1 to the rightChild of the fakeTree

        for pos in range(fakeSpeciesTree.getSeqLength()):  #for each position
            superAncNode.computeFitch(0, pos)  #calling computeFitch for every position, in family 0 only (there is only one family in the fakeTree)

        ###TEST
        #for pos in range(self.getSeqLength()):
        #    costString = "Pos " + str(pos) + ":  "
        #    for key in sorted(superAncNode.posToNucleoCosts[0][pos]):
        #        costString += key + ": " + str(superAncNode.posToNucleoCosts[0][pos][key]) + "   "
        #    print(costString)
        #sys.exit()
            
        #Now the tricky part: we replace the cost matrices of the root (for fam 0 and fam 1) with the matrix of the superAncestor... and we hope for the best.
        self.root.posToNucleoCosts[0] = superAncNode.posToNucleoCosts[0]
        self.root.posToNucleoCosts[1] = superAncNode.posToNucleoCosts[0]  #This is just a copy of the reference, should not be a problem though
                    

        
    
#-------------------------------------------------------------------------------------------------------------------------------------------


###
### Class SpeciesNode: information on a node of the SpeciesTree and methods to calculate scores
###
class SpeciesNode:

    BP_WEIGHT = 0.001  #The weight that will be used to multiply the values in the BASEPAIR_MATRIX_SIMPLE
    #BP_WEIGHT = 0.05  #That was the best so far  (61 and 59 total error for 3 and 4 resp.)
    NUCLEOS = ['A', 'C', 'G', 'U']  #Just a list of the nucleotides, will help with for loops
    DINUCLEOS = ['AA', 'AC', 'AG', 'AU', 'CA', 'CC', 'CG', 'CU', 'GA', 'GC', 'GG', 'GU', 'UA', 'UC', 'UG', 'UU']
    FITCH_MATRIX = [[0, 1, 1, 1], [1, 0, 1, 1], [1, 1, 0, 1], [1, 1, 1, 0]]
    SANKOFF_MATRIX = [[0, 2, 1, 2], [2, 0, 2, 1], [1, 2, 0, 2], [2, 1, 2, 0]]
    #BASEPAIR_MATRIX_SIMPLE = [[5, 5, 5, 1], [5, 5, 0, 5], [5, 0, 4, 3], [1, 5, 4, 5]]  #not bad, but 3 does better than 4
    BASEPAIR_MATRIX_SIMPLE = [[3, 3, 3, 1], [3, 3, 0, 3], [3, 0, 3, 2], [1, 3, 2, 3]]  #this one is not bad at all
    #BASEPAIR_MATRIX_SIMPLE = [[2.5, 2.5, 2.5, 0], [2.5, 2.5, 0, 2.5], [2.5, 0, 2.5, 2.5], [0, 2.5, 2.5, 2.5]]  #first matrix
    for i in range(len(BASEPAIR_MATRIX_SIMPLE)):
        for j in range(len(BASEPAIR_MATRIX_SIMPLE[i])):
            BASEPAIR_MATRIX_SIMPLE[i][j] *= BP_WEIGHT
    #print(BASEPAIR_MATRIX_SIMPLE)
    #sys.exit()
    LIST_MUT_SEQS_MAX_SIZE = 1000  #maximum size of listMutatedSequences -> we want to order the list and keep the 1000 with lowest cost (most promising)
    
    lastId = 1  #Class variable; incrementing every time we call the constructor
    nucleoToIndexDict = {'A': 0, 'a': 0, 'C': 1, 'c': 1, 'G': 2, 'g': 2, 'T': 3, 't': 3, 'U': 3, 'u': 3}  #mapping a nucleotide to an index, starting from 0
    nucToNucDict = {'A': 'A', 'a': 'A', 'C': 'C', 'c': 'C', 'G': 'G', 'g': 'G', 'T': 'U', 't': 'U', 'U': 'U', 'u': 'U'}  #translating nucleotides so that we deal only with A,C,G,U

    ###
    ### static methods
    ###
    
    #To return the index of the nucleo, using nucleoToIndexDict (shorter to write)
    def nucToInd(nucleo):
        return SpeciesNode.nucleoToIndexDict[nucleo]

    
    def addNucleoToSequencesList(sequencesList, position, listOfNucsToAdd):

        #print("Call to addNucleoToSequencesList")
        #print("sequencesList = " + str(sequencesList) + "   position = " + str(position) + "  and listOfNucsToAdd = " + str(listOfNucsToAdd))

        #New: to protect from memory explosion
        if len(sequencesList) > 2500000:
            print("Too many optimal sequences.")
            sys.exit()

        setOfNucsToAdd = set()
        for nuc in listOfNucsToAdd:  #getting rid of duplicates (possibly coming from di-nucleotides) with a set
            setOfNucsToAdd.add(nuc)

        listOfNucsToAdd = list(setOfNucsToAdd)  #rewriting the list
        #print("--- after elimination of duplicates, listOfNucsToAdd = " + str(listOfNucsToAdd))
        
        initialNbSeqs = len(sequencesList)
        for x in range(len(setOfNucsToAdd)-1): #no need to duplicate sequences if there is only one nucleo to add, otherwise we need one additional copy for every additional nucleo to add
            for y in range(initialNbSeqs):
                
                copy = sequencesList[y][:]
                sequencesList.append(copy)
                #print("Appending copy!!!!!!!!!!!!!!!")

        for x in range(len(sequencesList)):
            nucIndex = x // initialNbSeqs  #integer division to find the index of the nucleo to add in the sequences
            sequencesList[x][position] = listOfNucsToAdd[nucIndex]

        #print("At the end, sequencesList = " + str(sequencesList))
    
    #Constructor
    def __init__(self, parent, leftChild, rightChild, speciesTree, id = None):

        ### Data attributes
        self.parent = parent  #the parent node in the tree (root has parent = None)
        self.leftChild = leftChild  #the leftChild in the tree (leaves have leftChild = None)
        self.rightChild = rightChild  #the rightChild in the tree (leaves have rightChild = None)
        self.speciesTree = speciesTree  #a reference to the SpeciesTree that contains this node

        self.posToNucleoCosts = [[defaultdict(lambda: float("inf")) for i in range(speciesTree.getSeqLength())] for j in range(self.getNbStructs())]  #For each family (nbStructs), we have one list of all positions, and at each position a dict mapping a nucleotide (or a dinucleotide) to its cost
        self.sequencesLists = [set() for i in range(speciesTree.getNbStructs())]  #One *set* (*new, before it was a list) of optimal sequences (strings) for every family
        if id is None:
            self.id = SpeciesNode.lastId
            SpeciesNode.lastId += 1
        else:
            self.id = id
        self.depth = -1  #intializing to -1, don't forget to call setDepths() on the tree after it is contructed

        self.listMutatedSequences = [[] for i in range(speciesTree.getNbStructs())]  #list of most promising MutatedSequence objects (only one sequence for leaves) for each family
        
    ###
    ### Accessors
    ###

    def isRoot(self):
        return self.speciesTree.isRoot(self)


    def isLeaf(self):
        return self.speciesTree.isLeaf(self)

    def getSeqLength(self):
        return self.speciesTree.getSeqLength()  #Warning: right now, we assume that all sequences have the same length

    def getNbStructs(self):
        return self.speciesTree.getNbStructs()

    def getPairedPos(self, familyIndex, position):
        return self.speciesTree.structuresList[familyIndex][position]

    def getNbSeqs(self, familyIndex):
        return len(self.sequencesLists[familyIndex])


    #Returns the list of paired positions for familyIndex
    def getStruct(self, familyIndex):
        return self.speciesTree.getStruct(familyIndex)


    #Returns the sequence at seqIndex for the family at familyIndex  ####DEPRECATED; see getOnlySequence(self, familyIndex)
    #def getSequence(self, familyIndex, seqIndex):
        #return self.sequencesLists[familyIndex][seqIndex]  ##We changed it to a set now, so we can't access with an index


    #Returns the only sequence in the set for the specified familyIndex. There should be only one.
    def getOnlySequence(self, familyIndex):
        desiredList = list(self.sequencesLists[familyIndex])
        return desiredList[0]

    #Returns the set for the specified familyIndex
    def getSequencesListsSet(self, familyIndex):
        return self.sequencesLists[familyIndex]


    #For a specified familyIndex, returns the number of optimal sequences (for a leaf, there should be only the input sequence, so just 1) 
    def getNbOfSeqs(self, familyIndex):
        return len(self.sequencesLists[familyIndex])


    #To get a cost inside posToNucleoCosts for a specified familyIndex, position and nucleotide (char)
    def getCostOf(self, familyIndex, position, nucleotide):
        return self.posToNucleoCosts[familyIndex][position][nucleotide]


    #To return the index of the nucleo, using nucleoToIndexDict (shorter to write)
    def nucToInd(self, nucleo):
        return SpeciesNode.nucleoToIndexDict[nucleo]


    #To return a translation of the nucleotide, using nucToNucDict
    def translateNuc(self, nucleo):
        return SpeciesNode.nucToNucDict[nucleo]


    #To return the value of the parameter T depending on depth of the node
    def getT(self):
        
        #using a simple linear function now; depth 0 returns 50% (0.5)
        slope = (1 - 0.5) / self.speciesTree.maxDepth  #range is from 50% (for the root) to 100% (for the leaves)
                                                       #--> 100% will never be returned, because this method will be called on the parent node
        return 0.5 + self.depth * slope


    #To get the listMutatedSequences for a familyIndex
    def getListMutSeqs(self, familyIndex):
        return self.listMutatedSequences[familyIndex]
    
        
    #Prints a description of the node
    def printNode(self, printCosts):

        parentIdString = "None"
        if not self.isRoot():
            parentIdString = str(self.parent.id)

        lcIdString = "None"
        rcIdString = "None"
        if not self.isLeaf():
            lcIdString = str(self.leftChild.id)
            rcIdString = str(self.rightChild.id)
            
        print("\n--- Id = " + str(self.id) + "  Parent = " + parentIdString + "  LeftChild = " + lcIdString + "  RightChild = " + rcIdString + "  Depth = " + str(self.depth) + "  T = " + str(self.getT()))

        for familyIndex in range(self.getNbStructs()):

            print(". Family " + str(familyIndex) + ":")
            for seq in self.sequencesLists[familyIndex]:
                print(seq)
                #pprint(self.posToNucleoCosts[familyIndex], width = 500)  #The costs might not be printed in order, we might have to sort the keys
                if printCosts:
                    for pos in range(self.getSeqLength()):
                        costString = "Pos " + str(pos) + ":  "
                        for key in sorted(self.posToNucleoCosts[familyIndex][pos]):
                            costString += key + ": " + str(self.posToNucleoCosts[familyIndex][pos][key]) + "   "
                        print(costString)



    #To return a clone of this node, without the sequences unless it's a leaf node. Parent and child nodes are not set here.
    def clone(self, clonedSpeciesTree):

        cloneNode = SpeciesNode(None, None, None, clonedSpeciesTree, self.id)  #keeping the same ids
        if self.isLeaf():
            for familyIndex in range(self.getNbStructs()):
                for seq in self.getSequencesListsSet(familyIndex):
                    cloneNode.addSequence(familyIndex, seq)  #copying the sequences, only for the leaves

        return cloneNode
        
                    
    ###
    ### Mutators
    ###

    #To add a MutatedSequence object to listMutatedSequences
    def addToListMutatedSequences(self, familyIndex, mutSeq):
        self.listMutatedSequences[familyIndex].append(mutSeq)


    #To sort listMutatedSequences and keep only the most promising ones (we keep only LIST_MUT_SEQS_MAX_SIZE objects in the list)
    def shortenListMutatedSequences(self, familyIndex):
        
        self.getListMutSeqs(familyIndex).sort(key = lambda x: x.cost)  #Sorting in place using the costs
        self.listMutatedSequences[familyIndex] = self.getListMutSeqs(familyIndex)[:SpeciesNode.LIST_MUT_SEQS_MAX_SIZE]

        #test
        #print("- Node: " + str(self.id) + "  Length of shortened list : " + str(len(self.getListMutSeqs(familyIndex))) + " and first element cost = " + str(self.listMutatedSequences[familyIndex][0].cost))
        

    #To add a sequence to sequencesList at a certain familyIndex
    #Now checks if the sequence is already present; the method adds the sequence only if it is different
    def addSequence(self, familyIndex, sequence):

        if familyIndex >= self.speciesTree.getNbStructs():
            print("familyIndex : " + str(familyIndex) + " doesn't exist! Sequence cannot be added.")
            return

        if sequence not in self.sequencesLists[familyIndex]:
            #self.sequencesLists[familyIndex].append(sequence)
            self.sequencesLists[familyIndex].add(sequence)  #NEW: this is a set now, not a list
            #print("******************* inside addSequence, appending : " + sequence + " for node " + str(self.id) + " and fam " + str(familyIndex))


    #To update posToNucleoCosts for a specified familyIndex, a specified position and a specified nucleotide (char)
    #If addToValue is True, then we add theCost to the one that is there, otherwise we set the value
    def updatePosToNucleoCosts(self, familyIndex, position, nucleotide, theCost, addToValue = False):

        if addToValue:
            self.posToNucleoCosts[familyIndex][position][nucleotide] += theCost
        else:
            self.posToNucleoCosts[familyIndex][position][nucleotide] = theCost


    #Recursive method to calculate the costs using the algo defined by the parameter 'algo' (Bottom-up step)
    def calculateCosts(self, algo):

        if self.isLeaf():  #for the leaves, the costs are either 0 or infinity (infinity is the default cost)

            for familyIndex in range(self.getNbStructs()):  #for each family
                
                if self.isLeaf() and self.getNbOfSeqs(familyIndex) != 1:
                    print("---> Huge problem, leaf node doesn't have exactly 1 sequence for family " + str(familyIndex))

                #sequence = self.getSequence(familyIndex, 0)  #There should be only one sequence, at index 0
                sequence = self.getOnlySequence(familyIndex)  #NEW: updated to deal with sets
                
                for pos in range(self.getSeqLength()):  #for each position

                    nucleo = sequence[pos]
                    posPair = self.getPairedPos(familyIndex, pos)
                    if (algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox) and posPair != -1:  #If we have to consider the secondary structure
                        pairedNuc = sequence[posPair]
                        self.updatePosToNucleoCosts(familyIndex, pos, nucleo+pairedNuc, 0)  #We put the pair of nucleos as the key
                        
                    else:  #if the algo doesn't consider the 2D struct or if this position is not paired
                        self.updatePosToNucleoCosts(familyIndex, pos, nucleo, 0)  #updating only the cost of the nucleotide that is in the sequence at position pos


        else:  #if the current node is an internal node

            #postorder
            self.leftChild.calculateCosts(algo)  #Calculate for left child first
            self.rightChild.calculateCosts(algo)  #then for the right child

            #When the costs have been calculated for both children nodes, calculate for the current node
            for familyIndex in range(self.getNbStructs()):  #for each family
                for pos in range(self.getSeqLength()):  #for each position

                    if algo == SpeciesTree.ALGO.Fitch:
                        self.computeFitch(familyIndex, pos)

                    elif algo == SpeciesTree.ALGO.Sankoff:
                        self.computeSankoff(familyIndex, pos)

                    elif algo == SpeciesTree.ALGO.SeqAndStruct:
                        self.computeSeqAndStruct(familyIndex, pos)

                    elif algo == SpeciesTree.ALGO.TwoStructsApprox:
                        self.computeSeqAndStruct_2structsApprox(familyIndex, pos)


    #The top-down step, called after calculateCosts; this method selects the nucleotide of minimum cost for each position and builds the ancestral sequences
    #This method is called on a node for which we already have selected the optimal sequences, and we find the optimal sequences for its children depending
    #on that sequence
    def selectNucleos(self, algo):

        if self.leftChild.isLeaf() and self.rightChild.isLeaf():  #we go through the code if only one of the children is a leaf
            return

        for familyIndex in range(self.getNbStructs()):
            #print("selectNucleos: working on familyIndex # " + str(familyIndex))

            #print("There are " + str(self.getNbSeqs(familyIndex)) + " optimal sequences at the parent node")
            for parentSequence in self.getSequencesListsSet(familyIndex):  #for every optimal sequence
                #if seqIndex % 100 == 0:
                    #print("### selectNucleos: working on sequence # " + str(seqIndex))
                
                sequenceLeftList = [[" " for x in range(self.getSeqLength())]]  #using list representation of strings here, so we can modify easily characters
                sequenceRightList = [[" " for x in range(self.getSeqLength())]]

                #parentSequence = self.getSequence(familyIndex, seqIndex)  #now going through all the optimal sequences  --> now parentSequence is in the loop

                for pos in range(self.getSeqLength()):
                    #print("selectNucleos: working on position # " + str(pos))

                    nucleo = parentSequence[pos]

                    if algo == SpeciesTree.ALGO.Fitch or algo == SpeciesTree.ALGO.Sankoff:

                        if algo == SpeciesTree.ALGO.Fitch:
                            leftAndRightChildNucleos = self.computeFitch(familyIndex, pos, nucleo)

                        elif algo == SpeciesTree.ALGO.Sankoff:
                            leftAndRightChildNucleos = self.computeSankoff(familyIndex, pos, nucleo)

                        #print(leftAndRightChildNucleos)
                        SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, leftAndRightChildNucleos[0])
                        SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, leftAndRightChildNucleos[1])

                    elif algo == SpeciesTree.ALGO.SeqAndStruct or algo == SpeciesTree.ALGO.TwoStructsApprox:
                        posPair = self.getPairedPos(familyIndex, pos)

                        if algo == SpeciesTree.ALGO.SeqAndStruct:
                            method = self.computeSeqAndStruct
                        else:
                            method = self.computeSeqAndStruct_2structsApprox
                        
                        if posPair == -1:  #unpaired
                            leftAndRightChildNucleos = method(familyIndex, pos, nucleo)
                            SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, leftAndRightChildNucleos[0])
                            SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, leftAndRightChildNucleos[1])
                        elif pos < posPair:  #paired in the 2D structure, and we did not consider the pos already
                            leftAndRightChildNucleos = method(familyIndex, pos, nucleo + parentSequence[posPair])
                            #print(nucleo + parentSequence[posPair])
                            #print("leftAndRightChildNucleos = " + str(leftAndRightChildNucleos))
                            SpeciesNode.addNucleoToSequencesList(sequenceLeftList, pos, [nuc[0] for nuc in leftAndRightChildNucleos[0]])  #A list of di-nucleotides is returned:
                            SpeciesNode.addNucleoToSequencesList(sequenceRightList, pos, [nuc[0] for nuc in leftAndRightChildNucleos[1]])  # first position in the list corresponds to the  
                            SpeciesNode.addNucleoToSequencesList(sequenceLeftList, posPair, [nuc[1] for nuc in leftAndRightChildNucleos[0]])   # left child and second position
                            SpeciesNode.addNucleoToSequencesList(sequenceRightList, posPair, [nuc[1] for nuc in leftAndRightChildNucleos[1]])  # corresponds to right child
                        else:
                            continue

                #If one of the children nodes is a leaf, we don't add the sequence for this node
                if not self.leftChild.isLeaf():
                    for seq in sequenceLeftList:
                        self.leftChild.addSequence(familyIndex, "".join(seq))  #using join to transform the sequence back into a string
                if not self.rightChild.isLeaf():
                    for seq in sequenceRightList:
                        self.rightChild.addSequence(familyIndex, "".join(seq))  #using join to transform the sequence back into a string

        #Calling on the children nodes
        self.leftChild.selectNucleos(algo)
        self.rightChild.selectNucleos(algo)
        
            

    ### The different algorithms

    #The Fitch algorithm to get the costs for one position
    #If chosenNucleo is specified, then the method will return a list of all left and right children nucleotides that gave rise to this nucleotide in the current node (for top-down step)
    def computeFitch(self, familyIndex, position, chosenNucleo = None, costMatrix = None):

        if costMatrix == None:  #we use Fitch
            costMatrix = SpeciesNode.FITCH_MATRIX
        
        for nucleo in SpeciesNode.NUCLEOS:  #for every nucleotide, we want to find the minimum cost
            #print("chosenNucleo = " + str(chosenNucleo) + "  and nucleo = " + nucleo)
            if chosenNucleo is not None and nucleo != chosenNucleo:  #only for top-down step; we want to do calculation only for specified nucleo
                continue
            
            minLeft = float("inf")  #Initializing
            minRight = float("inf")
            minNucLeft = []  #for top-down step only -> now a list of all optimal choices
            minNucRight = []  #for top-down step only -> now a list of all optimal choices

            for nucleoChild in SpeciesNode.NUCLEOS:  #for every possible nucleotide in the left and right children

                costLeft = self.leftChild.getCostOf(familyIndex, position, nucleoChild) + costMatrix[self.nucToInd(nucleoChild)][self.nucToInd(nucleo)]

                if costLeft == minLeft:
                    minNucLeft.append(nucleoChild)  #appending
                elif costLeft < minLeft:
                    minLeft = costLeft
                    minNucLeft = [nucleoChild]  #starting a new list

                costRight = self.rightChild.getCostOf(familyIndex, position, nucleoChild) + costMatrix[self.nucToInd(nucleoChild)][self.nucToInd(nucleo)]

                if costRight == minRight:
                    minNucRight.append(nucleoChild)  #appending
                elif costRight < minRight:
                    minRight = costRight
                    minNucRight = [nucleoChild]  #starting a new list

            if chosenNucleo is not None:
                #print("HOLA!!!")
                #print("\n##IN FITCH## node id = " + str(self.id) + "  position = " + str(position) +  "  chosenNucleo = " + chosenNucleo)
                #print(minNucLeft)
                #print(minNucRight)
                return [minNucLeft, minNucRight]  #returning a list containing the list of optimal nucleos for the left child followed by the list of nucleos for the right child
            
            self.updatePosToNucleoCosts(familyIndex, position, nucleo, (minLeft + minRight))  #updating the minimum cost for nucleotide nucleo
                    
        
    #The Sankoff algorithm, similar to Fitch, but with a different substitution matrix
    #If chosenNucleo is specified, then the method will return the left and right children nucleotides that gave rise to this nucleotide in the current node (for top-down step)
    def computeSankoff(self, familyIndex, position, chosenNucleo = None):

        #The Sankoff cost matrix:
        costMatrix = SpeciesNode.SANKOFF_MATRIX

        #Just calling the Fitch method with the Sankoff cost matrix
        return self.computeFitch(familyIndex, position, chosenNucleo, costMatrix)


    #The Sankoff algorithm for nucleotide substitutions + the 2D structure change cost (considering only 1 structure for each sequence here)
    #chosenNucleo here can be a list of 2 nucleos (1st corresponds to position, 2nd corresponds to posPair)
    def computeSeqAndStruct(self, familyIndex, position, chosenNucleo = None):

        costMatrix = SpeciesNode.SANKOFF_MATRIX  #Using the Sankoff matrix for nucleotide substitution costs
        bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE

        minNucsList = []  #A list that will contain in order: left and right minimum di-nucleotides

        posPair = self.getPairedPos(familyIndex, position)  #Checking if position is paired with another position posPair in the 2D structure

        if posPair == -1:  #unpaired in the 2D structure
            return self.computeSankoff(familyIndex, position, chosenNucleo)  #in this case chosenNucleo should be only one nucleo (not a list)

        elif chosenNucleo is None and self.getCostOf(familyIndex, position, 'AA') < float("inf"):  #if the cost has been calculated already because of the base pair
            #print("Position already considered because of the base pair and chosenNucleo = " + str(chosenNucleo))
            return

        else:  #position not considered yet
            
            for nucleosBP in SpeciesNode.DINUCLEOS:  #for every possible di-nucleotide

                if chosenNucleo is not None and nucleosBP != chosenNucleo:
                    continue
                
                ### Calculating the minimum total cost to get nucleosBP ###
                minLeft = float("inf")  #Initializing
                minRight = float("inf")
                minNucleosLeft = []  #for top-down step only -> now a list of all optimal choices
                minNucleosRight = []  #for top-down step only -> now a list of all optimal choices

                for nucleosChild in SpeciesNode.DINUCLEOS:  #for every possible nucleotide base pair in the left and right children

                    costLeft = self.leftChild.getCostOf(familyIndex, position, nucleosChild) + costMatrix[self.nucToInd(nucleosChild[0])][self.nucToInd(nucleosBP[0])] + costMatrix[self.nucToInd(nucleosChild[1])][self.nucToInd(nucleosBP[1])] + bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])]  #adding up the cost of both substitutions and bp cost

                    if costLeft == minLeft:
                        minNucleosLeft.append(nucleosChild)  #appending
                    elif costLeft < minLeft:
                        minLeft = costLeft
                        minNucleosLeft = [nucleosChild]  #starting a new list

                    costRight = self.rightChild.getCostOf(familyIndex, position, nucleosChild) + costMatrix[self.nucToInd(nucleosChild[0])][self.nucToInd(nucleosBP[0])] + costMatrix[self.nucToInd(nucleosChild[1])][self.nucToInd(nucleosBP[1])] + bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])]  #adding up the cost of both substitutions and bp cost

                    if costRight == minRight:
                        minNucleosRight.append(nucleosChild)  #appending
                    if costRight < minRight:
                        minRight = costRight
                        minNucleosRight = [nucleosChild]  #starting a new list

                if chosenNucleo is not None:
                    return [minNucleosLeft, minNucleosRight]        
                    
                seqAndStructTotalCost = minLeft + minRight #+ bpMatrix[self.nucToInd(nucleosBP[0])][self.nucToInd(nucleosBP[1])] --> before I added the bp cost at the end only (no bueno)

                #Adding the cost in posToNucleoCosts for the two positions, with the di-nucleotide as the key in the dict (first nucleo is the nucleo of the position
                # and second nucleo is always the nucleo at the base pair position)
                self.updatePosToNucleoCosts(familyIndex, position, nucleosBP, seqAndStructTotalCost)
                self.updatePosToNucleoCosts(familyIndex, posPair, nucleosBP[1] + nucleosBP[0], seqAndStructTotalCost)  #reverse order of the di-nucleotides
                        
                    
        
    #The Sankoff algorithm for nucleotide substitutions + the 2D structure change cost (considering the 2 structures, but not the full cycles)
    #chosenNucleo here can be a list of 2 nucleos (1st corresponds to position, 2nd corresponds to posPair)
    def computeSeqAndStruct_2structsApprox(self, familyIndex, position, chosenNucleo = None):

        costMatrix = SpeciesNode.SANKOFF_MATRIX  #Using the Sankoff matrix for nucleotide substitution costs
        bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE

        minNucsList = []  #A list that will contain in order: left and right minimum di-nucleotides

        posPair = self.getPairedPos(familyIndex, position)  #Checking if position is paired with another position posPair in the 2D structure

        if posPair == -1:  #unpaired in the 2D structure
            nucleosOrDinucleos = SpeciesNode.NUCLEOS  #will be used in the for loop
        else:
            nucleosOrDinucleos = SpeciesNode.DINUCLEOS
            if chosenNucleo is None and self.getCostOf(familyIndex, position, 'AA') < float("inf"):  #if the cost has been calculated already because of the base pair
                #print("Position already considered because of the base pair and chosenNucleo = " + str(chosenNucleo))
                return

        for nucleo_s in nucleosOrDinucleos:  #for every possible nucleo or dinucleo (dependending of the paired state) (called nucleo_s because it's either 1 or 2 nucleo(s))

            if chosenNucleo is not None and nucleo_s != chosenNucleo:
                continue

            ### Calculating the minimum total cost to get nucleo_s ###
            minLeft = float("inf")  #Initializing
            minRight = float("inf")
            minNucleosLeft = []  #for top-down step only -> now a list of all optimal choices
            minNucleosRight = []  #for top-down step only -> now a list of all optimal choices

            for nucleo_sChild in nucleosOrDinucleos:  #for every possible nucleotide or nucleotide base pair in the left and right children

                costLeft = self.leftChild.find2StructsApproxCost(familyIndex, position, posPair, nucleo_sChild, nucleo_s)

                if costLeft == minLeft:
                    minNucleosLeft.append(nucleo_sChild)  #appending
                elif costLeft < minLeft:
                    minLeft = costLeft
                    minNucleosLeft = [nucleo_sChild]  #starting a new list

                costRight = self.rightChild.find2StructsApproxCost(familyIndex, position, posPair, nucleo_sChild, nucleo_s)

                if costRight == minRight:
                    minNucleosRight.append(nucleo_sChild)  #appending
                if costRight < minRight:
                    minRight = costRight
                    minNucleosRight = [nucleo_sChild]  #starting a new list

            if chosenNucleo is not None:
                return [minNucleosLeft, minNucleosRight]        

            seqAndStructTotalCost = minLeft + minRight

            #Adding the cost in posToNucleoCosts for the two positions, with the di-nucleotide as the key in the dict (first nucleo is the nucleo of the position
            # and second nucleo is always the nucleo at the base pair position)
            # If this position is not paired, we do only the first update
            self.updatePosToNucleoCosts(familyIndex, position, nucleo_s, seqAndStructTotalCost)
            if posPair != -1:
                self.updatePosToNucleoCosts(familyIndex, posPair, nucleo_s[1] + nucleo_s[0], seqAndStructTotalCost)  #reverse order of the di-nucleotides



    #Calculating the cost of going from nucleo_sChild to nucleo_s; **this method is called on the child node**.
    #Here we consider the 2 structures in the cost, but not cycles.
    def find2StructsApproxCost(self, familyIndex, position, posPair, nucleo_sChild, nucleo_s):

        subMatrix = SpeciesNode.SANKOFF_MATRIX
        bpMatrix = SpeciesNode.BASEPAIR_MATRIX_SIMPLE  #for now, using a very simple matrix
        T = self.parent.getT()  #getting the time parameter
        ###T = 0.5 ###THIS IS JUST A TEST!!
        
        costCurrentStruct = self.getCostOf(familyIndex, position, nucleo_sChild)  #intializing with the cost of nucleo_sChild of this node (which is a child node)
        costOtherStruct = 0

        #First dealing with the current structure
        if posPair == -1:  #unpaired in current struct
            costCurrentStruct += subMatrix[self.nucToInd(nucleo_sChild)][self.nucToInd(nucleo_s)]

        else:  #paired
            costCurrentStruct += subMatrix[self.nucToInd(nucleo_sChild[0])][self.nucToInd(nucleo_s[0])] + subMatrix[self.nucToInd(nucleo_sChild[1])][self.nucToInd(nucleo_s[1])] + T * bpMatrix[self.nucToInd(nucleo_s[0])][self.nucToInd(nucleo_s[1])]  #T multiplies the bpCost ONLY

        #Second, we deal with the base pairs of the second structure directly connected to position and posPair
        if familyIndex == 0: otherFamily = 1
        else: otherFamily = 0
            
        #Whether position is paired or unpaired, we always have to check for a paired position for 'position' in the other structure
        otherFamilyPosPair = self.getPairedPos(otherFamily, position)
        if otherFamilyPosPair != -1:  #otherwise, if == -1, there is nothing to calculate (costOtherStruct stays the same)

            costSum = 0
            for nuc in SpeciesNode.NUCLEOS:  #we're not setting the nucleo at position: otherFamilyPosPair, so we take into account all possible nucleotides
                #not counting substitutions for otherFamilyPosPair for now:
                #costSum += bpMatrix[self.nucToInd(nucleo_sChild[0])][self.nucToInd(nuc)]  #we can always access nucleo_sChild[0], whether 'position' is paired or not
                #October 22nd 2015: Pretty sure that the line above had a bug. We need to look at all possible basepairs with the proposed nucleotide, not the child one
                costSum += bpMatrix[self.nucToInd(nucleo_s[0])][self.nucToInd(nuc)]  #we can always access nucleo_sChild[0], whether 'position' is paired or not

            costOtherStruct += costSum / 4  #adding the average cost for the 4 different nucleos possible

        if posPair != -1:  #paired in the current struct, so we have to consider 'posPair' in the other structure
            
            #checking for posPair
            otherFamilyPosPair = self.getPairedPos(otherFamily, posPair)
            if otherFamilyPosPair != -1:  #otherwise, if == -1, there is nothing to calculate (costOtherStruct stays the same)

                costSum = 0
                for nuc in SpeciesNode.NUCLEOS:  #we're not setting the nucleo at position: otherFamilyPosPair, so we take into account all possible nucleotides
                    #not counting substitutions for otherFamilyPosPair for now:
                    #costSum += bpMatrix[self.nucToInd(nucleo_sChild[1])][self.nucToInd(nuc)]  #here it is safe to access nucleo_sChild[1]
                    #October 22nd 2015: Pretty sure that the line above had a bug. We need to look at all possible basepairs with the proposed nucleotide, not the child one
                    costSum += bpMatrix[self.nucToInd(nucleo_s[1])][self.nucToInd(nuc)]  #here it is safe to access nucleo_s[1]

                costOtherStruct += costSum / 4  #adding the average cost for the 4 different nucleos possible

        

        return costCurrentStruct + (1-T) * costOtherStruct  #(1-T) multiplies costOtherStruct, only because it contains only bp costs, as of now


    #The exact method, using MutatedSequence objects, to enumerate all possibles mutated sequences for each internal node and their transition cost
    #This method is called on every internal node, nbMut is the maximum number of mutations allowed on every branch
    def enumerateMutSeqs(self, familyIndex, nbMut):

        leftMutSeqsList = self.leftChild.getListMutSeqs(familyIndex)
        rightMutSeqsList = self.rightChild.getListMutSeqs(familyIndex)

        setMutSeqsLeftBranch = set()
        for mutSeq in leftMutSeqsList:  #working on the left branch
            setMutSeqsLeftBranch.update(mutSeq.getSetReachableMutSeqsStrings(nbMut))

        setMutSeqsRightBranch = set()
        for mutSeq in rightMutSeqsList:  #working on the right branch
            setMutSeqsRightBranch.update(mutSeq.getSetReachableMutSeqsStrings(nbMut))

        intersectionSet = setMutSeqsLeftBranch & setMutSeqsRightBranch  #finding the intersection

        #test#
        #print("Intersection of node " + str(self.id) + " : " + str(intersectionSet))

        if len(intersectionSet) == 0:
            print("ERROR: Can't reconstruct the history with the Enumerate algorithm because the number of mutations allowed per branch is too low for this tree.")
            sys.exit()

        #Calculating the costs (left + right branches) for each sequence (string) in the intersection set
        T = self.getT()
        currentStruct = self.getStruct(familyIndex)
        if familyIndex == 0:  #considering there are only 2 families
            otherFamilyIndex = 1
        else:
            otherFamilyIndex = 0
        otherStruct = self.getStruct(otherFamilyIndex)
        
        for seqString in intersectionSet:

            minCostLeft = float("inf")  #minimum cost on the left branch
            minMutSeqLeft = None  #considering only one possible optimal child for now
            for mutSeq in leftMutSeqsList:
                cost = mutSeq.getCostOfMutSeq(seqString, currentStruct, otherStruct, T)
                if cost < minCostLeft:
                    minCostLeft = cost
                    minMutSeqLeft = mutSeq

            
            minCostRight = float("inf")  #minimum cost on the right branch
            minMutSeqRight = None  #considering only one possible optimal child for now
            for mutSeq in rightMutSeqsList:
                cost = mutSeq.getCostOfMutSeq(seqString, currentStruct, otherStruct, T)
                if cost < minCostRight:
                    minCostRight = cost
                    minMutSeqRight = mutSeq

            #Once we have the minimums on each branch, we can create the MutatedSequence object for the internal node
            intNodeMutSeq = MutatedSequence(seqString, minMutSeqLeft, minMutSeqRight, minCostLeft + minCostRight)
            self.addToListMutatedSequences(familyIndex, intNodeMutSeq)  #adding the object to the list
            #print("For node " + str(self.id) + ", adding mutseq: " + intNodeMutSeq.sequence + "(" + "cost=" + str(intNodeMutSeq.cost))
            
        self.shortenListMutatedSequences(familyIndex)  #keeping only the best candidates
        

#-----------------------------------------------------------------------------------------------------------------------------------------


###############
## Main code ##
###############

###
### Methods
###


#RNAfold stuff
RNAFOLD = 'RNAfold'

R = 0.0019872041 
T = 310.15

def weight_in_ensemble(seq, ss):


    def frequency_ensemble(weight, en_ensemble):
        return exp((-en_ensemble+weight)/(R*T)) 

    def energy_freq(out_rnafold):

        out_rnafold = out_rnafold.split(b'\n')
        en = out_rnafold[2]
        en = float(en[en.rfind(b'(') + 1:en.rfind(b')')])
        en_ensemble = float(re.match(b'[^\d]+(-\d+\.\d+)[^\d]+', out_rnafold[3]).groups()[0])

        return en, en_ensemble

    tmp_file = NTF(dir='.', delete=False)
    tmp_file.write(bytes('>\n%s' % seq, 'UTF-8'))
    tmp_file.close()

    try:
        mfe = check_output([RNAFOLD, '--noPS', '-p0'], stdin=open(tmp_file.name, "r"))
    except:
        print("WTF")
        print(seq)
        print(ss)
        sys.exit(1)
    finally:
        os.remove(tmp_file.name)

    tmp_file = NTF(dir='.', delete=False)
    tmp_file.write(bytes('>\n%s\n%s' % (seq, ss.replace('.', 'x')), 'UTF-8'))
    tmp_file.close()

    try:
        const = check_output([RNAFOLD, '--noPS', '-p0', '-C'], stdin=open(tmp_file.name, "r"))
    except:
        print("WTF")
        print(seq)
        print(ss)
        sys.exit(1)
    finally:
        os.remove(tmp_file.name)

    _, mfe_ensemble = energy_freq(mfe)
    constr_en, _ =  energy_freq(const)

    return frequency_ensemble(mfe_ensemble, constr_en)


#To create a SpeciesTree, for testing purposes
def createATree():

    #speciesTree = SpeciesTree(["(())", "()()"])
    #root = SpeciesNode(None, None, None, speciesTree)
    #speciesTree.root = root

    #root.leftChild = SpeciesNode(root, None, None, speciesTree)
    #root.leftChild.addSequence(0, "ACGT")
    #root.leftChild.addSequence(1, "ATCG")
    #root.rightChild = SpeciesNode(root, None, None, speciesTree)
    #root.rightChild.addSequence(0, "AATT")
    #root.rightChild.addSequence(1, "ATCC")


    #speciesTree = SpeciesTree(["(())", "()()"])
    speciesTree = SpeciesTree([[3, 2, 1, 0], [1, 0, 3, 2]])
    listNodes = []

    for x in range(7):
        listNodes.append(SpeciesNode(None, None, None, speciesTree))

    speciesTree.root = listNodes[0]

    listNodes[0].addSequence(0, "GAUU")
    listNodes[0].addSequence(1, "CGCG")

    listNodes[0].leftChild = listNodes[1]
    listNodes[0].rightChild = listNodes[2]
    listNodes[1].parent = listNodes[0]
    listNodes[2].parent = listNodes[0]
    listNodes[1].addSequence(0, "GAUU")
    listNodes[1].addSequence(1, "CGCG")
    listNodes[2].addSequence(0, "GAUU")
    listNodes[2].addSequence(1, "CGCG")

    listNodes[1].leftChild = listNodes[3]
    listNodes[1].rightChild = listNodes[4]
    listNodes[3].parent = listNodes[1]
    listNodes[4].parent = listNodes[1]
    listNodes[3].addSequence(0, "AAUU")
    listNodes[3].addSequence(1, "CGCG")
    listNodes[4].addSequence(0, "AAAU")
    listNodes[4].addSequence(1, "CGCG")

    listNodes[2].leftChild = listNodes[5]
    listNodes[2].rightChild = listNodes[6]
    listNodes[5].parent = listNodes[2]
    listNodes[6].parent = listNodes[2]
    listNodes[5].addSequence(0, "UGUU")
    listNodes[5].addSequence(1, "CCCG")
    listNodes[6].addSequence(0, "GAUU")
    listNodes[6].addSequence(1, "CGCG")
    

    return speciesTree


#To read a string representing a newick tree with multiple sequences separated by a hyphen (-) and return a SpeciesTree.
#Each sequence represents a family (the sequences must be in the same order as the order of the structures in structureList)
def getSpeciesTreeFromNewick(newickString, structuresList):

    phyloTree = Phylo.read(StringIO(newickString), "newick")

    #print(phyloTree)
    #print(phyloTree.root)

    #Reinitializing SpeciesNode.lastId
    SpeciesNode.lastId = 1

    speciesTree = SpeciesTree(structuresList)
    speciesTree.root = SpeciesNode(None, None, None, speciesTree)
    
    #Calling the recursive method on the root
    getSpeciesTreeFromNewickRec(speciesTree.root, phyloTree.root, speciesTree)
    
    return speciesTree


#Recursive method to build the SpeciesTree from the newick tree
def getSpeciesTreeFromNewickRec(speciesNodeInternal, phyloNodeInternal, speciesTree):

    #First adding the sequences in the SpeciesNodeInternal
    sequences = phyloNodeInternal.name.split("-")
    for familyIndex in range(len(sequences)):
        speciesNodeInternal.addSequence(familyIndex, sequences[familyIndex])

    #Then creating the SpeciesNode objects representing the children nodes
    counter = 0
    for child in phyloNodeInternal:
        counter += 1
        if counter == 1:
            leftSpeciesNode = SpeciesNode(speciesNodeInternal, None, None, speciesTree)  #Specifying the parent in the constructor
            speciesNodeInternal.leftChild = leftSpeciesNode #Creating the parent-child link
            phyloLeftChild = child
        elif counter == 2:
            rightSpeciesNode = SpeciesNode(speciesNodeInternal, None, None, speciesTree)  #Specifying the parent in the constructor
            speciesNodeInternal.rightChild = rightSpeciesNode #Creating the parent-child link
            phyloRightChild = child
        else: #More than 2 children??!!
            print("---> BIG PROBLEM: newick tree is not binary.")

    if counter == 2:  #Should be 0 or 2. If counter == 0, it means that we're working on a leaf, and there is nothing more to do
        #Calling the recursive method on the left and right children nodes
        getSpeciesTreeFromNewickRec(leftSpeciesNode, phyloLeftChild, speciesTree)
        getSpeciesTreeFromNewickRec(rightSpeciesNode, phyloRightChild, speciesTree)


#Reading a file containing simulated trees and returns a list of SpeciesTree objects
def readFile(filename):

    file = open(filename, 'r')

    structuresRead = False
    structuresParenthesisList = []  #contains the parenthesis representations of the structures
    structuresPairedPosList = []  #The list of pairedPos lists representing the structures
    speciesTreesList = []  #the list of SpeciesTree objects that will be returned
    
    for line in file:

        if line[0] == "#":  #reading a tree index, we don't really need to store this information
            structuresRead = True  #we're done reading structures
            if not structuresPairedPosList:
                for structure in structuresParenthesisList:
                    structuresPairedPosList.append(parenthesisToPairedPos(structure))

                #print(structuresPairedPosList)
            
        elif not structuresRead:
            structuresParenthesisList.append(line.strip())

        else: #reading a newick string
            speciesTreesList.append(getSpeciesTreeFromNewick(line.strip(), structuresPairedPosList))

    return speciesTreesList
            
                
#To translate a parenthesis representation of a structure into a list of paired positions
def parenthesisToPairedPos(parenthesisString):

    pairedPosList = [-1] * len(parenthesisString)  #initializing the list with -1 everywhere

    openParenthesisStack = deque()  #stack that will be used to put the positions of open parentheses, will be poped when we read a closing parenthesis
    for pos in range(len(parenthesisString)):
        
        char = parenthesisString[pos]

        if char == '(':
            openParenthesisStack.append(pos)

        elif char == ')':
            posPair = openParenthesisStack.pop()
            pairedPosList[pos] = posPair
            pairedPosList[posPair] = pos

    return pairedPosList


###
### Main
###
if __name__ == '__main__':

    #treeSolutions = createATree()
    #treeSolutions.printTree()
    #tree = treeSolutions.cloneWithoutAncSeqs()
    #tree.setDepths()  #important to set the depths of every node
    #tree.printTree()

    #print("\nAfter computeFitch:\n")
    #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Fitch)
    #tree.printTree()

    #print("\nAfter computeSankoff:\n")
    #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Sankoff)
    #tree.printTree()
    
    #print("\nAfter computeSeqAndStruct:\n")
    #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.SeqAndStruct)
    #tree.printTree()

    #print("\nAfter TwoStructsApprox:\n")
    #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.TwoStructsApprox)
    #tree.printTree()

    #print("\nAfter Enumerate:\n")
    #tree.reconstructAncestralSeqs(SpeciesTree.ALGO.Enumerate)
    #tree.printTree()

    #SpeciesTree.compareTrees(treeSolutions, tree)


    ###Testing newick tree
    #newickSpeciesTreeTest = getSpeciesTreeFromNewick("((4.AAUU-CGCG,5.AAAU-CGCG)2.GAUU-CGCG,(6.UGUU-CCCG,7.GAUU-CGCG)3.GAUU-CGCG)1.GAUU-CGCG;", [[3, 2, 1, 0], [1, 0, 3, 2]])
    #newickSpeciesTreeTest.printTree()

    #newickTreeNoSolutions = newickSpeciesTreeTest.cloneWithoutAncSeqs()
    #newickTreeNoSolutions.setDepths()  #don't forget to call setDepths before reconstructing the ancestral sequences
    #newickTreeNoSolutions.printTree()

    ###Test read file
    #print("\nTEST READ FILE\n")
    #speciesTreesList = readFile("simulatedTrees.txt")


    ####################
    ## The real thing ##
    ####################

    if len(sys.argv) < 3:
        print("\n-USAGE: python fitchAndCompany.py  algoNumber  simulFile  [maxNbMutations]  [maxListSize]")
        print("\nalgoNumber:  1=Fitch  2=Sankoff  3=Sankoff + one structure  4=Sankoff + two structures  5=Enumerate")
        print("simulFile: the file containing the 2 structures and a list of simulated trees in Newick format.")
        print("maxNbMutations: optional argument for Enumerate only. The maximum number of mutations that can be inferred on one branch. (DEFAULT=1)")
        print("maxListSize: optional argument for Enumerate only. The maximum number best candidate sequences kept at each internal node during the bottom-up step. (DEFAULT=1000)")
        sys.exit()

    algoNumber = int(sys.argv[1])
    if algoNumber == 1:
        algo = SpeciesTree.ALGO.Fitch
    elif algoNumber == 2:
        algo = SpeciesTree.ALGO.Sankoff
    elif algoNumber == 3:
        algo = SpeciesTree.ALGO.SeqAndStruct
    elif algoNumber == 4:
        algo = SpeciesTree.ALGO.TwoStructsApprox
    elif algoNumber == 5:
        algo = SpeciesTree.ALGO.Enumerate
    else:
        print("\nError: invalid algoNumber.")
        sys.exit()

    startTime = time.time()  #to measure time of computation

    speciesTreesList = readFile(sys.argv[2])  #reading the simulFile argument

    if len(sys.argv) >= 4:  #maxNbMutations specified
        SpeciesTree.NB_MUTATIONS = int(sys.argv[3])

    if len(sys.argv) >= 5:  #maxListSize specified (note that to specify a maxListSize, you need to specify maxNbMutations before)
        SpeciesNode.LIST_MUT_SEQS_MAX_SIZE = int(sys.argv[4])
    
    counter = 1
    for simulTree in speciesTreesList:
        print("\nTree # " + str(counter))
        #simulTree.printTree()
        #sys.exit()
        
        inferenceTree = simulTree.cloneWithoutAncSeqs()
        inferenceTree.setDepths()  #necessary

        inferenceTree.reconstructAncestralSeqs(algo)
        inferenceTree.printTree()  #--> useful to look at the solutions for every node of the tree
        totalNumberOfErrors, totalNumberOfNucleos, totalNumberOfOptimalSeqs, averageNumberOfErrors, totalNumberOfNucleos_withStruct, totalNumberOfErrors_withStruct, totalNumberOfNucleos_withoutStruct, totalNumberOfErrors_withoutStruct, averageNumberOfErrors_withStruct, averageNumberOfErrors_withoutStruct, numberOfNodesInTheTree, totalNumberOfErrors_root, totalNumberOfOptimalSeqs_root, averageNumberOfErrors_root, averageNumberOfErrors_root_withoutStruct, averageNumberOfErrors_root_withStruct = SpeciesTree.compareTrees(simulTree, inferenceTree)

        for i in range(len(totalNumberOfErrors)):
            if i == 0:
                print("\nFAMILY 0 ONLY:\n")

            elif i == 1:
                print("\nFAMILY 1 ONLY:\n")

            elif i == 2:
                print("\nALL FAMILIES (STRUCTS):\n")
                

            ###TOTAL
            print("Total: " + str(totalNumberOfErrors[i]) + " errors / " + str(totalNumberOfNucleos[i]) + " nucleotides (" + str(totalNumberOfErrors[i]/totalNumberOfNucleos[i]*100) + " % error)")
            print("Total for unstructured positions: " + str(totalNumberOfErrors_withoutStruct[i]) + " errors / " + str(totalNumberOfNucleos_withoutStruct[i]) + " nucleotides (" + str(totalNumberOfErrors_withoutStruct[i]/totalNumberOfNucleos_withoutStruct[i]*100) + " % error)")

            print("Total for structured positions: " + str(totalNumberOfErrors_withStruct[i]) + " errors / " + str(totalNumberOfNucleos_withStruct[i]) + " nucleotides (" + str(totalNumberOfErrors_withStruct[i]/totalNumberOfNucleos_withStruct[i]*100) + " % error)")

            ###AVERAGE
            print("Average (over " + str(totalNumberOfOptimalSeqs[i]) + " optimal sequences): " + str(averageNumberOfErrors[i]) + " aver. errors per optimal sequence of length = " + str(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i]) + " nucleotides (" + str(averageNumberOfErrors[i]/(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i])*100) + " % error)")

            print("Average for unstructured positions: " + str(averageNumberOfErrors_withoutStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_withoutStruct[i]/(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)")
            
            print("Average for structured positions: " + str(averageNumberOfErrors_withStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_withStruct[i]/(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)")

            ###ROOT
            print("-For the root only:")
            print("Total: " + str(totalNumberOfErrors_root[i]) + " errors")
            print("Average (over " + str(totalNumberOfOptimalSeqs_root[i]) + " optimal sequences): " + str(averageNumberOfErrors_root[i]) + " aver. errors per optimal sequence of length = " + str(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i]) + " nucleotides (" + str(averageNumberOfErrors_root[i]/(totalNumberOfNucleos[i]/numberOfNodesInTheTree[i])*100) + " % error)")

            print("Average for unstructured positions: " + str(averageNumberOfErrors_root_withoutStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_root_withoutStruct[i]/(totalNumberOfNucleos_withoutStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)")
            
            print("Average for structured positions: " + str(averageNumberOfErrors_root_withStruct[i]) + " aver. errors per optimal sequence, considering " + str(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i]) + " nucleotides per seq. (" + str(averageNumberOfErrors_root_withStruct[i]/(totalNumberOfNucleos_withStruct[i]/numberOfNodesInTheTree[i])*100) + " % error)")

        elapsedTime = time.time() - startTime
        print("\n### Computation took " + str(elapsedTime) + " seconds.")