Commit 650146c8 authored by Roman SARRAZIN GENDRON's avatar Roman SARRAZIN GENDRON
Browse files

fixed positions in alignment parsing

parent 88589318
......@@ -226,6 +226,11 @@ def jared(seq, module, cutoff=-111):
def parse_alignment(sequences, modules, ss, dataset, BNs, t=-5, samplesize=20000, Lambda=0.35, Theta=1, Delta=None, fuzzy=False, verbose=False, constraints = ''):
seqs = [x[0] for x in sequences]
consensuses = []
#getting the consensus of each column
for x in range(len(seqs[0])):
consensuses.append(consensus(seqs,x))
fc = Fold(seqs)
ss_mfe, mfe, fee = fc.constraint_folding()
nb = samplesize
......@@ -290,14 +295,24 @@ def parse_alignment(sequences, modules, ss, dataset, BNs, t=-5, samplesize=20000
real_pos = find_significant_columns(seqs,struct)
#skipped_pos contains the number of columns to add to each column ID in the reduced alignment.
skipped_pos = []
toAdd = 0
for i in range(0,max(real_pos)):
if i not in real_pos:
toAdd+=1
else:
skipped_pos.append(toAdd)
real_ss = "".join([struct[pos] for pos in real_pos])
real_seq = "".join([sequences[0][0][pos] for pos in real_pos])
if real_ss == "":
print("problematic struct",struct)
print("accepted columns",real_pos)
print("bps",parseRNAStructure(struct))
exit()
#print("problematic struct",struct)
#print("accepted columns",real_pos)
#print("bps",parseRNAStructure(struct))
#print("skipped",skipped_pos)
struct = real_ss
......@@ -331,10 +346,36 @@ def parse_alignment(sequences, modules, ss, dataset, BNs, t=-5, samplesize=20000
new_score=match[4]+energy_term
match[2] = new_score
#this is necessary to re-add the missing columns in the alignment so the output matches the input.
#print("attempting to insert missing columns in match", match[1])
newRes = []
for strand in match[1]:
strand2 = [x+skipped_pos[x] for x in strand]
newRes.append(strand2)
match[1] = newRes
#print("successfully updated:",match[1])
#print("updating seq, before:",match[0])
newSeq = ""
for strand in match[1]:
for x in strand:
newSeq+=consensuses[x]
match[0] = newSeq
#print("successfully updated seq, after:",match[0])
this_mod_list.append(match)
predicted_list[mod] = (sorted(this_mod_list,key=lambda k:k[2], reverse=True))
return predicted_list
def consensus(seqs, colIndex):
lst = [seq[colIndex] for seq in seqs]
cand = set(lst)
if "-" in cand and len(cand)>1:
cand.remove("-")
return max(cand, key=lst.count)
def parse_sequence(seq, modules, ss, dataset, BNs, t=-10, samplesize=20000, Lambda=0.35, Theta=1, Delta=None, fuzzy=False, verbose=False, constraints = ''):
#t=5
......
......@@ -37,7 +37,7 @@ class Fold:
#ROMAN ADDED: rescale
ss_mfe, mfe = self.fc.mfe()
print(mfe)
#print(mfe)
self.fc.exp_params_rescale(mfe)
self.fc.pf()
......
......@@ -17,11 +17,11 @@ def unpick(dataset,direc,typ):
file_path = "../"+direc+"/" + dataset + "_"+typ
if os.path.exists(file_path):
while True:
try:
nets = pickle.load(open(file_path, "rb"))
break
except:
pass
try:
nets = pickle.load(open(file_path, "rb"))
break
except:
pass
return nets
else:
print("Dataset not found!")
......@@ -656,7 +656,7 @@ if __name__ == "__main__":
if args.d != None:
dataset = args.d
else:
dataset = "3dmotifAtlas_RELIABLE"
dataset = "3dMotifAtlas_RELIABLE"
# the default dataset is rna3dmotif
# we load the modules from the dataset to get the number of modules available.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment