Commit 22cd100f authored by Roman Sarrazin-Gendron
Browse files

improved accuracy on new sequences, alleviated overfitting to sequence distance

parent c9d41e20
...@@ -586,6 +586,10 @@ def seq_to_struct( ...@@ -586,6 +586,10 @@ def seq_to_struct(
g_distance=[] g_distance=[]
h_distance = [] h_distance = []
j_distance = [] j_distance = []
i_distance = []
k_distance = []
l_distance = []
m_distance = []
for junc in component_distance: for junc in component_distance:
a=junc[0] a=junc[0]
b=junc[1] b=junc[1]
...@@ -595,6 +599,7 @@ def seq_to_struct( ...@@ -595,6 +599,7 @@ def seq_to_struct(
f= max(int((a+b)/6),c+0) f= max(int((a+b)/6),c+0)
h= int(1.5*b) h= int(1.5*b)
j = int(1.2*b) j = int(1.2*b)
k = max(a, len(seq))
reduced_component_distance.append([a,b]) reduced_component_distance.append([a,b])
halfway_component_distance.append([a,d]) halfway_component_distance.append([a,d])
...@@ -603,8 +608,10 @@ def seq_to_struct( ...@@ -603,8 +608,10 @@ def seq_to_struct(
g_distance.append([a,h]) g_distance.append([a,h])
h_distance.append([3,h]) h_distance.append([3,h])
j_distance.append([a,j]) j_distance.append([a,j])
h_distance.append([3,j]) i_distance.append([3,j])
l_distance.append([a,c])
k_distance.append([a,b])
m_distance.append([a,k])
permutations = get_permutations(position_subsets,component_distance,iii) permutations = get_permutations(position_subsets,component_distance,iii)
regex_list = [] regex_list = []
...@@ -620,6 +627,10 @@ def seq_to_struct( ...@@ -620,6 +627,10 @@ def seq_to_struct(
res6 = build_regex(position_subsets,g_distance, node_dict, convert, positions, mc) res6 = build_regex(position_subsets,g_distance, node_dict, convert, positions, mc)
res7 = build_regex(position_subsets,h_distance, node_dict, convert, positions, mc) res7 = build_regex(position_subsets,h_distance, node_dict, convert, positions, mc)
res8 = build_regex(position_subsets,j_distance, node_dict, convert, positions, mc) res8 = build_regex(position_subsets,j_distance, node_dict, convert, positions, mc)
res9 = build_regex(position_subsets,i_distance, node_dict, convert, positions, mc)
res10 = build_regex(position_subsets,l_distance, node_dict, convert, positions, mc)
res11 = build_regex(position_subsets,k_distance, node_dict, convert, positions, mc)
res12 = build_regex(position_subsets,m_distance, node_dict, convert, positions, mc)
regex_list.append([res,position_subsets,node_dict]) regex_list.append([res,position_subsets,node_dict])
regex_list.append([res2,position_subsets,node_dict]) regex_list.append([res2,position_subsets,node_dict])
...@@ -629,6 +640,10 @@ def seq_to_struct( ...@@ -629,6 +640,10 @@ def seq_to_struct(
regex_list.append([res6,position_subsets,node_dict]) regex_list.append([res6,position_subsets,node_dict])
regex_list.append([res7,position_subsets,node_dict]) regex_list.append([res7,position_subsets,node_dict])
regex_list.append([res8,position_subsets,node_dict]) regex_list.append([res8,position_subsets,node_dict])
regex_list.append([res9, position_subsets, node_dict])
regex_list.append([res10, position_subsets, node_dict])
regex_list.append([res11, position_subsets, node_dict])
regex_list.append([res12, position_subsets, node_dict])
#print("REGEX:",regex_list) #print("REGEX:",regex_list)
output = [] output = []
final_model = {} final_model = {}
......
...@@ -135,10 +135,11 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): ...@@ -135,10 +135,11 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}):
seq = str(record.seq) seq = str(record.seq)
seq = seq.upper() seq = seq.upper()
sequences.append(seq) sequences.append(seq)
print("LENGTH OF SEQUENCE:",len(seq))
#print("PARSING SEQUENCE ", id, "\n") #print("PARSING SEQUENCE ", id, "\n")
if "T" in seq: if "T" in seq:
seq = str(seq).replace("T", "U") seq = str(seq).replace("T", "U")
if len(seq) <= 200: if len(seq) <= 300:
#print("Running BayesPairing on full sequence") #print("Running BayesPairing on full sequence")
maxs = run_BP(seq, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k) maxs = run_BP(seq, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k)
if interm: if interm:
...@@ -249,11 +250,12 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): ...@@ -249,11 +250,12 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}):
fOUTPUT= fOUTPUT+"\n"+stats fOUTPUT= fOUTPUT+"\n"+stats
pickle.dump(prediction_scores,open("../output/"+o+".pickle","wb")) pickle.dump(prediction_scores,open("../output/"+o+".pickle","wb"))
else: else:
print("LENGTH OF SEQUENCE:",len(input))
if "T" in input: if "T" in input:
input = input.upper() input = input.upper()
input = str(input).replace("T", "U") input = str(input).replace("T", "U")
sequences = [input] sequences = [input]
if len(input) <= 200: if len(input) <= 300:
maxs = run_BP(input, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k,sscons) maxs = run_BP(input, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k,sscons)
all_maxes = [] all_maxes = []
for ind, mod in enumerate(modules_to_parse): for ind, mod in enumerate(modules_to_parse):
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment