Commit 22cd100f by Roman Sarrazin-Gendron

### Improved accuracy on new sequences; alleviated overfitting to sequence distance

parent c9d41e20
 ... @@ -586,6 +586,10 @@ def seq_to_struct( ... @@ -586,6 +586,10 @@ def seq_to_struct( g_distance=[] g_distance=[] h_distance = [] h_distance = [] j_distance = [] j_distance = [] i_distance = [] k_distance = [] l_distance = [] m_distance = [] for junc in component_distance: for junc in component_distance: a=junc[0] a=junc[0] b=junc[1] b=junc[1] ... @@ -595,6 +599,7 @@ def seq_to_struct( ... @@ -595,6 +599,7 @@ def seq_to_struct( f= max(int((a+b)/6),c+0) f= max(int((a+b)/6),c+0) h= int(1.5*b) h= int(1.5*b) j = int(1.2*b) j = int(1.2*b) k = max(a, len(seq)) reduced_component_distance.append([a,b]) reduced_component_distance.append([a,b]) halfway_component_distance.append([a,d]) halfway_component_distance.append([a,d]) ... @@ -603,8 +608,10 @@ def seq_to_struct( ... @@ -603,8 +608,10 @@ def seq_to_struct( g_distance.append([a,h]) g_distance.append([a,h]) h_distance.append([3,h]) h_distance.append([3,h]) j_distance.append([a,j]) j_distance.append([a,j]) h_distance.append([3,j]) i_distance.append([3,j]) l_distance.append([a,c]) k_distance.append([a,b]) m_distance.append([a,k]) permutations = get_permutations(position_subsets,component_distance,iii) permutations = get_permutations(position_subsets,component_distance,iii) regex_list = [] regex_list = [] ... @@ -620,6 +627,10 @@ def seq_to_struct( ... 
@@ -620,6 +627,10 @@ def seq_to_struct( res6 = build_regex(position_subsets,g_distance, node_dict, convert, positions, mc) res6 = build_regex(position_subsets,g_distance, node_dict, convert, positions, mc) res7 = build_regex(position_subsets,h_distance, node_dict, convert, positions, mc) res7 = build_regex(position_subsets,h_distance, node_dict, convert, positions, mc) res8 = build_regex(position_subsets,j_distance, node_dict, convert, positions, mc) res8 = build_regex(position_subsets,j_distance, node_dict, convert, positions, mc) res9 = build_regex(position_subsets,i_distance, node_dict, convert, positions, mc) res10 = build_regex(position_subsets,l_distance, node_dict, convert, positions, mc) res11 = build_regex(position_subsets,k_distance, node_dict, convert, positions, mc) res12 = build_regex(position_subsets,m_distance, node_dict, convert, positions, mc) regex_list.append([res,position_subsets,node_dict]) regex_list.append([res,position_subsets,node_dict]) regex_list.append([res2,position_subsets,node_dict]) regex_list.append([res2,position_subsets,node_dict]) ... @@ -629,6 +640,10 @@ def seq_to_struct( ... @@ -629,6 +640,10 @@ def seq_to_struct( regex_list.append([res6,position_subsets,node_dict]) regex_list.append([res6,position_subsets,node_dict]) regex_list.append([res7,position_subsets,node_dict]) regex_list.append([res7,position_subsets,node_dict]) regex_list.append([res8,position_subsets,node_dict]) regex_list.append([res8,position_subsets,node_dict]) regex_list.append([res9, position_subsets, node_dict]) regex_list.append([res10, position_subsets, node_dict]) regex_list.append([res11, position_subsets, node_dict]) regex_list.append([res12, position_subsets, node_dict]) #print("REGEX:",regex_list) #print("REGEX:",regex_list) output = [] output = [] final_model = {} final_model = {} ... ...
 ... @@ -135,10 +135,11 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): ... @@ -135,10 +135,11 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): seq = str(record.seq) seq = str(record.seq) seq = seq.upper() seq = seq.upper() sequences.append(seq) sequences.append(seq) print("LENGTH OF SEQUENCE:",len(seq)) #print("PARSING SEQUENCE ", id, "\n") #print("PARSING SEQUENCE ", id, "\n") if "T" in seq: if "T" in seq: seq = str(seq).replace("T", "U") seq = str(seq).replace("T", "U") if len(seq) <= 200: if len(seq) <= 300: #print("Running BayesPairing on full sequence") #print("Running BayesPairing on full sequence") maxs = run_BP(seq, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k) maxs = run_BP(seq, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k) if interm: if interm: ... @@ -249,11 +250,12 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): ... @@ -249,11 +250,12 @@ def run_fasta(input, modules_to_parse, dataset, ss="", arguments={}): fOUTPUT= fOUTPUT+"\n"+stats fOUTPUT= fOUTPUT+"\n"+stats pickle.dump(prediction_scores,open("../output/"+o+".pickle","wb")) pickle.dump(prediction_scores,open("../output/"+o+".pickle","wb")) else: else: print("LENGTH OF SEQUENCE:",len(input)) if "T" in input: if "T" in input: input = input.upper() input = input.upper() input = str(input).replace("T", "U") input = str(input).replace("T", "U") sequences = [input] sequences = [input] if len(input) <= 200: if len(input) <= 300: maxs = run_BP(input, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k,sscons) maxs = run_BP(input, ss, modules_to_parse, dataset, "NONE", m, n, sm, mc, p, k,sscons) all_maxes = [] all_maxes = [] for ind, mod in enumerate(modules_to_parse): for ind, mod in enumerate(modules_to_parse): ... ...
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!