Commit 03c95e75 by Carlos GO

### contiguity computation

parent ede3913d
Src/contiguity.py 0 → 100644
 # Contiguity scores for RNA secondary-structure stems (dot-bracket notation).
import warnings

import numpy as np
import pandas as pd


def _count_stacks(stem):
    """Count the stacks in a dot-bracket string.

    A stack is a maximal run of at least two directly nested base pairs,
    i.e. pairs (i, j), (i + 1, j - 1), ...  Runs inside every branch of the
    string are counted, so branched (multiloop) regions are handled.

    raises: ValueError if the brackets in ``stem`` are not balanced.
    """
    open_positions = []
    partner = {}  # index of each "(" -> index of its matching ")"
    for pos, symbol in enumerate(stem):
        if symbol == "(":
            open_positions.append(pos)
        elif symbol == ")":
            if not open_positions:
                raise ValueError(
                    "unbalanced structure: ')' at index %d has no partner" % pos
                )
            partner[open_positions.pop()] = pos
    if open_positions:
        raise ValueError(
            "unbalanced structure: %d unclosed '('" % len(open_positions)
        )

    stacks = 0
    for left, right in partner.items():
        stacked_on_outer = partner.get(left - 1) == right + 1
        stacked_on_inner = partner.get(left + 1) == right - 1
        # Count each run exactly once, at its outermost pair.
        if stacked_on_inner and not stacked_on_outer:
            stacks += 1
    return stacks


def contiguity(stem):
    """
    Contiguity score of a stem given in dot-bracket notation.

    The score is log((len(stem) - n_open) / n_stacks), where n_open is the
    number of "(" characters and n_stacks is the number of stacks (runs of
    two or more directly nested base pairs) found anywhere in ``stem``.

    stem: dot-bracket string, e.g. "((...))".
    return: the score as a float; NaN when the stem contains no stack
        (including the empty string), because the ratio is then undefined.
    raises: ValueError if the brackets in ``stem`` are not balanced.
    """
    stacks = _count_stacks(stem)
    if stacks == 0:
        # Explicit "undefined" instead of a ZeroDivisionError.
        return np.nan
    return np.log((len(stem) - stem.count("(")) / stacks)


def stem_find(ss):
    """
    Locate the outermost stems of a dot-bracket structure.

    A stem starts at a "(" seen while no bracket is open and ends at the ")"
    that closes it. Indices refer to ``ss`` exactly as passed in, so they can
    be used to slice the same string.

    ss: dot-bracket string.
    return: list of (start, end) index pairs, both inclusive.

    Unbalanced input is handled on a best-effort basis: a ")" with no open
    partner is skipped, a stem left open at the end is not reported, and a
    single warning is issued.
    """
    stem_indices = []
    depth = 0  # number of currently open brackets
    stem_start = None
    stray_close = False
    for i, b in enumerate(ss):
        if b == "(":
            if depth == 0:
                stem_start = i
            depth += 1
        elif b == ")":
            if depth == 0:
                stray_close = True
                continue
            depth -= 1
            if depth == 0:
                stem_indices.append((stem_start, i))
    if depth != 0 or stray_close:
        warnings.warn(
            "unbalanced dot-bracket structure; only closed stems are reported",
            stacklevel=2,
        )
    return stem_indices


def mean_contig(ss, stems):
    """
    Mean of the weighted contiguity scores of the given stems.

    ss: dot-bracket string the stem indices refer to.
    stems: iterable of (start, end) inclusive index pairs into ``ss``.
    return: mean over stems of contiguity(ss[start:end + 1]) * (start - end);
        NaN when ``stems`` is empty.
    """
    # NOTE(review): the weight (start - end) is negative whenever start < end.
    # It may have been meant as a stem length (end - start) -- confirm before
    # changing; the original expression is kept as-is here.
    weighted = [
        contiguity(ss[start:end + 1]) * float(start - end)
        for start, end in stems
    ]
    if not weighted:
        return np.nan
    return np.mean(weighted)


def ml_contiguity():
    """
    Read ../Data/rnamuts_multiloops.csv and print the stems found in the
    first entry of its 'structure' column (later rows are not processed).
    """
    df = pd.read_csv("../Data/rnamuts_multiloops.csv")
    for ss in df['structure']:
        print(stem_find(ss))
        break


if __name__ == "__main__":
    ml_contiguity()
