Commit 4bb1aec6 authored by Jerome Waldispuhl's avatar Jerome Waldispuhl
Browse files

modif

parent a37a45a9
import os,sys
import re
import csv
from Bio.SeqUtils import GC
from math import log
import matplotlib.pyplot as plt
import distance
from random import randint
# List diretory
path = "../Data/RNAmuts_mfe_csv/"
......@@ -52,7 +52,8 @@ for file in dirs:
# parse files
mystats = {'all':{10:{},30:{},50:{},70:{},90:{}},'ml':{10:{},30:{},50:{},70:{},90:{}}}
mystats = {10:{},30:{},50:{},70:{},90:{}}
seqdists = {}
for gcr in seqfiles.keys():
if not gcr in list_gcr:
......@@ -64,88 +65,105 @@ for gcr in seqfiles.keys():
for filename in seqfiles[gcr]:
print filename
idfile = len(mystats['all'][gcr])
idfile = len(mystats[gcr])
# init data structure
mystats['all'][gcr][idfile] = {}
mystats['ml'][gcr][idfile] = {}
for k in range(seqlen+1):
mystats['all'][gcr][idfile][k] = {}
mystats['ml'][gcr][idfile][k] = {}
for i in range(seqlen):
mystats['all'][gcr][idfile][k][i] = {'count':{'A':0,'C':0,'G':0,'U':0},'freq':{'A':0.,'C':0.,'G':0.,'U':0.}}
mystats['ml'][gcr][idfile][k][i] = {'count':{'A':0,'C':0,'G':0,'U':0},'freq':{'A':0.,'C':0.,'G':0.,'U':0.}}
mystats[gcr][idfile] = {}
# read file
f = open(filename)
reader = csv.reader(f, delimiter=',')
next(f)
hdist_list = {}
seqs = {}
for row in reader:
nmut = int(row[2])
mseq = row[3].upper()
is_ml_ssa = has_ml(row[4])
for i in range(len(mseq)):
mystats['all'][gcr][idfile][nmut][i]['count'][mseq[i]] += 1
if is_ml_ssa:
mystats['ml'][gcr][idfile][nmut][i]['count'][mseq[i]] += 1
if is_ml_ssa:
accept_rate = 0
else:
accept_rate = 99
if randint(0,accept_rate) == 0:
if not seqs.has_key(nmut):
seqs[nmut] = []
if not hdist_list.has_key(nmut):
hdist_list[nmut] = {'HH':[],'HM':[],'MM':[]}
for prevseq,prevml in seqs[nmut]:
hdist = distance.hamming(prevseq,mseq)
if is_ml_ssa:
if prevml:
hdist_list[nmut]['MM'].append(hdist)
else:
hdist_list[nmut]['HM'].append(hdist)
else:
if prevml:
hdist_list[nmut]['HM'].append(hdist)
else:
hdist_list[nmut]['HH'].append(hdist)
seqs[nmut].append((mseq,is_ml_ssa))
for nmut in range(seqlen+1):
avg_dist = {'HH':0.,'HM':0.,'MM':0.}
if hdist_list.has_key(nmut):
if len(hdist_list[nmut]['HH']) > 0:
avg_dist['HH'] = float(sum(hdist_list[nmut]['HH'])) / len(hdist_list[nmut]['HH'])
if len(hdist_list[nmut]['HM']) > 0:
avg_dist['HM'] = float(sum(hdist_list[nmut]['HM'])) / len(hdist_list[nmut]['HM'])
if len(hdist_list[nmut]['MM']) > 0:
avg_dist['MM'] = float(sum(hdist_list[nmut]['MM'])) / len(hdist_list[nmut]['MM'])
mystats[gcr][idfile][nmut] = avg_dist
f.close()
# compute freq
for sst in ['all','ml']:
for idf in mystats['all'][gcr].keys():
for nmut in range(seqlen+1):
for i in range(seqlen):
nttotal = mystats[sst][gcr][idf][nmut][i]['count']['A'] + mystats[sst][gcr][idf][nmut][i]['count']['C'] + mystats[sst][gcr][idf][nmut][i]['count']['G'] + mystats[sst][gcr][idf][nmut][i]['count']['U']
if nttotal>0:
mystats[sst][gcr][idf][nmut][i]['freq']['A'] = float(mystats[sst][gcr][idf][nmut][i]['count']['A']) / nttotal
mystats[sst][gcr][idf][nmut][i]['freq']['C'] = float(mystats[sst][gcr][idf][nmut][i]['count']['C']) / nttotal
mystats[sst][gcr][idf][nmut][i]['freq']['G'] = float(mystats[sst][gcr][idf][nmut][i]['count']['G']) / nttotal
mystats[sst][gcr][idf][nmut][i]['freq']['U'] = float(mystats[sst][gcr][idf][nmut][i]['count']['U']) / nttotal
## compute entropy
xvalues = range(seqlen+1)
yvalues = {10:{'all':[],'ml':[]},30:{'all':[],'ml':[]},50:{'all':[],'ml':[]},70:{'all':[],'ml':[]},90:{'all':[],'ml':[]}}
for gcr in list_gcr:
for nmut in range(seqlen+1):
entropy = {'all':{'avg':0.},'ml':{'avg':0.}}
list_ent_all = []
list_ent_ml = []
for idf in mystats['all'][gcr].keys():
entropy['all'][idf] = 0.
entropy['ml'][idf] = 0.
for i in range(seqlen):
colentropy_all = 0.
colentropy_ml = 0.
for nt in ['A','C','G','U']:
if mystats['all'][gcr][idf][nmut][i]['freq'][nt] > 0:
colentropy_all -= mystats['all'][gcr][idf][nmut][i]['freq'][nt] * log(mystats['all'][gcr][idf][nmut][i]['freq'][nt],2)
if mystats['ml'][gcr][idf][nmut][i]['freq'][nt] > 0:
colentropy_ml -= mystats['ml'][gcr][idf][nmut][i]['freq'][nt] * log(mystats['ml'][gcr][idf][nmut][i]['freq'][nt],2)
entropy['all'][idf] += colentropy_all
entropy['ml'][idf] += colentropy_ml
list_ent_all.append(entropy['all'][idf])
list_ent_ml.append(entropy['ml'][idf])
if len(list_ent_all) > 0:
entropy['all']['avg'] = float(sum(list_ent_all)) / len(list_ent_all)
if len(list_ent_ml) > 0:
entropy['ml']['avg'] = float(sum(list_ent_ml)) / len(list_ent_ml)
if True:
yvalues[gcr]['all'].append(entropy['all']['avg'])
yvalues[gcr]['ml'].append(entropy['ml']['avg'])
# plot
plt.style.use('fivethirtyeight')
color_scheme = {10:'C0',30:'C1',50:'C2',70:'C3',90:'C4'}
for gcr in list_gcr:
if True:
plt.plot(xvalues,yvalues[gcr]['all'],label='all (' + str(gcr) + '%)',ls='-',color=color_scheme[gcr])
plt.plot(xvalues,yvalues[gcr]['ml'],label='ml (' + str(gcr) + '%)',ls='--',color=color_scheme[gcr])
else:
plt.hist([yvalues[gcr]['all'],yvalues[gcr]['ml']])
plt.legend(loc=3)
# show plot
plt.show()
plt.style.use('fivethirtyeight')
color_scheme = {10:'C0',30:'C1',50:'C2',70:'C3',90:'C4'}
# plot data
xvalues = range(seqlen+1)
yvalues = {}
for gcr in list_gcr:
yvalues[gcr] = {'HH':[],'HM':[],'MM':[]}
for idf in mystats[gcr].keys():
tmp = {}
for nm in xvalues:
if not tmp.has_key(nm):
tmp[nm] = {'HH':[],'HM':[],'MM':[]}
if mystats[gcr][idfile].has_key(nm):
tmp[nm]['HH'].append(mystats[gcr][idf][nm]['HH'])
tmp[nm]['HM'].append(mystats[gcr][idf][nm]['HM'])
tmp[nm]['MM'].append(mystats[gcr][idf][nm]['MM'])
else:
tmp[nm]['HH'].append(0.)
tmp[nm]['HM'].append(0.)
tmp[nm]['MM'].append(0.)
for nm in xvalues:
avgHH = float(sum(tmp[nm]['HH'])) / len(tmp[nm]['HH'])
avgHM = float(sum(tmp[nm]['HM'])) / len(tmp[nm]['HM'])
avgMM = float(sum(tmp[nm]['MM'])) / len(tmp[nm]['MM'])
yvalues[gcr]['HH'].append(avgHH)
yvalues[gcr]['HM'].append(avgHM)
yvalues[gcr]['MM'].append(avgMM)
plt.plot(xvalues,yvalues[gcr]['HH'],label='HH (' + str(gcr) + '%)',ls='-',color=color_scheme[gcr])
#plt.plot(xvalues,yvalues[gcr]['HM'],label='HM (' + str(gcr) + '%)',ls='--',color=color_scheme[gcr])
plt.plot(xvalues,yvalues[gcr]['MM'],label='MM (' + str(gcr) + '%)',ls=':',color=color_scheme[gcr])
plt.legend(loc=3)
# show plot
plt.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment