Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Vladimir Reinharz
mateRNAl_all
Commits
4bb1aec6
Commit
4bb1aec6
authored
Nov 22, 2017
by
Jerome Waldispuhl
Browse files
modif
parent
a37a45a9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Misc/ml_density.py
View file @
4bb1aec6
import
os
,
sys
import
re
import
csv
from
Bio.SeqUtils
import
GC
from
math
import
log
import
matplotlib.pyplot
as
plt
import
distance
from
random
import
randint
# List diretory
path
=
"../Data/RNAmuts_mfe_csv/"
...
...
@@ -52,7 +52,8 @@ for file in dirs:
# parse files
mystats
=
{
'all'
:{
10
:{},
30
:{},
50
:{},
70
:{},
90
:{}},
'ml'
:{
10
:{},
30
:{},
50
:{},
70
:{},
90
:{}}}
mystats
=
{
10
:{},
30
:{},
50
:{},
70
:{},
90
:{}}
seqdists
=
{}
for
gcr
in
seqfiles
.
keys
():
if
not
gcr
in
list_gcr
:
...
...
@@ -64,88 +65,105 @@ for gcr in seqfiles.keys():
for
filename
in
seqfiles
[
gcr
]:
print
filename
idfile
=
len
(
mystats
[
'all'
][
gcr
])
idfile
=
len
(
mystats
[
gcr
])
# init data structure
mystats
[
'all'
][
gcr
][
idfile
]
=
{}
mystats
[
'ml'
][
gcr
][
idfile
]
=
{}
for
k
in
range
(
seqlen
+
1
):
mystats
[
'all'
][
gcr
][
idfile
][
k
]
=
{}
mystats
[
'ml'
][
gcr
][
idfile
][
k
]
=
{}
for
i
in
range
(
seqlen
):
mystats
[
'all'
][
gcr
][
idfile
][
k
][
i
]
=
{
'count'
:{
'A'
:
0
,
'C'
:
0
,
'G'
:
0
,
'U'
:
0
},
'freq'
:{
'A'
:
0.
,
'C'
:
0.
,
'G'
:
0.
,
'U'
:
0.
}}
mystats
[
'ml'
][
gcr
][
idfile
][
k
][
i
]
=
{
'count'
:{
'A'
:
0
,
'C'
:
0
,
'G'
:
0
,
'U'
:
0
},
'freq'
:{
'A'
:
0.
,
'C'
:
0.
,
'G'
:
0.
,
'U'
:
0.
}}
mystats
[
gcr
][
idfile
]
=
{}
# read file
f
=
open
(
filename
)
reader
=
csv
.
reader
(
f
,
delimiter
=
','
)
next
(
f
)
hdist_list
=
{}
seqs
=
{}
for
row
in
reader
:
nmut
=
int
(
row
[
2
])
mseq
=
row
[
3
].
upper
()
is_ml_ssa
=
has_ml
(
row
[
4
])
for
i
in
range
(
len
(
mseq
)):
mystats
[
'all'
][
gcr
][
idfile
][
nmut
][
i
][
'count'
][
mseq
[
i
]]
+=
1
if
is_ml_ssa
:
mystats
[
'ml'
][
gcr
][
idfile
][
nmut
][
i
][
'count'
][
mseq
[
i
]]
+=
1
if
is_ml_ssa
:
accept_rate
=
0
else
:
accept_rate
=
99
if
randint
(
0
,
accept_rate
)
==
0
:
if
not
seqs
.
has_key
(
nmut
):
seqs
[
nmut
]
=
[]
if
not
hdist_list
.
has_key
(
nmut
):
hdist_list
[
nmut
]
=
{
'HH'
:[],
'HM'
:[],
'MM'
:[]}
for
prevseq
,
prevml
in
seqs
[
nmut
]:
hdist
=
distance
.
hamming
(
prevseq
,
mseq
)
if
is_ml_ssa
:
if
prevml
:
hdist_list
[
nmut
][
'MM'
].
append
(
hdist
)
else
:
hdist_list
[
nmut
][
'HM'
].
append
(
hdist
)
else
:
if
prevml
:
hdist_list
[
nmut
][
'HM'
].
append
(
hdist
)
else
:
hdist_list
[
nmut
][
'HH'
].
append
(
hdist
)
seqs
[
nmut
].
append
((
mseq
,
is_ml_ssa
))
for
nmut
in
range
(
seqlen
+
1
):
avg_dist
=
{
'HH'
:
0.
,
'HM'
:
0.
,
'MM'
:
0.
}
if
hdist_list
.
has_key
(
nmut
):
if
len
(
hdist_list
[
nmut
][
'HH'
])
>
0
:
avg_dist
[
'HH'
]
=
float
(
sum
(
hdist_list
[
nmut
][
'HH'
]))
/
len
(
hdist_list
[
nmut
][
'HH'
])
if
len
(
hdist_list
[
nmut
][
'HM'
])
>
0
:
avg_dist
[
'HM'
]
=
float
(
sum
(
hdist_list
[
nmut
][
'HM'
]))
/
len
(
hdist_list
[
nmut
][
'HM'
])
if
len
(
hdist_list
[
nmut
][
'MM'
])
>
0
:
avg_dist
[
'MM'
]
=
float
(
sum
(
hdist_list
[
nmut
][
'MM'
]))
/
len
(
hdist_list
[
nmut
][
'MM'
])
mystats
[
gcr
][
idfile
][
nmut
]
=
avg_dist
f
.
close
()
# compute freq
for
sst
in
[
'all'
,
'ml'
]:
for
idf
in
mystats
[
'all'
][
gcr
].
keys
():
for
nmut
in
range
(
seqlen
+
1
):
for
i
in
range
(
seqlen
):
nttotal
=
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'A'
]
+
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'C'
]
+
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'G'
]
+
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'U'
]
if
nttotal
>
0
:
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
'A'
]
=
float
(
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'A'
])
/
nttotal
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
'C'
]
=
float
(
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'C'
])
/
nttotal
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
'G'
]
=
float
(
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'G'
])
/
nttotal
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
'U'
]
=
float
(
mystats
[
sst
][
gcr
][
idf
][
nmut
][
i
][
'count'
][
'U'
])
/
nttotal
## compute entropy
xvalues
=
range
(
seqlen
+
1
)
yvalues
=
{
10
:{
'all'
:[],
'ml'
:[]},
30
:{
'all'
:[],
'ml'
:[]},
50
:{
'all'
:[],
'ml'
:[]},
70
:{
'all'
:[],
'ml'
:[]},
90
:{
'all'
:[],
'ml'
:[]}}
for
gcr
in
list_gcr
:
for
nmut
in
range
(
seqlen
+
1
):
entropy
=
{
'all'
:{
'avg'
:
0.
},
'ml'
:{
'avg'
:
0.
}}
list_ent_all
=
[]
list_ent_ml
=
[]
for
idf
in
mystats
[
'all'
][
gcr
].
keys
():
entropy
[
'all'
][
idf
]
=
0.
entropy
[
'ml'
][
idf
]
=
0.
for
i
in
range
(
seqlen
):
colentropy_all
=
0.
colentropy_ml
=
0.
for
nt
in
[
'A'
,
'C'
,
'G'
,
'U'
]:
if
mystats
[
'all'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
]
>
0
:
colentropy_all
-=
mystats
[
'all'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
]
*
log
(
mystats
[
'all'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
],
2
)
if
mystats
[
'ml'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
]
>
0
:
colentropy_ml
-=
mystats
[
'ml'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
]
*
log
(
mystats
[
'ml'
][
gcr
][
idf
][
nmut
][
i
][
'freq'
][
nt
],
2
)
entropy
[
'all'
][
idf
]
+=
colentropy_all
entropy
[
'ml'
][
idf
]
+=
colentropy_ml
list_ent_all
.
append
(
entropy
[
'all'
][
idf
])
list_ent_ml
.
append
(
entropy
[
'ml'
][
idf
])
if
len
(
list_ent_all
)
>
0
:
entropy
[
'all'
][
'avg'
]
=
float
(
sum
(
list_ent_all
))
/
len
(
list_ent_all
)
if
len
(
list_ent_ml
)
>
0
:
entropy
[
'ml'
][
'avg'
]
=
float
(
sum
(
list_ent_ml
))
/
len
(
list_ent_ml
)
if
True
:
yvalues
[
gcr
][
'all'
].
append
(
entropy
[
'all'
][
'avg'
])
yvalues
[
gcr
][
'ml'
].
append
(
entropy
[
'ml'
][
'avg'
])
# plot
plt
.
style
.
use
(
'fivethirtyeight'
)
color_scheme
=
{
10
:
'C0'
,
30
:
'C1'
,
50
:
'C2'
,
70
:
'C3'
,
90
:
'C4'
}
for
gcr
in
list_gcr
:
if
True
:
plt
.
plot
(
xvalues
,
yvalues
[
gcr
][
'all'
],
label
=
'all ('
+
str
(
gcr
)
+
'%)'
,
ls
=
'-'
,
color
=
color_scheme
[
gcr
])
plt
.
plot
(
xvalues
,
yvalues
[
gcr
][
'ml'
],
label
=
'ml ('
+
str
(
gcr
)
+
'%)'
,
ls
=
'--'
,
color
=
color_scheme
[
gcr
])
else
:
plt
.
hist
([
yvalues
[
gcr
][
'all'
],
yvalues
[
gcr
][
'ml'
]])
plt
.
legend
(
loc
=
3
)
# show plot
plt
.
show
()
plt
.
style
.
use
(
'fivethirtyeight'
)
color_scheme
=
{
10
:
'C0'
,
30
:
'C1'
,
50
:
'C2'
,
70
:
'C3'
,
90
:
'C4'
}
# plot data
xvalues
=
range
(
seqlen
+
1
)
yvalues
=
{}
for
gcr
in
list_gcr
:
yvalues
[
gcr
]
=
{
'HH'
:[],
'HM'
:[],
'MM'
:[]}
for
idf
in
mystats
[
gcr
].
keys
():
tmp
=
{}
for
nm
in
xvalues
:
if
not
tmp
.
has_key
(
nm
):
tmp
[
nm
]
=
{
'HH'
:[],
'HM'
:[],
'MM'
:[]}
if
mystats
[
gcr
][
idfile
].
has_key
(
nm
):
tmp
[
nm
][
'HH'
].
append
(
mystats
[
gcr
][
idf
][
nm
][
'HH'
])
tmp
[
nm
][
'HM'
].
append
(
mystats
[
gcr
][
idf
][
nm
][
'HM'
])
tmp
[
nm
][
'MM'
].
append
(
mystats
[
gcr
][
idf
][
nm
][
'MM'
])
else
:
tmp
[
nm
][
'HH'
].
append
(
0.
)
tmp
[
nm
][
'HM'
].
append
(
0.
)
tmp
[
nm
][
'MM'
].
append
(
0.
)
for
nm
in
xvalues
:
avgHH
=
float
(
sum
(
tmp
[
nm
][
'HH'
]))
/
len
(
tmp
[
nm
][
'HH'
])
avgHM
=
float
(
sum
(
tmp
[
nm
][
'HM'
]))
/
len
(
tmp
[
nm
][
'HM'
])
avgMM
=
float
(
sum
(
tmp
[
nm
][
'MM'
]))
/
len
(
tmp
[
nm
][
'MM'
])
yvalues
[
gcr
][
'HH'
].
append
(
avgHH
)
yvalues
[
gcr
][
'HM'
].
append
(
avgHM
)
yvalues
[
gcr
][
'MM'
].
append
(
avgMM
)
plt
.
plot
(
xvalues
,
yvalues
[
gcr
][
'HH'
],
label
=
'HH ('
+
str
(
gcr
)
+
'%)'
,
ls
=
'-'
,
color
=
color_scheme
[
gcr
])
#plt.plot(xvalues,yvalues[gcr]['HM'],label='HM (' + str(gcr) + '%)',ls='--',color=color_scheme[gcr])
plt
.
plot
(
xvalues
,
yvalues
[
gcr
][
'MM'
],
label
=
'MM ('
+
str
(
gcr
)
+
'%)'
,
ls
=
':'
,
color
=
color_scheme
[
gcr
])
plt
.
legend
(
loc
=
3
)
# show plot
plt
.
show
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment