Roman Sarrazin-Gendron / RNABayesPairing
Commit f554cf23, authored Jun 03, 2019 by Roman Sarrazin-Gendron
parent ceff9cea

    updated mapping of structures to Rfam alignments

Changes: 10 files
bayespairing/src/BayesPairing.py
@@ -12,7 +12,7 @@ import math
 import make_BN_from_carnaval as makeBN
 
-def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000):
+def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000, leave_out_sequence=False, left_out_sequence=""):
     pdbs = pickle.load(open("../models/" + dataset + "_PDB_names.cPickle", "rb"))
     graphs = pickle.load(open("../models/" + dataset + "_one_of_each_graph.cPickle", "rb"))
     return_dict = {}
@@ -23,11 +23,14 @@ def parse_sequence(seq, modules, ss, dataset, left_out, sm=0.3, mc=0, p=10000):
         this_graph = graphs[mod][0]
         #print("Creating Bayesian Network", flush=True)
-        makeBN.call_makeBN(mod, dataset, left_out)
+        makeBN.call_makeBN(mod, dataset, left_out, leave_out_seq=leave_out_sequence, left_out_seq=left_out_sequence)
         #print("Bayesian Network created", flush=True)
         motif = pickle.load(open(str("../models/" + dataset + "_BN_" + str(mod)), "rb"))
         pickle_name = dataset + "_samp" + str(mod) + ".cPickle"
+        print("LEFT OUT", left_out)
         if left_out != "NONE":
+            print("LEFT OUT", left_out, "building new model without")
             sample_df, sample_dict, struct_list = bs.setup_sampling(motif, p)
             scores, return_struct, max = bs.get_best_struct(motif, struct_list, sm)
             pickle.dump(struct_list, open("../models/" + pickle_name, "wb"))
@@ -107,8 +110,11 @@ def returner(scores, seq, ss="", m=8, n=5, k=1, sscons=2):
                 nocons_score = testSS.call_rnafold(seq)
                 cons_score = testSS.call_rnafold(seq, cons_seq)
                 quotient = (nocons_score / cons_score)
-                current_score = current_score + float((float(1) / float(k)) * math.log(quotient))
+                seq_score = current_score
+                current_score = seq_score + float((float(1) / float(k)) * math.log(quotient))
                 final_top[str(poz)] = current_score
+                #if current_score>20:
+                #    print(position_subsets,current_score,nocons_score,cons_score,quotient, seq_score,math.log(quotient))
 #                end1 = time.time()
 #                print("FOLDING TIME")
 #                print(end1-start1)
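The two new keyword arguments thread a sequence-based leave-one-out option from parse_sequence down to the Bayesian network construction. A minimal usage sketch under assumed values; the dataset name, module IDs, PDB identifier, and sequences below are illustrative, not taken from the diff:

    # Hypothetical call: in addition to excluding occurrences from the PDB
    # named in left_out, skip every motif occurrence whose reconstructed
    # sequence equals left_out_sequence.
    return_dict = parse_sequence(
        seq="GGCUAGGCUUAAGCC",      # query sequence (illustrative)
        modules=[0, 1],             # module IDs to scan for (illustrative)
        ss="",
        dataset="rna3dmotif",       # assumed dataset name
        left_out="3D2V",            # illustrative PDB identifier
        leave_out_sequence=True,    # new flag added in this commit
        left_out_sequence="GNRAA",  # occurrences with this sequence are excluded
    )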
bayespairing/src/Rfam_to_module.py
 from Bio import SeqIO
 import numpy as np
+from numpy.random import choice
 
 def get_consensus(module_seqs):
-    consensus = ""
+    consensus = []
     for i in range(len(module_seqs[0])):
         scores = [0, 0, 0, 0]
         alph = list("ACGU")
@@ -17,10 +20,42 @@ def get_consensus(module_seqs):
         #print(scores)
         #print(scores.index(max(scores)))
         #print(np.argmax(scores))
-        consensus = consensus + alph[scores.index(max(scores))]
+        tot = sum(scores)
+        consensus.append([x / tot for x in scores])
     return consensus
 
-def rfam_to_module(positions, family_file, output_name):
+def rfam_to_module(positions, family_file, output_name, as_list=False, seqs=[]):
     alph = ["A", "C", "G", "U"]
+    if as_list:
+        module_seqs = []
+        for seq in seqs:
+            module_seqs.append("".join(seq))
+        cons = get_consensus(module_seqs)
+        fmodule_seqs = []
+        for s in module_seqs:
+            this_mod = ""
+            for col in range(len(s)):
+                if str(s[col]) != "-":
+                    this_mod = this_mod + str(s[col])
+                else:
+                    this_mod = this_mod + alph[choice(np.array([0, 1, 2, 3]), p=cons[col])]
+            fmodule_seqs.append(this_mod)
+        with open(output_name, "w") as f:
+            for j in range(len(fmodule_seqs)):
+                f.write(">module_seq")
+                f.write(str(j))
+                f.write("\n")
+                f.write(fmodule_seqs[j])
+                f.write("\n")
+        return
     module_seqs = []
     modules_handles = []
     for record in SeqIO.parse(family_file, "fasta"):
@@ -40,7 +75,7 @@ def rfam_to_module(positions, family_file,output_name):
             if str(s[col]) != "-":
                 this_mod = this_mod + str(s[col])
             else:
-                this_mod = this_mod + cons[col]
+                this_mod = this_mod + alph[choice(np.array([0, 1, 2, 3]), p=cons[col])]
         fmodule_seqs.append(this_mod)
     #print(module_seqs)
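get_consensus now returns one normalized frequency vector per alignment column instead of a single consensus letter, and gap characters are filled by sampling from that per-column distribution rather than pasting in the majority nucleotide. A self-contained sketch of the same idea, with made-up frequencies:

    import numpy as np
    from numpy.random import choice

    alph = ["A", "C", "G", "U"]
    # One probability vector per column, as get_consensus now returns
    # (values illustrative).
    cons = [[0.7, 0.1, 0.1, 0.1], [0.25, 0.25, 0.25, 0.25]]

    aligned = "A-"
    filled = ""
    for col in range(len(aligned)):
        if aligned[col] != "-":
            filled = filled + aligned[col]
        else:
            # Draw a nucleotide index weighted by the column distribution,
            # mirroring the updated gap handling in rfam_to_module.
            filled = filled + alph[choice(np.array([0, 1, 2, 3]), p=cons[col])]
    print(filled)  # e.g. "AG"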
@@ -48,7 +83,7 @@ def rfam_to_module(positions, family_file,output_name):
     #print(fmodule_seqs)
-    with open(output_name,"w") as f:
+    with open(output_name, "w") as f:
         for j in range(len(modules_handles)):
             f.write(">")
             f.write(modules_handles[j])
bayespairing/src/bayes_to_seqpy.py
@@ -312,6 +312,7 @@ def build_regex(position_subsets,component_distance, iii, convert, positions, mc
             #dis = component_distance[junction]
             if len(re_call) > 0:
+                dis = component_distance[junction]
                 #print(dis)
                 re_call = re_call + r")([ACGU]{" + str(dis[0] - 1) + "," + str(dis[1] - 1) + "})"
                 junction = junction + 1
@@ -423,6 +424,16 @@ def adjust_aln(best_struct,positions,better_positions):
     new_best_struct = (full_sample, new_nucs_for_each_nodes, best_of_sample)
     return new_best_struct
 
+def rotate_list(lst):
+    return lst[1:] + lst[:1]
+
+def get_all_rotations(lst):
+    rots = [lst]
+    for i in range(len(lst) - 1):
+        lst = rotate_list(lst)
+        rots.append(lst)
+    return rots
+
 def get_position_permutations(position_subsets, component_distance):
     final_position_permutations = []
     to_permute = []
@@ -440,7 +451,7 @@ def get_position_permutations(position_subsets,component_distance):
     for position in range(len(component_distance) + 1):
         if position not in set(permuted):
             not_permuted.append(position)
-    permutations = list(itertools.permutations(to_permute))
+    permutations = get_all_rotations(to_permute)
     for perm in permutations:
         this_perm = []
         perm_counter = 0
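The switch from itertools.permutations to get_all_rotations shrinks the candidate orderings from n! to n, presumably because only cyclic reorderings of the module's strand groups are meaningful here. The behavior of the new helpers, runnable in isolation:

    # The two helpers as introduced by this commit, shown standalone.
    def rotate_list(lst):
        return lst[1:] + lst[:1]

    def get_all_rotations(lst):
        rots = [lst]
        for i in range(len(lst) - 1):
            lst = rotate_list(lst)
            rots.append(lst)
        return rots

    # Three cyclic rotations instead of 3! = 6 full permutations:
    print(get_all_rotations([0, 1, 2]))
    # [[0, 1, 2], [1, 2, 0], [2, 0, 1]]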
@@ -581,13 +592,13 @@ def seq_to_struct(
         h = int(1.5 * b)
         j = int(1.2 * b)
-        reduced_component_distance.append([a, c])
+        reduced_component_distance.append([a, b])
         halfway_component_distance.append([a, d])
         e_distance.append([a, e])
         f_distance.append([a, f])
         g_distance.append([a, c])
         h_distance.append([a, h])
-        j_distance.append([a, j])
+        j_distance.append([a, b])
     permutations = get_permutations(position_subsets, component_distance, iii)
     regex_list = []
     for perm in permutations:
@@ -611,7 +622,7 @@ def seq_to_struct(
     regex_list.append([res6, position_subsets, node_dict])
     regex_list.append([res7, position_subsets, node_dict])
     regex_list.append([res8, position_subsets, node_dict])
-    #("REGEX:",res)
+    #print("REGEX:",regex_list)
     output = []
     final_model = {}
     import time
bayespairing/src/display_modules.py
@@ -214,7 +214,7 @@ def plot_module(graphs,name,latex=False):
         BP = mpatches.Patch(color="black", label="Backbone")
         SP = mpatches.Patch(color="red", label="Stacking")
 #        plt.legend(handles=[NCP,CP,BP,SP],prop={'size': 16})
-#        plt.show()
+        plt.show()
         plt.savefig("../Graphs/" + name + "graph" + str(m) + ".png", format="png")
         plt.clf()
bayespairing/src/make_BN_from_carnaval.py
@@ -190,16 +190,17 @@ def make_graph_from_carnaval(g, aln_dict):
     #plt.show()
     return motif
 
-def call_makeBN(mod, dataset, left_out):
+def call_makeBN(mod, dataset, left_out, leave_out_seq=False, left_out_seq=""):
     current_ID = mod
     excluded = left_out
     bayesname = dataset + "_BN_" + str(current_ID)
-#    if excluded != "NONE":
-#        removecall = "rm " + "../models/" + bayesname
-#        subprocess.call(removecall, shell=True)
+    if excluded != "NONE":
+        removecall = "rm " + "../models/" + bayesname
+        subprocess.call(removecall, shell=True)
     #if not os.path.isfile(bayesname), changed for cross-validation; always remake the model
     if not os.path.isfile("../models/" + bayesname):
         print("building and sampling bayesian network")
         aln_list = pickle.load(open("../models/" + dataset + "_aligned_modulegraphs.cPickle", 'rb'))
         g_list = pickle.load(open("../models/" + dataset + "_one_of_each_graph.cPickle", 'rb'))
         pdb_names = pickle.load(open("../models/" + dataset + "_PDB_names.cPickle", 'rb'))
@@ -207,8 +208,9 @@ def call_makeBN(mod,dataset,left_out):
     if excluded != "NONE":
         excluded_indexes = []
         for pdb in range(len(pdb_names[mod])):
-            if excluded in pdb_names[mod][pdb]:
+            motif_seq = "".join([x[1]["nuc"] for x in sorted(g_list[mod][pdb].nodes(data=True))])
+            if excluded in pdb_names[mod][pdb] or (leave_out_seq and motif_seq == left_out_seq):
                 excluded_indexes.append(pdb)
         if 0 in excluded_indexes:
             excluded_indexes.remove(0)
     g = g_list[current_ID]
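The sequence-based exclusion works by rebuilding each occurrence's nucleotide string from its graph: nodes are sorted and their "nuc" attributes concatenated. A toy reconstruction of that one-liner, assuming (as the diff does) that node keys sort in 5'-to-3' order and each node carries a "nuc" attribute; the graph below is a made-up stand-in for one entry of g_list[mod]:

    import networkx as nx

    # Toy stand-in for one motif occurrence graph (keys and nucleotides
    # illustrative).
    g = nx.Graph()
    g.add_node(1, nuc="G")
    g.add_node(2, nuc="N")
    g.add_node(3, nuc="R")
    g.add_node(4, nuc="A")

    # Same expression as in call_makeBN: sort nodes, join their nucleotides.
    motif_seq = "".join([x[1]["nuc"] for x in sorted(g.nodes(data=True))])
    print(motif_seq)  # "GNRA"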
bayespairing/src/map_PDB_to_Rfam.py
@@ -5,6 +5,8 @@ import networkx as nx
 from Bio import pairwise2
 from Bio.pairwise2 import format_alignment
 from Bio.Align.Applications import ClustalwCommandline
+from Bio.Align.Applications import ClustalOmegaCommandline
+import os
 from bs4 import BeautifulSoup
 from bs4.element import Comment
@@ -14,6 +16,9 @@ import time
 import pickle
 import requests
 
+rfam = pickle.load(open("../models/RFAM_databse.cPickle", "rb"))
+
 def get_from_rfam(fam):
     url = "http://rfam.org/family/" + fam + "/alignment/stockholm"
     r = requests.get(url)
@@ -33,61 +38,61 @@ def get_rfam_alignment(family):
 def get_PDB_sequence(PDBid):
     #print("PDB")
-    #print(PDBid)
+    print(PDBid)
     PDB, chain = PDBid.split(".")
     #print(PDB)
     #print("../all_graphs_pickled/" + PDB + ".nxpickled")
     try:
         g = pickle.load(open("../models/all_graphs_pickled/" + PDB + ".nxpickled", "rb"))
     except FileNotFoundError:
         print("PDB FILE NOT FOUND")
         return ("", 0)
     seq = ""
     nodes = []
     for node in g.nodes(data=True):
         #print(node)
         #print(node[0][0],chain)
         if node[0][0] == chain:
             nodecode = (node[0][1])
             nodes.append((int(nodecode), node[1]["nt"]))
     sortednodes = sorted(list(nodes))
     nuc_by_node = {}
     missing_nuc = False
     #print("NODES")
     numericals = [x[0] for x in sortednodes]
     decalage = 0
     if 1 not in numericals:
         decalage = decalage + 1
         sortednodes.append((1, "N"))
+        sortednodes = sorted(sortednodes)
         #missing_nuc=True
         #decalage = decalage +1
     newnodes = []
     #for i in sortednodes:
     #    newnodes.append((i[0],i[1]))
     #sortednodes = sorted(list(newnodes))
     numericals = [x[0] for x in sortednodes]
     #print("MISSING 1", PDBid)
     #print(numericals)
     #sortednodes=sorted(sortednodes)
     #numericals = [x[0]-1 for x in sortednodes]
     #numericals.insert(0,0)
     #else:
     #print("NOT MISSING", PDBid)
     for i in sortednodes:
         nuc_by_node[i[0]] = i[1]
     #print(sortednodes)
     for i in range(1, int(sortednodes[-1][0]) + 1):
         if i not in numericals:
             "NOT IN NODES"
             seq = seq + "-"
         else:
             seq = seq + nuc_by_node[i]
     ss = g.graph['ss']
-    #print(seq)
+    print(seq)
     #print("MISSING_NUC",PDBid,missing_nuc)
-    return (seq, decalage)
+    return (seq, 0)
 
 def match_rfam_to_PDB(PDB, family, range="", written=[], newfam=False):
     if newfam == True:
@@ -108,9 +113,9 @@ def match_rfam_to_PDB(PDB, family, range="",written=[],newfam=False):
     #        print("VALID")
     positions = rfam[family][PDB.split(".")[0]][1]
-    seq_range = [int(i) for i in positions.split(" - ")]
-    #print(seq_range)
-    seq = seq[seq_range[0] - 1:seq_range[1] - 1]
+    #seq_range= [int(i) for i in positions.split(" - ")]
+    #print("SEQ RANGE", seq_range)
+    #seq=seq[seq_range[0]-1:seq_range[1]-1]
     #print(seq)
     #print(written)
@@ -133,7 +138,7 @@ def match_rfam_to_PDB(PDB, family, range="",written=[],newfam=False):
     #        break
     #if len(gapped_seq)==0:
     #    exit("NO MATCH FOUND")
-    return gapped_seq, written, seq_range, decalage
+    return gapped_seq, written, 0, 0
 
 def get_matches(rfam, rfam_by_PDB, PDBs):
@@ -168,21 +173,23 @@ def get_matches(rfam,rfam_by_PDB, PDBs):
     return matches
 
 def gapped_column_from_ungapped_nuc_pos(gapped, ungapped, ungapped_pos):
     #print(gapped)
     #print(ungapped)
     #print("CONVERTING GAPPED TO UNGAPPED")
     #print("POSITION ",ungapped_pos)
-    gapped_counter = 0
-    ungapped_counter = 0
+    # --01-2
+    gapped_counter = -1
+    ungapped_counter = -1
     for i in range(len(gapped)):
+        if ungapped_counter == ungapped_pos + 1:
+            #print("GAPPED",gapped_counter-1)
+            return gapped_counter - 1
         if gapped[i] == "-":
             gapped_counter = gapped_counter + 1
         else:
             gapped_counter = gapped_counter + 1
             ungapped_counter = ungapped_counter + 1
             if ungapped_counter == ungapped_pos:
                 #print("GAPPED",gapped_counter-1)
                 return gapped_counter
     return -1000
 
 def pos_from_fred(fred_list):
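With both counters now starting at -1, the function maps a zero-based position in the ungapped sequence to its zero-based column in the gapped alignment (the "# --01-2" comment in the source sketches this indexing). A worked example against the function as reconstructed above:

    # gapped alignment :  - - A C - G
    # gapped column    :  0 1 2 3 4 5
    # ungapped position:      0 1   2
    # Ungapped position 2 ("G") therefore sits in alignment column 5.
    print(gapped_column_from_ungapped_nuc_pos("--AC-G", "ACG", 2))  # 5
    # Positions past the end of the sequence fall through to the sentinel:
    print(gapped_column_from_ungapped_nuc_pos("--AC-G", "ACG", 9))  # -1000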
@@ -260,7 +267,7 @@ def find_good_columns(module_columns_by_family, family):
     cols = []
     module_length = -1
     for col in module_columns_by_family[family]:
-        module_length = len(module_columns_by_family[family][col][0])
+        module_length = len(module_columns_by_family[family][col])
         break
     #print(module_length)
     for node_number in range(module_length):
@@ -268,7 +275,7 @@ def find_good_columns(module_columns_by_family, family):
         counts = {}
         total = 0
         for example in module_columns_by_family[family]:
-            this_col = module_columns_by_family[family][example][0][node_number]
+            this_col = module_columns_by_family[family][example][node_number]
             #print(this_col)
             if this_col in counts:
                 counts[this_col] = counts[this_col] + 1
@@ -383,6 +390,25 @@ def get_ss_from_rfam_family(family,cols):
     print(cols, ss)
     return ss
 
+def alignment_column_from_seq_positions(positions_for_this_pdb, seq):
+    print("GETTING ALIGNMENT COLUMNS")
+    gapped_seq = seq
+    s2 = str(seq)
+    ungapped_seq = s2.replace("-", "")
+    print(gapped_seq, ungapped_seq)
+    aln_pos = []
+    for i in positions_for_this_pdb:
+        j = gapped_column_from_ungapped_nuc_pos(gapped_seq, ungapped_seq, i)
+        aln_pos.append(j)
+    return (aln_pos)
+
 def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY, matches, module_PDB_addresses):
     newfam = True
     written = []
@@ -418,19 +444,21 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
         print("sequence file is empty")
         return -1
-    a = ClustalwCommandline("./clustalw2", infile="seq2.fasta", outfile="seq3.clustal")
-    print(a)
-    a()
+    #a =ClustalwCommandline("./clustalw2", infile="seq2.fasta", outfile="seq3.clustal")
+    #print(a)
+    #a()
     print(module_PDB_addresses)
-    b = ClustalwCommandline("./clustalw2", profile1=profile, profile2="seq2.fasta", outfile="new_aln3.clustal")
+    if os.path.isfile("new_aln3.fasta"):
+        os.remove("new_aln3.fasta")
+    b = ClustalOmegaCommandline("../src/clustalo", profile1=profile, profile2="seq2.fasta", outfile="new_aln3.fasta")
     #./clustalw2 -PROFILE1=RF02540.clustal -SEQUENCES -PROFILE2=seq.clustal -OUTPUT=new_alignment.clustal
     print(b)
     b()
     module_columns_by_family = {}
     module_columns_by_family[CURRENTLY_STUDIED_FAMILY] = {}
-    aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
+    aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
     appearances_of_this_module_in_this_family = []
     for record in aln:
         current_PDB = record.id[3:]
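The profile alignment step moves from ClustalW to Clustal Omega with FASTA output, so every downstream AlignIO.read call switches from "clustal" to "fasta". A minimal standalone sketch of the same Biopython wrapper, assuming a clustalo binary at ../src/clustalo as in the diff; the profile file name is hypothetical:

    from Bio.Align.Applications import ClustalOmegaCommandline
    from Bio import AlignIO
    import os

    # clustalo will not overwrite an existing output file unless forced,
    # hence the explicit removal mirrored from the diff.
    if os.path.isfile("new_aln3.fasta"):
        os.remove("new_aln3.fasta")

    cmd = ClustalOmegaCommandline("../src/clustalo",
                                  profile1="RF02540.fasta",  # hypothetical family profile
                                  profile2="seq2.fasta",
                                  outfile="new_aln3.fasta")
    stdout, stderr = cmd()  # runs the external aligner
    aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
    print(aln)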
@@ -444,10 +472,12 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
         if current_PDB in module_PDB_addresses:
             positions_for_this_pdb = module_PDB_addresses[current_PDB]
             print("positions", positions_for_this_pdb)
-            module_columns_by_family[CURRENTLY_STUDIED_FAMILY][current_PDB] = positions_for_this_pdb
+            columns_for_this_pdb = alignment_column_from_seq_positions(positions_for_this_pdb, record.seq)
+            print("COLUMNS FOR THIS PDB", columns_for_this_pdb)
+            module_columns_by_family[CURRENTLY_STUDIED_FAMILY][current_PDB] = columns_for_this_pdb
     #print("---------")
-    #print(module_columns_by_family)
+    print("Module columns positions by family", module_columns_by_family)
     good_cols = find_good_columns(module_columns_by_family, CURRENTLY_STUDIED_FAMILY)
     print('FOUND COLUMNS:', good_cols)
     if [-1000] == good_cols[0]:
@@ -462,10 +492,10 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
         #print(columns)
         #print(good_cols)
-        if columns[0] in good_cols[0]:
+        if columns in good_cols:
             #print(sequence)
-            aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
+            aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
             for record in aln:
                 #print("PDB :", PDB)
                 if PDB in record.id and -1000 not in columns:
@@ -498,9 +528,9 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
     good_cols = find_good_columns(module_columns_by_family, CURRENTLY_STUDIED_FAMILY)
     if good_cols == [] or [-1000] in good_cols:
         return -1
-    aln = AlignIO.read(open("new_aln3.clustal"), "clustal")
+    aln = AlignIO.read(open("new_aln3.fasta"), "fasta")
     for PDB in module_columns_by_family[CURRENTLY_STUDIED_FAMILY]:
-        columns = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB][0]
+        columns = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB]
         sequence = module_columns_by_family[CURRENTLY_STUDIED_FAMILY][PDB][1]
         experiment_seq = []
         #print(columns)
@@ -543,10 +573,11 @@ def sanity_check_existing_seqs():
         print("--------------------")
     pickle.dump(fixed_seqs, open("test3_ram.cPickle", "wb"))
 
-if __name__ == "__main__":
-    #sanity_check_existing_seqs()
-    rfam = pickle.load(open("../models/RFAM_databse.cPickle", "rb"))
+def PDB_to_Rfam(PDB_code, PDB_positions):
     rfam_by_PDB = {}
     number = 0
     for i in rfam:
@@ -571,15 +602,18 @@ if __name__ == "__main__":
     positions = {}
     module_PDB_addresses = {}
     rfam_module_seqs = []
-    print(matches)
-    #module_PDB = ['3D2V.B.2',"2GDI.Y.1"]
-    module_PDB = ['3D2V.B.2']
-    #module_positions= {'3D2V.B' : [6,7,39,40,41,42,43,44,45,70,71,72,73],"2GDI.Y": [14,15,51,52,53,54,55,56,82,83,84,85]}
-    module_positions = {'3D2V.B': [6, 7, 39, 40, 41, 42, 43, 44, 45, 70, 71, 72, 73]}
-    print(rfam_by_PDB)
+    module_PDB = PDB_code
+    print(module_PDB, PDB_positions)
+    module_positions = {}
+    for i in range(len(module_PDB)):
+        module_positions[module_PDB[i]] = [x - 1 for x in PDB_positions[i]]
+    print("Rfam by PDB", rfam_by_PDB)
     matches = get_matches(rfam, rfam_by_PDB, module_PDB)
-    print(matches)
-    print("KEYS", list(matches.keys()))
+    print("matches", matches)
+    #print("KEYS",list(matches.keys()))
+    if os.path.isfile("seq2.fasta"):
+        os.remove("seq2.fasta")
     with open("seq2.fasta", "w") as f:
@@ -600,3 +634,10 @@ if __name__ == "__main__":
     for seq in module_seqs:
         rfam_module_seqs.append(seq)
     print(rfam_module_seqs)
+    return (rfam_module_seqs)
+
+if __name__ == "__main__":
+    PDB_pos = [6, 7, 39, 40, 41, 42, 43, 44, 45, 70, 71, 72, 73]
+    a = PDB_to_Rfam(["3D2V.B"], [PDB_pos])
+    print(a)
\ No newline at end of file
bayespairing/src/module_from_desc.py
@@ -114,7 +114,12 @@ def create_module(desc,fasta,model_name, PDBs=[]):
     graphs_name = "../models/" + model_name + "_one_of_each_graph.cPickle"
     aln_name = "../models/" + model_name + "_aligned_modulegraphs.cPickle"
     PDB_name = "../models/" + model_name + "_PDB_names.cPickle"
-    g = make_graph(desc)
+    if 'desc' in desc.lower():
+        g = make_graph(desc)
+    else:
+        g = pickle.load(open(desc, 'rb'))
     graph_list = make_new_graph_examples(g, fasta)
     if os.path.isfile(graphs_name):
bayespairing/src/testSS.py
@@ -41,31 +41,22 @@ def get_constraints_from_BN(graph, positions):
         visited.append(n)
     return constraints
 
-def call_rnafold(seq, cons=""):
+def call_rnafold(seq, cons="", just_ss=False):
     E = 0
     p = 0
     Z = 0
-    """
-    with open("seq.txt","w") as f:
-        f.write(seq)
-        if cons!="":
-            f.write("\n")
-            f.write(cons)
-    subcall = ""
-    """
     if cons == "":
         out = run(['RNAfold', '-p', '--noPS'], input=seq, stdout=PIPE, universal_newlines=True)
         #subcall = "RNAfold -p < seq.txt > result.txt"
     else:
         #subcall = "RNAfold -p -C < seq.txt > result.txt"
         out = run(['RNAfold', '-p', '--noPS', '-C'], input='{}\n{}'.format(seq, cons), stdout=PIPE, universal_newlines=True)
     #subprocess.call(subcall,shell=True)
-    """
-    with open("result.txt", "r") as f:
-        lines = f.readlines()
-    """
-    out = run(['RNAfold', '-p', '--noPS', '-C'], input='{}\n{}'.format(seq, cons), stdout=PIPE, universal_newlines=True)
     lines = out.stdout.splitlines()
     ss = lines[1].split(" ")[0]
+    if just_ss:
+        return ss
     energy = (lines[1].split(" ")[-1]).strip()
     eval = energy[1:]
     final_energy = eval[:-1]
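call_rnafold now takes a just_ss flag that returns right after parsing the dot-bracket structure, skipping the energy and partition-function bookkeeping. A hypothetical call, with an illustrative sequence and constraint string; it requires the RNAfold binary on the PATH:

    # Constraint string uses RNAfold -C syntax; "." means unconstrained.
    ss = call_rnafold("GGGAAAUCC", cons=".........", just_ss=True)
    print(ss)  # dot-bracket string only, e.g. "(((...)))"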
@@ -73,16 +64,17 @@ def call_rnafold(seq,cons=""):
     weight = (lines[-1].split(" ")[7]).strip()
     p = float(weight[:-1])
-    #formula : Z = 1/p * exp(E/ RT))
+    # formula : Z = 1/p * exp(E/ RT))
     T = 274.5
     R = 0.00198717
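The comment sketches the Boltzmann relation used to recover the partition function from RNAfold's output: if the MFE structure has energy E and appears in the ensemble with frequency p, then p = exp(-E/RT) / Z, so Z = (1/p) * exp(-E/RT). A worked line with illustrative numbers; the source comment writes exp(E/RT), leaving the sign convention to the energy value:

    import math

    E = -9.30       # MFE in kcal/mol (illustrative)
    p = 0.25        # frequency of the MFE structure in the ensemble (illustrative)
    R = 0.00198717  # gas constant in kcal/(mol*K), as in the source
    T = 274.5       # temperature in kelvin, as in the source

    # Partition function recovered from the MFE energy and its frequency.
    Z = (1.0 / p) * math.exp(-E / (R * T))
    print(Z)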