Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Roman Sarrazin-Gendron
BP6.1
Commits
ea1be472
Commit
ea1be472
authored
Jun 20, 2018
by
Roman Sarrazin-Gendron
Browse files
added rfam ss
parent
ae0cdc4b
Changes
1
Hide whitespace changes
Inline
Side-by-side
models/family_for_each_PDB.py
View file @
ea1be472
...
...
@@ -239,7 +239,33 @@ def define_acceptable_columns(counts):
acceptable_cols
.
remove
(
i
)
return
acceptable_cols
def
find_good_columns
(
module_columns_by_family
,
family
):
def
get_bps_from_ss
(
ss
,
columns
):
starts
=
[]
ends
=
[]
ss
=
list
(
ss
)
for
i
in
range
(
len
(
list
(
ss
))):
if
ss
[
i
]
==
"("
or
ss
[
i
]
==
"<"
or
ss
[
i
]
==
"{"
:
starts
.
append
(
i
)
if
ss
[
i
]
==
")"
or
ss
[
i
]
==
">"
or
ss
[
i
]
==
"}"
:
ends
.
append
(
i
)
#struct = [(columns[starts[i]],columns[ends[i]]) for i in range(min(len(starts),len(ends)))]
if
min
(
len
(
starts
),
len
(
ends
))
==
0
:
return
()
else
:
struct
=
[(
starts
[
i
],
ends
[
i
])
for
i
in
range
(
min
(
len
(
starts
),
len
(
ends
)))]
return
struct
def
verify_columns_ss
(
columns
,
family
,
module
,
graphs
):
rfam_ss
=
get_ss_from_rfam_family
(
family
,
columns
)
rfam_bps
=
get_bps_from_ss
(
rfam_ss
,
columns
)
graph_ss
=
get_ss_from_graph
(
graphs
[
module
][
0
])
print
(
"COMPARING :"
,
rfam_bps
,
graph_ss
)
if
rfam_ss
==
graph_ss
:
return
True
else
:
return
False
def
find_good_columns
(
module_columns_by_family
,
family
,
module
,
graphs
):
good_cols
=
[]
cols
=
[]
module_length
=
-
1
...
...
@@ -266,9 +292,19 @@ def find_good_columns(module_columns_by_family, family):
good_cols_for_this_node
.
append
(
i
)
good_cols
.
append
(
good_cols_for_this_node
)
#print(counts)
return
good_cols
def
check_columns_ss
(
good_cols
,
family
,
module
,
graphs
):
final_cols
=
[[]
for
x
in
good_cols
]
for
i
in
range
(
len
(
good_cols
[
0
])):
this_cols
=
[
x
[
i
]
for
x
in
good_cols
]
print
(
"GETTING SS FOR COLUMNS:"
,
this_cols
)
it_fits
=
verify_columns_ss
(
this_cols
,
family
,
module
,
graphs
)
if
it_fits
==
True
:
for
col
in
range
(
len
(
final_cols
)):
final_cols
[
col
].
append
(
this_cols
[
col
])
good_cols
=
final_cols
return
good_cols
def
most_common
(
lst
):
return
max
(
set
(
lst
),
key
=
lst
.
count
)
...
...
@@ -343,11 +379,21 @@ def sanity_check(struct_seqs,exp_seqs):
return
-
1000
def
get_ss_from_rfam_family
(
family
):
def
get_ss_from_rfam_family
(
family
,
cols
):
ss
=
""
fname
=
"RF"
+
format
(
family
,
"05d"
)
+
".stockholm.txt"
with
open
(
fname
,
"r"
)
as
f
:
lines
=
f
.
readlines
()
full_ss
=
lines
[
-
3
].
split
()[
-
1
]
print
(
"GENERATING SS"
)
for
i
in
cols
:
print
(
full_ss
[
i
])
ss
=
ss
+
full_ss
[
i
]
print
(
'SS for columns : '
)
print
(
cols
,
ss
)
return
ss
def
get_module_seqs_for_family
(
CURRENTLY_STUDIED_FAMILY
,
matches
,
module_PDB_addresses
,
CURRENT_MODULE
):
def
get_module_seqs_for_family
(
CURRENTLY_STUDIED_FAMILY
,
matches
,
module_PDB_addresses
,
CURRENT_MODULE
,
graphs
):
newfam
=
True
written
=
[]
seq_ranges
=
{}
...
...
@@ -426,7 +472,7 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
#print("---------")
#print(module_columns_by_family)
good_cols
=
find_good_columns
(
module_columns_by_family
,
CURRENTLY_STUDIED_FAMILY
)
good_cols
=
find_good_columns
(
module_columns_by_family
,
CURRENTLY_STUDIED_FAMILY
,
CURRENT_MODULE
,
graphs
)
if
[
-
1000
]
==
good_cols
[
0
]:
return
[]
print
(
"GOOD COLUMNS"
,
good_cols
)
...
...
@@ -472,9 +518,10 @@ def get_module_seqs_for_family(CURRENTLY_STUDIED_FAMILY,matches,module_PDB_addre
start_pos
=
seq_ranges
[
current_PDB
][
0
]
this_example_columns
.
append
(
gapped_column_from_ungapped_nuc_pos
(
gapped_seq
,
ungapped_seq
,
pos
-
(
start_pos
)
-
alignment_is_right
))
module_columns_by_family
[
CURRENTLY_STUDIED_FAMILY
][
current_PDB
]
=
(
this_example_columns
,
nucs_from_fred
(
positions_for_this_pdb
))
good_cols
=
find_good_columns
(
module_columns_by_family
,
CURRENTLY_STUDIED_FAMILY
)
good_cols
=
find_good_columns
(
module_columns_by_family
,
CURRENTLY_STUDIED_FAMILY
,
CURRENT_MODULE
,
graphs
)
if
good_cols
==
[]
or
[
-
1000
]
in
good_cols
:
return
-
1
good_cols
=
check_columns_ss
(
good_cols
,
CURRENTLY_STUDIED_FAMILY
,
CURRENT_MODULE
,
graphs
)
aln
=
AlignIO
.
read
(
open
(
"new_aln3.clustal"
),
"clustal"
)
for
PDB
in
module_columns_by_family
[
CURRENTLY_STUDIED_FAMILY
]:
columns
=
module_columns_by_family
[
CURRENTLY_STUDIED_FAMILY
][
PDB
][
0
]
...
...
@@ -521,8 +568,8 @@ def sanity_check_existing_seqs():
pickle
.
dump
(
fixed_seqs
,
open
(
"test3_ram.cPickle"
,
"wb"
))
if
__name__
==
"__main__"
:
sanity_check_existing_seqs
()
"""
#
sanity_check_existing_seqs()
rfam
=
pickle
.
load
(
open
(
"RFAM_databse.cPickle"
,
"rb"
))
rfam_by_PDB
=
{}
number
=
0
...
...
@@ -536,10 +583,10 @@ if __name__ == "__main__":
rfam_by_PDB
[
PDB_id
].
append
(
i
)
aligned_modulegraphs = pickle.load(open("
rna3dmotif
_aligned_modulegraphs.cPickle",'rb'))
PDB_positions = pickle.load(open("
rna3dmotif
_PDB_positions.cPickle",'rb'))
PDBs = pickle.load(open("
rna3dmotif
_PDB_names.cPickle",'rb'))
graphs = pickle.load(open("
rna3dmotif
_one_of_each_graph.cPickle",'rb'))
aligned_modulegraphs
=
pickle
.
load
(
open
(
"
test
_aligned_modulegraphs.cPickle"
,
'rb'
))
PDB_positions
=
pickle
.
load
(
open
(
"
test
_PDB_positions.cPickle"
,
'rb'
))
PDBs
=
pickle
.
load
(
open
(
"
test
_PDB_names.cPickle"
,
'rb'
))
graphs
=
pickle
.
load
(
open
(
"
test
_one_of_each_graph.cPickle"
,
'rb'
))
rfam_extra_sequences
=
[]
format_aligned_modulegraphs
=
[]
format_PDB_positions
=
[]
...
...
@@ -550,7 +597,8 @@ if __name__ == "__main__":
matches
=
{}
positions
=
{}
module_PDB_addresses
=
{}
interesting = [2, 8, 9, 14, 20, 28, 36, 59, 113, 127, 133, 150, 162, 194, 195]
#interesting = [2, 8, 9, 14, 20, 28, 36, 59, 113, 127, 133, 150, 162, 194, 195]
interesting
=
range
(
len
(
graphs
))
test_modules
=
interesting
a
=
get_ss_from_graph
(
graphs
[
2
][
0
])
rfam_module_seqs
=
{}
...
...
@@ -571,7 +619,7 @@ if __name__ == "__main__":
if
matches
[
module
][
family_match
][
0
]
>
10
:
#if family_match==2541:
#print("ZIPCODE",module_addresses)
module_seqs = get_module_seqs_for_family(family_match,matches,module_addresses,module)
module_seqs
=
get_module_seqs_for_family
(
family_match
,
matches
,
module_addresses
,
module
,
graphs
)
if
module_seqs
==-
1
:
print
(
"FAILED TO FIND NEW SEQUENCES FOR FAMILY "
,
family_match
)
continue
...
...
@@ -585,11 +633,15 @@ if __name__ == "__main__":
#rfam_name = "Rfam_module_seqs_"+ str(module) + ".cPickle"
print
(
"OUTPUT"
)
for i in rfam_extra_sequences:
final_sequences
=
[]
for
i
in
range
(
len
(
rfam_extra_sequences
)):
print
(
i
)
pickle.dump(rfam_extra_sequences,open("test2_rfam.cPickle",'wb'))
pickle.dump(format_aligned_modulegraphs,open("test2_aligned_modulegraphs.cPickle",'wb'))
pickle.dump(format_graphs,open("test2_one_of_each_graph.cPickle",'wb'))
pickle.dump(format_PDB_names,open("test2_PDB_names.cPickle",'wb'))
pickle.dump(format_PDB_positions,open("test2_PDB_positions.cPickle",'wb'))
"""
new_seq
=
rule_out_impossible_seqs
(
graphs
[
interesting
[
i
]][
0
],
rfam_extra_sequences
[
i
])
final_sequences
.
append
(
new_seq
)
#pickle.dump(rfam_extra_sequences,open("test3_rfam.cPickle",'wb'))
pickle
.
dump
(
final_sequences
,
open
(
"test4_rfam.cPickle"
,
'wb'
))
pickle
.
dump
(
format_aligned_modulegraphs
,
open
(
"test4_aligned_modulegraphs.cPickle"
,
'wb'
))
pickle
.
dump
(
format_graphs
,
open
(
"test4_one_of_each_graph.cPickle"
,
'wb'
))
pickle
.
dump
(
format_PDB_names
,
open
(
"test4_PDB_names.cPickle"
,
'wb'
))
pickle
.
dump
(
format_PDB_positions
,
open
(
"test4_PDB_positions.cPickle"
,
'wb'
))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment