Project: Anton / CaRNAval2

Commit 7b4cf461
Authored Apr 18, 2020 by Anton
Commit message: Code & co
Parent: adb5dcac
Changes: 20
README (new file, 0 → 100644)
SRC/Clean_new_collection.py (new file, 0 → 100644)
# external imports
import sys
import os.path
# test
import networkx as nx

# internal imports
import shared_functions
import RIN
import RIN_management
import subgraph
import processing

# configuration
if len(sys.argv) > 1:
    MODE = int(sys.argv[1])
else:
    print("Need to specify the mode as an argument in the command line")
    sys.exit(0)

if MODE == 1:
    # Cleaning a new collection with the support of an old one
    old_d_RINS_file = './CaRNAval_1.0_with_SSEs_distributions.nxpickled'
    # new_d_RINs_file='./mpi_thesis_test_rsr_v3.1_000513_00005057.nxpickled'
    # new_d_RINs_file='./mpi_thesis_test_rsr_v3.2_plusMfiltering_000513_00005081.nxpickled'
    # new_d_RINs_file='./CaRNAval_2.0_soft_cleaning_no_duplicates_alt_000557_00007709.nxpickled'
    # new_d_RINs_file='./CaRNAval_2.0_raw_000660_00007916.nxpickled'
    # new_d_RINs_file='./CaRNAval_2.0_soft_cleaning_no_duplicates_alt_000557_00007709.nxpickled'
    new_d_RINs_file = './CaRNAval_2.0_raw_000660_00007916.nxpickled'
    new_d_RINs_file = './Local2020_002185_00015042.nxpickled'
    RNA_graphs_file = '../data/graphs_2.92_nx3_with_SSEs.pickle'
    # PATH_output_file_prefixe='./CaRNAval_2.0_scwa_'
    # PATH_output_file_prefixe='./CaRNAval_2.0_TEST660_'
    PATH_output_file_prefixe = './Local2020_v1_'
    PATH_output_file_suffixe = '.nxpickled'

    print("Cleaning the new collection:", new_d_RINs_file)
    print("Old collection to use as reference for IDs:", old_d_RINS_file)

    old_d_RINs = shared_functions.load_data(old_d_RINS_file)
    new_d_RINs = shared_functions.load_data(new_d_RINs_file)
    data = shared_functions.load_data(RNA_graphs_file)

    # First, let's remove the mode duplicates
    print('\t', '1/4 Removing mode duplicates')
    new_d_RINs = RIN_management.merge_mode_duplications(new_d_RINs)
    # print(RIN_management.count_RINs_and_occurrences_in_dict(new_d_RINs))
    # sys.exit(0)

    # Second, let's fill the gaps in the collection
    print('\t', '2/4 Completing the collection')
    new_d_RINs = processing.clean_collection(new_d_RINs)

    # Third, let's set the IDs so they respect the old ones
    print('\t', '3/4 Fixing RIN IDs')
    # new_d_RINs=RIN_management.set_RIN_IDs_from_old_collection(old_d_RINs,new_d_RINs)
    new_d_RINs = RIN_management.set_RIN_IDs_from_ID_x(new_d_RINs, 1)

    # Fourth, let's compute the SSE distributions
    print('\t', '4/4 Computing SSE distributions for each new RIN')
    for RIN in RIN_management.d_RINs_to_sorted_list(new_d_RINs):
        tmp = RIN.get_SSEs_distrib(data)

    # Finally, let's dump the result
    nb_RINs, nb_occurrences = RIN_management.count_RINs_and_occurrences_in_dict(new_d_RINs)
    PATH_output_file = (PATH_output_file_prefixe + str(nb_RINs).zfill(6) + '_'
                        + str(nb_occurrences).zfill(8) + PATH_output_file_suffixe)
    shared_functions.dump_data(new_d_RINs, PATH_output_file)
    print('Output written in:', PATH_output_file)

# JUST IN CASE
else:
    print("Unknown mode")
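Note on the output file name built at the end of mode 1: the RIN count is zero-padded to 6 digits and the occurrence count to 8, and both sit between the configured prefix and suffix. A minimal sketch of that naming scheme (the counts are illustrative, echoing the input file name rather than recomputed):

    # hypothetical counts, only to illustrate the naming scheme used above
    nb_RINs, nb_occurrences = 2185, 15042
    name = './Local2020_v1_' + str(nb_RINs).zfill(6) + '_' + str(nb_occurrences).zfill(8) + '.nxpickled'
    print(name)  # -> ./Local2020_v1_002185_00015042.nxpickled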
SRC/RIN_management.py (new file, 0 → 100644)
# external imports
import networkx as nx
import queue
# internal imports
import shared_functions
import RIN
import matching
import isomorphism
import subgraph
import json
# import draw_graph


def edge_composition_compatibility(RIN_1, RIN_2):
    for category, count in RIN_1.d_edges_by_category.items():
        if RIN_2.d_edges_by_category.get(category, 0) < count:
            return False
    return True


def are_isomorfic(RIN_1, RIN_2):
    if RIN_1.order == RIN_2.order and RIN_1.size == RIN_2.size:
        if RIN_1.secondary_key == RIN_2.secondary_key:
            return isomorphism.isomorphism(RIN_1.graph, RIN_2.graph)
    # if either key differs, there is no way the two are isomorphic
    return (False, None)


def is_subgraph(RIN_1, RIN_2):
    # is RIN_1 a subgraph of RIN_2?
    if RIN_1.order <= RIN_2.order and RIN_1.size < RIN_2.size:
        if edge_composition_compatibility(RIN_1, RIN_2):
            return subgraph.subgraph(RIN_1.graph, RIN_2.graph)
    return (False, None)


def compare_RNA_modes(name_1, name_2):
    # We want to prevent graphs with a mode from reaching the front,
    # so we tweak the ordering in their disfavour.
    PDB_ID_1, chain_1, mode_1 = name_1
    PDB_ID_2, chain_2, mode_2 = name_2
    if (not mode_1) == (not mode_2):
        # Case 1: both have a mode, or neither does
        # just go with the 'lower' one
        return (name_1 < name_2)
    else:
        # Case 2: exactly one has no mode
        # we pick that one (equivalent to (not mode_1))
        return (not mode_1)
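# Illustration of the ordering above (hypothetical (PDB_ID, chain, mode) names,
# not taken from the data set; an empty string counts as "no mode" here):
#   compare_RNA_modes(('1ABC', 'A', ''), ('1ABC', 'A', 'some_mode'))  -> True  (the un-moded occurrence wins)
#   compare_RNA_modes(('1ABC', 'A', ''), ('2XYZ', 'B', ''))           -> True  (neither has a mode: plain tuple order)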
def merge_2_RINs(RIN_1, RIN_2, m):
    if RIN_1.representing_occurrence[0] == RIN_2.representing_occurrence[0]:
        if RIN_1.representing_occurrence[1] < RIN_2.representing_occurrence[1]:
            RIN_to_return = RIN_1
            RIN_to_merge = RIN_2
            d_nodes = m.d_h
        else:
            RIN_to_return = RIN_2
            RIN_to_merge = RIN_1
            d_nodes = m.d_g
    else:
        if compare_RNA_modes(RIN_1.representing_occurrence[0], RIN_2.representing_occurrence[0]):
            RIN_to_return = RIN_1
            RIN_to_merge = RIN_2
            d_nodes = m.d_h
        else:
            RIN_to_return = RIN_2
            RIN_to_merge = RIN_1
            d_nodes = m.d_g
    for graph, d_occurrences in RIN_to_merge.d_occurrences.items():
        for key, occurrence in d_occurrences.items():
            if RIN_to_return.d_occurrences.get(graph, None) != None:
                if RIN_to_return.d_occurrences[graph].get(key, None) != None:
                    continue
            else:
                RIN_to_return.d_occurrences[graph] = {}
            # Now <graph> is in d_occurrences but not <graph><key>
            updated_occurrence = {}
            for node_in_RIN_to_merge, node_in_graph in occurrence.items():
                updated_occurrence[d_nodes[node_in_RIN_to_merge]] = node_in_graph
            RIN_to_return.d_occurrences[graph][key] = updated_occurrence
            RIN_to_return.nb_occurrences += 1
    return RIN_to_return


def merge_protoRINs(d_protoRINs):
    d_RINs = {}
    for primary_key, d_prime in d_protoRINs.items():
        d_RINs[primary_key] = {}
        for secondary_key, l_second in d_prime.items():
            d_RINs[primary_key][secondary_key] = []
            fifo = queue.Queue()
            for pRIN in l_second:
                fifo.put(pRIN)
            while not fifo.empty():
                pRIN_1 = fifo.get()
                next_fifo = queue.Queue()
                while not fifo.empty():
                    pRIN_2 = fifo.get()
                    are_isomorphic, matching = are_isomorfic(pRIN_1, pRIN_2)
                    if are_isomorphic:
                        pRIN_1 = merge_2_RINs(pRIN_1, pRIN_2, matching)
                    else:
                        next_fifo.put(pRIN_2)
                d_RINs[primary_key][secondary_key].append(pRIN_1)
                fifo = next_fifo
    return d_RINs
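# Note on the loop above: each outer pass pops one proto-RIN, merges every
# isomorphic proto-RIN of the bucket into it, and requeues the rest, so the
# number of passes equals the number of distinct RINs kept for that
# (primary_key, secondary_key) bucket.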
def merge_d_RINs(d_RINs_1, d_RINs_2):
    # We are going to add the content of d_RINs_2 to d_RINs_1
    # First, check for common primary_key/secondary_key
    # For any common pair of keys, merge
    for primary_key, d_prime in d_RINs_1.items():
        if d_RINs_2.get(primary_key, None) != None:
            # primary_key is in d_RINs_2
            for secondary_key, l_second in d_prime.items():
                if d_RINs_2[primary_key].get(secondary_key, None) != None:
                    l_RINs_1 = d_RINs_1[primary_key][secondary_key]
                    l_RINs_2 = d_RINs_2[primary_key][secondary_key]
                    d_RINs_1[primary_key][secondary_key] = []
                    d_invalid_RIN_2 = {}
                    for RIN_1 in l_RINs_1:
                        for RIN_2 in l_RINs_2:
                            if not d_invalid_RIN_2.get(RIN_2, False):
                                are_isomorphic, matching = are_isomorfic(RIN_1, RIN_2)
                                if are_isomorphic:
                                    RIN_1 = merge_2_RINs(RIN_1, RIN_2, matching)
                                    d_invalid_RIN_2[RIN_2] = True
                                    break
                                    # 2 RINs of l_RINs_[1|2] can't be isomorphic so there's no point looking further
                        d_RINs_1[primary_key][secondary_key].append(RIN_1)
                    for RIN_2 in l_RINs_2:
                        if not d_invalid_RIN_2.get(RIN_2, False):
                            d_RINs_1[primary_key][secondary_key].append(RIN_2)
    # now we copy the RINs specific to d_RINs_2
    for primary_key, d_prime in d_RINs_2.items():
        if d_RINs_1.get(primary_key, None) == None:
            d_RINs_1[primary_key] = {}
        for secondary_key, l_second in d_prime.items():
            if d_RINs_1[primary_key].get(secondary_key, None) == None:
                d_RINs_1[primary_key][secondary_key] = []
                for RIN_2 in l_second:
                    d_RINs_1[primary_key][secondary_key].append(RIN_2)
    return d_RINs_1
def merge_mode_duplications(d_RINs_1):
    # the method used to cover exceptions may generate duplicates
    # this function removes such duplicates from the collection d_RINs_1
    # for each RIN in the collection
    for RIN in d_RINs_to_sorted_list(d_RINs_1):
        # not efficient but easier to read
        # first we transform the collection of occurrences
        d_RNA_occ_mode = {}
        for key, d_occ in RIN.d_occurrences.items():
            pdb_ID, chain, mode = key
            if not d_RNA_occ_mode.get((pdb_ID, chain), False):
                d_RNA_occ_mode[(pdb_ID, chain)] = {}
            for key_occ, map in d_occ.items():
                better_key_occ = []
                # better_key_occ uses the order in the representative instead of the
                # lexicographic order on the node names of the occurrence
                for n in sorted(map.keys()):
                    better_key_occ.append(map[n])
                better_key_occ = json.dumps(better_key_occ)
                if not d_RNA_occ_mode[(pdb_ID, chain)].get(better_key_occ, False):
                    d_RNA_occ_mode[(pdb_ID, chain)][better_key_occ] = {}
                d_RNA_occ_mode[(pdb_ID, chain)][better_key_occ][mode] = key_occ
        # second we identify potential duplicates
        nb_occ_removed = 0
        keys_to_check = {}
        for (pdb_ID, chain), d_second in d_RNA_occ_mode.items():
            for better_key_occ, d_modes in d_second.items():
                if len(d_modes) > 1:
                    # there are duplicates
                    # we keep the least modified one (which is the first mode in the
                    # lexicographical order, thanks to the way we designed modes)
                    i = 0  # to skip the first step
                    for mode in sorted(d_modes.keys()):
                        if i > 0:
                            key_occ = d_modes[mode]
                            del RIN.d_occurrences[pdb_ID, chain, mode][key_occ]
                            nb_occ_removed += 1
                            keys_to_check[(pdb_ID, chain, mode)] = True
                        i += 1
        for key in keys_to_check.keys():
            if RIN.d_occurrences[key] == {}:
                del RIN.d_occurrences[key]
        RIN.nb_occurrences = RIN.nb_occurrences - nb_occ_removed
    return d_RINs_1
def count_RINs_and_occurrences_in_dict(d_RINs_1):
    nb_RINs = 0
    nb_occurrences = 0
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            for RIN_1 in l_second:
                nb_RINs += 1
                nb_occurrences += RIN_1.nb_occurrences
    return (nb_RINs, nb_occurrences)


def count_RINs_in_dict(d_RINs_1):
    count = 0
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            count += len(l_second)
    return count


def count_occurrences_in_dict(d_RINs_1):
    count = 0
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            for RIN in l_second:
                count += RIN.nb_occurrences
    return count


def d_RINs_to_sorted_list(d_RINs_1):
    l_RINs = []
    for primary_key in sorted(d_RINs_1.keys()):
        for secondary_key in sorted(d_RINs_1[primary_key].keys()):
            l_RINs.extend(d_RINs_1[primary_key][secondary_key])
    return l_RINs
def d_RINs_to_d_ID(d_RINs_1):
    d_ID = {}
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            for RIN in l_second:
                d_ID[RIN.ID] = RIN
    return d_ID


def set_RIN_IDs_from_ID_x(new_d_RINs, x):
    # x is included (i.e. a new RIN may receive x as its ID)
    # We go through the new collection and set all missing IDs
    # NOTE: the default ID is -1
    l_new_RINs = d_RINs_to_sorted_list(new_d_RINs)
    for new_RIN in l_new_RINs:
        if new_RIN.ID < 0:
            new_RIN.ID = x
            x += 1
    return new_d_RINs
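# Usage sketch (mirrors the call made in Clean_new_collection.py): freshly built
# RINs keep the default ID of -1, so set_RIN_IDs_from_ID_x(new_d_RINs, 1) numbers
# them 1, 2, 3, ... in the deterministic order produced by d_RINs_to_sorted_list.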
def set_RIN_IDs_from_old_collection(old_d_RINs, new_d_RINs):
    # First we transmit the IDs of the old collection to the new one
    for primary_key, d_prime in old_d_RINs.items():
        if new_d_RINs.get(primary_key, False):
            # primary_key is in new_d_RINs
            for secondary_key, old_l_second in d_prime.items():
                if new_d_RINs[primary_key].get(secondary_key, False):
                    # secondary_key is also in new_d_RINs
                    new_l_second = new_d_RINs[primary_key][secondary_key]
                    for old_RIN in old_l_second:
                        found = False
                        for new_RIN in new_l_second:
                            found = are_isomorfic(old_RIN, new_RIN)[0]
                            if found:
                                new_RIN.ID = old_RIN.ID
                                break
                        # if found:
                        #     break
    # Now the only new RINs without an ID are truly new RINs
    # First we need to find the last index of the old collection
    l_old_RINs = d_RINs_to_sorted_list(old_d_RINs)
    max_ID = -1
    for old_RIN in l_old_RINs:
        if old_RIN.ID > max_ID:
            max_ID = old_RIN.ID
    # Now we go through the new collection and set all missing IDs using set_RIN_IDs_from_ID_x
    return set_RIN_IDs_from_ID_x(new_d_RINs, max_ID + 1)
###########Test tools##########################################################
def d_RINs_1_included_in_d_RINs_2(d_RINs_1, d_RINs_2):
    # compare RINs, not their occurrences
    l = []
    for primary_key, d_prime in d_RINs_1.items():
        if d_RINs_2.get(primary_key, None) == None:
            return False
        else:
            for secondary_key, l_second in d_prime.items():
                if d_RINs_2[primary_key].get(secondary_key, None) == None:
                    return False
                else:
                    for RIN_1 in l_second:
                        missing = True
                        for RIN_2 in d_RINs_2[primary_key][secondary_key]:
                            if are_isomorfic(RIN_1, RIN_2)[0]:
                                missing = False
                                break
                        if missing:
                            return False
    return True
def diff_d_RINs_1_minus_RINs_2(d_RINs_1, d_RINs_2):
    # we assume d_RINs_2 is included in d_RINs_1
    d_3 = {}
    for primary_key, d_prime in d_RINs_1.items():
        if d_RINs_2.get(primary_key, None) == None:
            d_3[primary_key] = d_prime
        else:
            for secondary_key, l_second in d_prime.items():
                if d_RINs_2[primary_key].get(secondary_key, None) == None:
                    tmp = d_3.get(primary_key, {})
                    tmp[secondary_key] = l_second
                    d_3[primary_key] = tmp
                else:
                    for RIN_1 in l_second:
                        missing = True
                        for RIN_2 in d_RINs_2[primary_key][secondary_key]:
                            if are_isomorfic(RIN_1, RIN_2)[0]:
                                missing = False
                                break
                        if missing:
                            tmp_1 = d_3.get(primary_key, {})
                            tmp_2 = tmp_1.get(secondary_key, [])
                            tmp_2.append(RIN_1)
                            tmp_1[secondary_key] = tmp_2
                            d_3[primary_key] = tmp_1
    return d_3
def count_duplicates_in_d_RINs_1(d_RINs_1):
    n = 0
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            for RIN_1 in l_second:
                unique = True
                for RIN_2 in l_second:
                    if RIN_1 != RIN_2:
                        if are_isomorfic(RIN_1, RIN_2)[0]:
                            unique = False
                            break
                if not unique:
                    n += 1
    return n
def get_collection_occurrence_graph(d_RINs_1, data):
    l = []
    for primary_key, d_prime in d_RINs_1.items():
        for secondary_key, l_second in d_prime.items():
            for RIN_1 in l_second:
                l_g = RIN_1.create_occurrence_graphs(data)
                l.append((RIN_1, l_g))
    return l
################Hierarchy construction ########################################
def build_hierarchy(d_RINs):
    hierarchy = {}
    return hierarchy
SRC/isomorphism.py (new file, 0 → 100644)
# external imports
import sys
import json
# to hash the data of edges (dictionaries)
import queue
# internal imports
import matching
import shared_functions

# Convention:
# G, H are the two graphs received as inputs
# g, h are generic graphs

# Algorithm testing if two graphs G and H admitting a proper edge-coloring are isomorphic


def elligible_neighbours(d_n_g, d_n_h):
    # tested
    # given two matched nodes n_g and n_h, resp. from g and h,
    # return the list of pairs of nodes from their neighbourhoods that may be
    # matched together according to the edge categories
    common_categories = list(set(d_n_g.keys()) & set(d_n_h.keys()))
    if len(common_categories) != len(d_n_g.keys()) or len(common_categories) != len(d_n_h.keys()):
        # stop condition
        return None
    l_candidates = []
    for category in common_categories:
        l_candidates.append((d_n_g[category][0], d_n_h[category][0]))
    # TODO deal with triangle double interaction
    return l_candidates
# Used in the "graph matching" algorithms (list of maximal sub-isomorphism / isomorphism / sub-isomorphism)
def get_single_dict_of_starting_edges_least_populated(g, f, main=False):
    list_of_not_oriented_labels = ['CWW', 'TWW', 'CSS', 'TSS', 'CHH', 'THH']
    d_edges_by_category = {}
    for edge in g.edges():
        # if g.edges[edge]['long_range']:
        category = json.dumps(f(g.edges[edge]), sort_keys=True)
        d_edges_by_category[category] = d_edges_by_category.get(category, [])
        d_edges_by_category[category].append(edge)
    least_populated_category = None
    numbers = g.size() * 2 + 1
    for category in sorted(d_edges_by_category.keys()):
        list_of_edges = d_edges_by_category[category]
        label = g.edges[list_of_edges[0]]['label']
        if label in list_of_not_oriented_labels:
            if len(list_of_edges) * 2 < numbers:
                least_populated_category = category
                numbers = len(list_of_edges)
        elif len(list_of_edges) < numbers:
            least_populated_category = category
            numbers = len(list_of_edges)
    if main:
        label = g.edges[d_edges_by_category[least_populated_category][0]]['label']
        if label in list_of_not_oriented_labels:
            l_symetric_edges = []
            for edge in d_edges_by_category[least_populated_category]:
                l_symetric_edges.append((edge[1], edge[0]))
            d_edges_by_category[least_populated_category].extend(l_symetric_edges)
    return {least_populated_category: d_edges_by_category[least_populated_category]}
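# Illustration (hypothetical edge): when main=True and the selected category is
# non-oriented (e.g. a CWW interaction), an edge ('A', 'B') is also listed as
# ('B', 'A'), so the launcher below can try both node orderings as starting points.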
def get_dicts_of_starting_edges(g, h, d_options):
    # all:
    #   provide a dictionary {(category): edges} of edges eligible as starting points
    #   we distinguish oriented interactions such as TWS from non-oriented ones such as TWW
    #   this distinction comes from the fact that the nodes of two non-oriented edges (n1_g,n2_g) and (n1_h,n2_h)
    #   can be matched in two ways: ((n1_g,n1_h),(n2_g,n2_h)) and ((n2_g,n1_h),(n1_g,n2_h))
    #   to cover this we add (n1_h,n2_h), (n1_g,n2_g) and (n2_g,n1_g) to the output
    #   distinguishing g from h is covered by the optional 'main' parameter
    # least_populated:
    #   provide a dictionary {(category): edges} of edges eligible as starting points
    #   we are looking for the least populated category
    return (get_single_dict_of_starting_edges_least_populated(g, d_options['hashing_filter'], True),
            get_single_dict_of_starting_edges_least_populated(h, d_options['hashing_filter']))
def extender(d_g, d_h, matching, d_options):
    m = matching
    fifo_pair = queue.Queue()
    for pair in m.get_list_of_matched_pairs():
        fifo_pair.put(pair)
    while not fifo_pair.empty():
        n_g, n_h = fifo_pair.get()
        l_candidates = elligible_neighbours(d_g[n_g], d_h[n_h])
        if l_candidates == None:
            return (False, m)
        for pair in l_candidates:
            case, extra = m.test_candidate_pair(pair)
            if case == 'accepted':
                fifo_pair.put(pair)
            elif case == 'rejected':
                continue
            elif case == 'conflict':
                return (False, m)
    if m.get_size() == len(d_g) and m.get_size() == len(d_h):
        return (True, m)
    return (False, m)
def launcher(G, H, d_options):
    d_G = shared_functions.dict_from_graph(G, d_options['hashing_filter'])
    d_H = shared_functions.dict_from_graph(H, d_options['hashing_filter'])
    d_start_G, d_start_H = get_dicts_of_starting_edges(G, H, d_options)
    elligible_categories = list(set(d_start_G.keys()) & set(d_start_H.keys()))
    # print('1',elligible_categories,set(d_start_G.keys()),set(d_start_H.keys()))
    for category in elligible_categories:
        for e_G in d_start_G[category]:
            for e_H in d_start_H[category]:
                # print("launcher",category,e_G,e_H)
                n1_G, n2_G = e_G
                n1_H, n2_H = e_H
                pair1 = (n1_G, n1_H)
                pair2 = (n2_G, n2_H)
                success, m = extender(d_G, d_H, matching.matching((pair1, pair2)), d_options)
                if success:
                    return (True, m)
    return (False, None)
# Tests if G and H are isomorphic
def isomorphism(G, H):
    d_options = {}
    d_options['hashing_filter'] = shared_functions.identity
    return launcher(G, H, d_options)
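# Usage sketch (mirrors the call made from RIN_management.are_isomorfic; G and H
# are networkx graphs whose edge data carry the interaction labels):
#   success, m = isomorphism(RIN_1.graph, RIN_2.graph)
#   if success:
#       pass  # m.d_g / m.d_h hold the node correspondence between the two graphs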
SRC/main_mono.py (new file, 0 → 100644)
# external imports
import sys
import os.path
# WARNING: we import MPI (mpi4py) but we do it later in this code.
# It impacts the clarity of the code but gives us the opportunity to segregate the pre-processing.
# test
import networkx as nx

# internal imports
import shared_functions
from processing import pre_processing_raw_data as pre_processing_raw_data
from processing import post_processing_l_matchings as post_processing_l_matchings
import modular_maximal_common_subgraph as maximal_common_subgraph
import RIN_management

# configuration
data_folder = '../data'
raw_data_file = 'graphs_2.92_nx3_with_SSEs.pickle'
pre_processed_data_folder = '../pre_processed_data'
pre_processed_data_file = 'graphs_2.92_nx3_modular_distributed_2.pickle'
# pre_processed_data_file='graphs_2.92_nx3_modular_ROMAN.pickle'
# pre_processed_data_file='INCORRECT_graphs_2.92_nx3.pickle' #faster -> good for tests
# file_list_pdb_chain='./list_pdb_chain_2.92.nxpickled'
# PATH_output_file_prefixe='../mpi_thesis_full_modular_ROMAN_'
PATH_output_file_prefixe = '../caRNAval_2_fmd_van_1_'
PATH_output_file_suffixe = '.nxpickled'
PATH_test_output_file = './test_output.nxpickled'

# create paths from configuration
PATH_raw_data_file = data_folder + '/' + raw_data_file
PATH_pre_processed_data_file = pre_processed_data_folder + '/' + pre_processed_data_file
def make_data(data):
    # pseudo_data={('1Y27', 'X'):data[('1Y27', 'X')],('4RGE', 'B'):data[('4RGE', 'B')]}
    # pseudo_data={('4V88', 'A6'):data[('4V88', 'A6')],('3JCS', '1'):data[('3JCS', '1')]}
    # pseudo_data={('5J7L', 'DA'):data[('5J7L', 'DA')],('5FDU', '1A'):data[('5FDU', '1A')]}
    # ('5J7L', 'DA') 1579
    # ('5FDU', '1A') 1556
    # ('4V88', 'A5') 1344
    # ('5DM6', 'X') 1343
    # ('1FJG', 'A') 771
    # ('5J5B', 'BA') 740
    # ('4V88', 'A6') 680
    # ('3JCS', '1') 417
    # data={('4V9F', '0', ''):data[('4V9F', '0', '')],('5FDU', '1A', '456+'):data[('5FDU', '1A', '456+')]}
    # data={('1L2X', 'A', ''):data[('1L2X', 'A', '')],('3FU2', 'B', ''):data[('3FU2', 'B', '')]}
    # we should sort so that the big pairs come first and the small ones come last
    list_of_tasks = []
    for (g_PDB_ID, g_chain, g_format, g_wcc_ID), g in data.items():
        for (h_PDB_ID, h_chain, h_format, h_wcc_ID), h in data.items():
            if (g_PDB_ID, g_chain) < (h_PDB_ID, h_chain):
                list_of_tasks.append(((g.size() * h.size()),
                                      ((g_PDB_ID, g_chain, g_format, g_wcc_ID),
                                       (h_PDB_ID, h_chain, h_format, h_wcc_ID))))
    for key, task in sorted(list_of_tasks, key=lambda x: x[0], reverse=True):
        yield (task)
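# Toy illustration (made-up keys and graphs, not real data): tasks are yielded
# largest size product first, one per ordered (PDB_ID, chain) pair.
#   g1 = nx.Graph([(1, 2), (2, 3)]); g2 = nx.Graph([(1, 2)])
#   data = {('1AAA', 'A', '', 0): g1, ('2BBB', 'B', '', 0): g2}
#   list(make_data(data))  # -> [(('1AAA', 'A', '', 0), ('2BBB', 'B', '', 0))]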
# Do we need to pre-process the data ?