Commit 7521a1d3 authored by Roman Sarrazin-Gendron

added clustal

parent 59917ecd
#external libraries
import json #to hash data of edges (dictionaries)
class RIN:
graph=None
order=0
size=0
primary_key=""
canonical=False
ID=-1
d_edges_by_category={}
secondary_key=""
nb_occurences=0
d_occurences={}
representing_occurence=None
def __init__(self,graph,d_edges_by_category,g_name,h_name,d_generic_nodes_to_nodes_in_g,d_generic_nodes_to_nodes_in_h):
self.graph=graph
self.order=self.graph.order()
self.size=self.graph.size()
self.primary_key='o'+str(self.order).zfill(6)+'s'+str(self.size).zfill(6) #zfill so primary keys are ordered
self.canonical=False
self.ID=-1
self.d_edges_by_category=d_edges_by_category
self.secondary_key=json.dumps(d_edges_by_category, sort_keys=True)
self.d_occurences={}
g_occurence_key=json.dumps(sorted(d_generic_nodes_to_nodes_in_g.values()))
self.representing_occurence=(g_name,g_occurence_key)
self.d_occurences[g_name]={g_occurence_key:d_generic_nodes_to_nodes_in_g}
h_occurence_key=json.dumps(sorted(d_generic_nodes_to_nodes_in_h.values()))
self.d_occurences[h_name]={h_occurence_key:d_generic_nodes_to_nodes_in_h}
self.nb_occurences=2
def make_canonical(self,ID):
self.canonical=True
self.ID=ID
def get_keys(self):
return (self.primary_key,self.secondary_key)
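# Illustrative sketch (hypothetical values): a RIN built on a graph with 5 nodes and 12 edges
# gets primary_key 'o000005s000012'; the zero-padding keeps lexicographic order consistent with
# numeric order, so sorted primary keys enumerate RINs by increasing order then size.
# The secondary_key is the sorted JSON dump of d_edges_by_category, i.e. a stable hash of the
# edge-category composition: two RINs can only be isomorphic if both keys coincide.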
# external imports
import networkx as nx
import queue
# internal imports
import shared_functions
import RIN
import matching
import core
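# edge_composition_compatibility checks multiset inclusion of edge categories:
# every category of RIN_1 must appear in RIN_2 at least as many times. It is used as a cheap
# necessary condition (see preliminary_test_inclusion below) before any isomorphism test.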
def edge_composition_compatibility(RIN_1,RIN_2):
for category,count in RIN_1.d_edges_by_category.items():
if RIN_2.d_edges_by_category.get(category,0) < count:
return False
return True
def are_isomorfic(RIN_1,RIN_2):
if RIN_1.order == RIN_2.order and RIN_1.size == RIN_2.size:
if RIN_1.secondary_key == RIN_2.secondary_key:
return core.test_isomorphism(RIN_1.graph,RIN_2.graph)
#core.test_isomorphism returns (are_isomorphic,matching)
return (False,None)
def merge_2_RINs(RIN_1,RIN_2,m):
if RIN_1.representing_occurence <= RIN_2.representing_occurence:
RIN_to_return=RIN_1
RIN_to_merge=RIN_2
d_nodes=m.d_h
else:
RIN_to_return=RIN_2
RIN_to_merge=RIN_1
d_nodes=m.d_g
for graph,d_occurences in RIN_to_merge.d_occurences.items():
for key,occurence in d_occurences.items():
if RIN_to_return.d_occurences.get(graph,None) != None:
if RIN_to_return.d_occurences[graph].get(key,None) != None:
continue
else:
RIN_to_return.d_occurences[graph]={}
#Now <graph> is in d_occurences but <key> is not yet recorded for <graph>
updated_occurence={}
for node_in_RIN_to_merge,node_in_graph in occurence.items():
updated_occurence[d_nodes[node_in_RIN_to_merge]]=node_in_graph
RIN_to_return.d_occurences[graph][key]=updated_occurence
RIN_to_return.nb_occurences+=1
return RIN_to_return
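# merge_protoRINs (sketch of the loop below): within each (primary_key,secondary_key) bucket,
# pop one proto-RIN from the FIFO, merge into it every remaining proto-RIN found isomorphic
# to it (accumulating their occurrences), push the non-isomorphic ones into the next FIFO,
# and repeat until the queue is empty; each round yields one representative per isomorphism class.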
def merge_protoRINs(d_protoRINs):
d_RINs={}
for primary_key,d_prime in d_protoRINs.items():
d_RINs[primary_key]={}
for secondary_key,l_second in d_prime.items():
d_RINs[primary_key][secondary_key]=[]
fifo=queue.Queue()
for pRIN in l_second:
fifo.put(pRIN)
while not fifo.empty():
pRIN_1=fifo.get()
next_fifo=queue.Queue()
while not fifo.empty():
pRIN_2=fifo.get()
are_isomorphic,matching=are_isomorfic(pRIN_1,pRIN_2)
if are_isomorphic:
pRIN_1=merge_2_RINs(pRIN_1,pRIN_2,matching)
else:
next_fifo.put(pRIN_2)
d_RINs[primary_key][secondary_key].append(pRIN_1)
fifo=next_fifo
return d_RINs
def merge_d_RINs(d_RINs_1,d_RINs_2):
#We are going to add the content of d_RINs_2 to d_RINs_1
#First, check for common primary_key/secondary_key
#For any common pair, merge
for primary_key,d_prime in d_RINs_1.items():
if d_RINs_2.get(primary_key,None)!= None:
for secondary_key,l_second in d_prime.items():
if d_RINs_2[primary_key].get(secondary_key,None)!= None:
l_RINs_1=d_RINs_1[primary_key][secondary_key]
l_RINs_2=d_RINs_2[primary_key][secondary_key]
d_RINs_1[primary_key][secondary_key]=[]
d_invalid_RIN_2={}
for RIN_1 in l_RINs_1:
for RIN_2 in l_RINs_2:
if not d_invalid_RIN_2.get(RIN_2,False):
are_isomorphic,matching=are_isomorfic(RIN_1,RIN_2)
if are_isomorphic:
RIN_1=merge_2_RINs(RIN_1,RIN_2,matching)
is_valid=False
d_invalid_RIN_2[RIN_2]=True
break
d_RINs_1[primary_key][secondary_key].append(RIN_1)
for RIN_2 in l_RINs_2:
if not d_invalid_RIN_2.get(RIN_2,False):
d_RINs_1[primary_key][secondary_key].append(RIN_2)
#now we copy the RINs specific to d_RINs_2
for primary_key,d_prime in d_RINs_2.items():
if d_RINs_1.get(primary_key,None)== None:
d_RINs_1[primary_key]={}
for secondary_key,l_second in d_prime.items():
if d_RINs_1[primary_key].get(secondary_key,None)== None:
d_RINs_1[primary_key][secondary_key]=[]
for RIN_2 in l_second:
d_RINs_1[primary_key][secondary_key].append(RIN_2)
return d_RINs_1
def count_RINs_in_dict(d_RINs_1):
count=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
count+=len(l_second)
return count
def d_RINs_to_sorted_list(d_RINs_1):
l_RINs=[]
for primary_key in sorted(d_RINs_1.keys()):
for secondary_key in sorted(d_RINs_1[primary_key].keys()):
l_RINs.extend(d_RINs_1[primary_key][secondary_key])
return l_RINs
###########Test tools##########################################################
def d_RINs_1_included_in_d_RINs_2(d_RINs_1,d_RINs_2):
l=[]
for primary_key,d_prime in d_RINs_1.items():
if d_RINs_2.get(primary_key,None)== None:
return False
else:
for secondary_key,l_second in d_prime.items():
if d_RINs_2[primary_key].get(secondary_key,None)== None:
return False
else:
for RIN_1 in l_second:
missing=True
for RIN_2 in d_RINs_2[primary_key][secondary_key]:
if are_isomorfic(RIN_1,RIN_2)[0]:
missing=False
break
if missing:
return False
return True
def count_duplicates_in_d_RINs_1(d_RINs_1):
n=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN_1 in l_second:
unique=True
for RIN_2 in l_second:
if RIN_1 != RIN_2:
if are_isomorfic(RIN_1,RIN_2)[0]:
unique=False
break
if not unique:
n+=1
return n
###########Hierarchy construction tools########################################
def preliminary_test_inclusion(RIN_1,RIN_2):
if RIN_1.order <= RIN_2.order and RIN_1.size < RIN_2.size:
if edge_composition_compatibility(RIN_1,RIN_2):
return True
return False
def build_hierarchy(d_RINs):
hierarchy={}
return hierarchy
#external imports
import sys
import json #to hash data of edges (dictionaries)
import queue
# internal imports
import matching
import shared_functions
# Convention :
# G,H are the two graphs received as inputs
# g(,h) are generic graphs
def elligible_neighbours_search_RINs(d_n_g,d_n_h):#tested
#given two matched nodes n_g and n_h resp. from g and h
#return the list of pairs of nodes from their neighbourhoods that may be matched together according to edge categories
common_categories=list(set(d_n_g.keys()) & set(d_n_h.keys()))
l_candidates=[]
backbone=True
# backbone=False
for category in common_categories:
if not ('"label": "B53"' in category or '"label": "B35"' in category):
backbone=False
l_candidates.append((d_n_g[category][0],d_n_h[category][0]))#TODO deal with triangle double interaction
if not backbone:
return l_candidates
else:
return []
def elligible_neighbours_test_isomorphism(d_n_g,d_n_h):#tested
#given two matched nodes n_g and n_h resp. from g and h
#return the list of pairs of nodes from their neighbourhoods that may be matched together according to edge categories
common_categories=list(set(d_n_g.keys()) & set(d_n_h.keys()))
if len(common_categories)!=len(d_n_g.keys()) or len(common_categories)!=len(d_n_h.keys()):
return None
l_candidates=[]
for category in common_categories:
l_candidates.append((d_n_g[category][0],d_n_h[category][0]))#TODO deal with triangle double interaction
return l_candidates
def elligible_neighbours_test_sub_isomorphism(d_n_g,d_n_h):#tested
#given two matched nodes n_g and n_h resp. from g and h
#return the list of pairs of nodes from their neighbourhoods that may be matched together according to edge categories
common_categories=list(set(d_n_g.keys()) & set(d_n_h.keys()))
if len(common_categories)!=len(d_n_g.keys()): #for sub-isomorphism every edge category around the node of g must also appear around the node of h
return None
l_candidates=[]
for category in common_categories:
l_candidates.append((d_n_g[category][0],d_n_h[category][0]))#TODO deal with triangle double interaction
return l_candidates
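# Summary of the three elligible_neighbours_* variants above, as derived from their tests:
# - search_RINs: keeps candidate pairs from the shared edge categories, but returns no candidate
#   when the only shared interactions are backbone edges (B53/B35)
# - test_isomorphism: requires the two nodes to carry exactly the same set of edge categories
# - test_sub_isomorphism: requires every category around the node of g to also appear around
#   the node of h (h may carry extra categories)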
def get_single_dict_of_starting_edges_all(g,f,main=False):#tested
list_of_not_oriented_labels=['CWW','TWW','CSS','TSS','CHH','THH']
#this function is to be adapted to the current needs
#current : dict of all long_range==True edges in g, grouped by category (the category string is the key: category->list_of_edges)
d_label={}
for edge in g.edges():
if g.edges[edge]['long_range']:
data=g.edges[edge]
category=json.dumps(f(g.edges[edge]), sort_keys=True)#TODO: add f
not_oriented=(g.edges[edge]['label'] in list_of_not_oriented_labels)
if not_oriented:
if edge[0] < edge[1]:
d_label[category]=d_label.get(category,[])
d_label[category].append(edge)
if main:
d_label[category].append((edge[1],edge[0]))
else:
d_label[category]=d_label.get(category,[])
d_label[category].append(edge)
return d_label
#Used in the "graph matching" algorithms (list of maximal sub-isomorphism / isomorphism / sub-isomorphism)
def get_single_dict_of_starting_edges_least_populated(g,f,main=False):
list_of_not_oriented_labels=['CWW','TWW','CSS','TSS','CHH','THH']
d_edges_by_category={}
for edge in g.edges():
category=json.dumps(f(g.edges[edge]), sort_keys=True)
d_edges_by_category[category]=d_edges_by_category.get(category,[])
d_edges_by_category[category].append(edge)
least_populated_category=None
numbers=g.size()*2+1
for category in sorted(d_edges_by_category.keys()):
list_of_edges=d_edges_by_category[category]
label=g.edges[list_of_edges[0]]['label']
if label in list_of_not_oriented_labels:
if len(list_of_edges)*2 < numbers:
least_populated_category=category
numbers=len(list_of_edges)*2
elif len(list_of_edges) < numbers:
least_populated_category=category
numbers=len(list_of_edges)
if main:
label=g.edges[d_edges_by_category[least_populated_category][0]]['label']
if label in list_of_not_oriented_labels:
l_symetric_edges=[]
for edge in d_edges_by_category[least_populated_category]:
l_symetric_edges.append((edge[1],edge[0]))
d_edges_by_category[least_populated_category].extend(l_symetric_edges)
return {least_populated_category:d_edges_by_category[least_populated_category]}
def get_dicts_of_starting_edges(g,h,d_options):
#all
#provide a dictionary {category: list of edges} of edges eligible as starting points
#we distinguish oriented interactions such as TWS from non-oriented ones such as TWW
#this distinction comes from the fact that the nodes of two non-oriented edges (n1_g,n2_g) and (n1_h,n2_h)
# can be matched in two ways: ((n1_g,n1_h),(n2_g,n2_h)) and ((n2_g,n1_h),(n1_g,n2_h))
# to cover this we add (n1_h,n2_h), (n1_g,n2_g) and (n2_g,n1_g) to the output
# distinguishing g from h is covered by the optional 'main' parameter
#least_populated
# provide a dictionary {category: list of edges} of edges eligible as starting points
# we look for the least populated category
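#example of the shape of one returned dict (hypothetical, category string shortened):
# {'{"label": "CWW", ...}': [(n1_g,n2_g),(n2_g,n1_g),...]}
# the reversed edges are only added for the 'main' graph, which is enough to cover both
# possible pairings of a non-oriented edge without duplicating them on the other side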
get_single_dict_of_starting_edges=d_options['get_dicts_of_starting_edges']
return (get_single_dict_of_starting_edges(g,d_options['hashing_filter'],True),get_single_dict_of_starting_edges(h,d_options['hashing_filter']))
def extension_blocked_by_blacklist(d_g,d_h,matching,d_options):
for n_g,n_h in matching.get_list_of_matched_pairs():
for pair in d_options['elligible_neighbours'](d_g[n_g],d_h[n_h]):
if matching.pair_is_elligible_but_blacklisted(pair):
return True
return False
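#extender: depth-first exploration of branched matchings (sketch of the logic below)
#a LIFO holds matchings to extend; for each matching, a FIFO of its matched pairs is processed
#and candidate neighbour pairs are tested: 'accepted' pairs are enqueued for further extension,
#'rejected' pairs are skipped, and a 'conflict' either aborts (exit_on_conflict) or is recorded
#with backtracking, each maximal matching not blocked by its blacklist is kept as a result and
#the recorded conflicts seed new branched matchings pushed back onto the LIFO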
def extender(d_g,d_h,matching,d_options):
lifo_branching=queue.LifoQueue()
lifo_branching.put(matching)
l_results=[]
while not lifo_branching.empty():
m=lifo_branching.get()
branch_original_dicts=m.get_copies_of_white_and_black_dicts()
list_conflicts=[]
fifo_pair=queue.Queue()
for pair in m.get_list_of_matched_pairs():
fifo_pair.put(pair)
while not fifo_pair.empty():
n_g,n_h=fifo_pair.get()
l_candidates=d_options['elligible_neighbours'](d_g[n_g],d_h[n_h])
if l_candidates==None:
return (False,m)
for pair in l_candidates:
n_g,n_h=pair
if d_options['elligible_neighbours'](d_g[n_g],d_h[n_h]) != []:
case,extra=m.test_candidate_pair(pair)
if case == 'accepted':
fifo_pair.put(pair)
elif case == 'rejected':
continue
elif case == 'conflict':
if d_options['exit_on_conflict']:
return (False,m)
else:
list_conflicts.extend(extra)
if d_options['backtracking']:
if not extension_blocked_by_blacklist(d_g,d_h,m,d_options):
l_results.append(m)
#non_maximal_results DO generate interesting conflicts
for branched_matching in m.create_list_branched_matchings(branch_original_dicts,list_conflicts):
lifo_branching.put(branched_matching)
return d_options['return_format'](l_results,m,d_g,d_h)
def return_search_RINs(l_results,m,d_g,d_h):
return l_results
def return_test_isomorphism(l_results,m,d_g,d_h):
if m.get_size()==len(d_g) and m.get_size()==len(d_h):
return (True,m)
return (False,m)
def return_test_sub_isomorphism(l_results,m,d_g,d_h):
if m.get_size()==len(d_g):
return (True,m)
return (False,m)
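#launcher drives the three entry points below; d_options is expected to provide (as used here):
#'elligible_neighbours', 'get_dicts_of_starting_edges', 'hashing_filter', 'exit_on_conflict',
#'backtracking', 'return_format', 'intermediary_result_processing', 'final_result_processing'
#every pair of eligible starting edges seeds a two-pair matching that is handed to extender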
def launcher(G,H,d_options):
l_matchings=[]
d_G=shared_functions.dict_from_graph(G,d_options['hashing_filter'])
d_H=shared_functions.dict_from_graph(H,d_options['hashing_filter'])
d_start_G,d_start_H=get_dicts_of_starting_edges(G,H,d_options)
elligable_categories = list(set(d_start_G.keys()) & set(d_start_H.keys()))
for category in elligable_categories:
for e_G in d_start_G[category]:
for e_H in d_start_H[category]:
n1_G,n2_G=e_G
n1_H,n2_H=e_H
pair1=(n1_G,n1_H)
pair2=(n2_G,n2_H)
need_to_return,result=d_options['intermediary_result_processing'](extender(d_G,d_H,matching.matching((pair1,pair2)),d_options),l_matchings)
if need_to_return:
return result
return d_options['final_result_processing'](l_matchings)
def intermediary_result_processing_multiple(extender_output,l_matchings):
l_matchings.extend(extender_output)
return (False,None)
def intermediary_result_processing_single(extender_output,l_matchings):
success,m=extender_output
if success:
return (True,(success,m))
else:
return (False,None)
def final_result_processing_multiple(l_matchings):
return l_matchings
def final_result_processing_single(l_matchings):
return (False,None)
#Return all maximal common sub-isomorphisms between G and H
def search_RINs(G,H):
d_options={}
d_options['elligible_neighbours']=elligible_neighbours_search_RINs
d_options['get_dicts_of_starting_edges']=get_single_dict_of_starting_edges_all
d_options['hashing_filter']=shared_functions.identity
d_options['exit_on_conflict']=False
d_options['backtracking']=True
d_options['return_format']=return_search_RINs
d_options['intermediary_result_processing']=intermediary_result_processing_multiple
d_options['final_result_processing']=final_result_processing_multiple
return launcher(G,H,d_options)
#Tests if G and H are isomorphic
def test_isomorphism(G,H):
d_options={}
d_options['elligible_neighbours']=elligible_neighbours_test_isomorphism
d_options['get_dicts_of_starting_edges']=get_single_dict_of_starting_edges_least_populated
d_options['hashing_filter']=shared_functions.identity
d_options['exit_on_conflict']=True
d_options['backtracking']=False
d_options['return_format']=return_test_isomorphism
d_options['intermediary_result_processing']=intermediary_result_processing_single
d_options['final_result_processing']=final_result_processing_single
return launcher(G,H,d_options)
#Tests if G is isomorphic to a subgraph of H
def test_sub_isomorphism(G,H):
d_options={}
d_options['elligible_neighbours']=elligible_neighbours_test_sub_isomorphism
d_options['get_dicts_of_starting_edges']=get_single_dict_of_starting_edges_least_populated
d_options['hashing_filter']=shared_functions.identity
d_options['exit_on_conflict']=True
d_options['backtracking']=False
d_options['return_format']=return_test_sub_isomorphism
d_options['intermediary_result_processing']=intermediary_result_processing_single
d_options['final_result_processing']=final_result_processing_single
return launcher(G,H,d_options)
# external imports
import sys
import os.path
#test
import networkx as nx
# internal imports
import shared_functions
import pre_processing
import core
import post_processing
import RIN_management
#configuration
data_folder='../data/'
raw_data_file='graphs_2.92.nxpickled'
pre_processed_data_folder='./pre_processed_data'
# file_list_pdb_chain='./list_pdb_chain_2.92.nxpickled'
PATH_output_file='./Myst_??.nxpickled'
PATH_test_output_file='./test_output.nxpickled'
#create paths from configuration
PATH_raw_data_file=data_folder+raw_data_file
PATH_pre_processed_data_file=pre_processed_data_folder+'/'+raw_data_file
#Do we need to pre-process the data?
if not os.path.isfile(PATH_pre_processed_data_file):
#We do need to pre-process the data
data = shared_functions.load_data(PATH_raw_data_file)
pre_processed_data={}
for key,graph in data.items():
pre_processed_graph=pre_processing.pre_processing_graph(graph)
if pre_processed_graph.order() > 0:
pre_processed_data[key]=pre_processed_graph
shared_functions.dump_data(pre_processed_data,PATH_pre_processed_data_file)
data=pre_processed_data
else:
#We do not
data = shared_functions.load_data(PATH_pre_processed_data_file)
i=0
d_RINs={}
# fake_data={('1Y27', 'X'):data[('1Y27', 'X')],('4RGE', 'B'):data[('4RGE', 'B')]}
fake_data={('1FJG', 'A'):data[('1FJG', 'A')],('5J5B', 'BA'):data[('5J5B', 'BA')]}
# fake_data={('5J7L', 'DA'):data[('5J7L', 'DA')],('5FDU', '1A'):data[('5FDU', '1A')]}
# ('5J7L', 'DA') 1579
# ('5FDU', '1A') 1556
# ('4V88', 'A5') 1344
# ('5DM6', 'X') 1343
# ('1FJG', 'A') 771
# ('5J5B', 'BA') 740
# ('4V88', 'A6') 680
# ('3JCS', '1') 417
for g_name,g in fake_data.items():
for h_name,h in fake_data.items():
# for g_name,g in data.items():
# print(i,g_name)
# for h_name,h in data.items():
if g_name < h_name:
l_matchings=core.search_RINs(g,h)
# shared_functions.dump_data(l_matchings,PATH_output_file)
d_RINs_g_h=post_processing.post_processing_l_matchings(l_matchings,
g,
h,
g_name,
h_name)
d_RINs=RIN_management.merge_d_RINs(d_RINs,d_RINs_g_h)
i+=1
print(RIN_management.count_RINs_in_dict(d_RINs))
# shared_functions.dump_data(d_RINs,PATH_output_file)
#external libraries
import json #to hash data of edges (dictionaries)
class matching:
#stores a (partial) matching between the nodes of two graphs g and h: the list of matched pairs and the dictionaries d_g (g->h) and d_h (h->g)
l_matches=[]
d_g={}
d_h={}
#we record decisions to prevent infinite cycling
#we whitelist some matched pairs that shouldn't be challenged
whitedict={}
#we blacklist some matched pairs that shouldn't be considered
blackdict={}
#any whitelisted pair is blacklisted in another branch and vice versa, except the pairs of the initial matching, which are automatically whitelisted
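#illustration of the branching above (hypothetical nodes): if pairs (a,x) and (b,x) conflict
#over x, one branch keeps (a,x) whitelisted and blacklists (b,x), while a sibling branch does
#the opposite, so both alternatives are eventually explored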
#we still use the matching after the extension process
#thus we benefit from quicker access to some information
#NOTE : the unique_key can only be considered unique once the matching extension is finished
unique_key=""
unique_key_updated=False
size=0
history=""
def __init__(self, l_pair,incremental_blacklist=()): #immutable default avoids the shared mutable default argument pitfall
self.l_matches=[]
self.d_g={}
self.d_h={}
self.whitedict={}
self.blackdict={}
self.unique_key=""
self.unique_key_updated=False
self.size=0
self.history="1"#remove
for n_g,n_h in l_pair:
self.l_matches.append((n_g,n_h))
self.d_g[n_g]=n_h
self.d_h[n_h]=n_g
self.whitedict[(n_g,n_h)]=True
self.size+=1
for pair in incremental_blacklist:
if not self.whitedict.get(pair,False):
self.blackdict[pair]=True
def get_list_of_matched_pairs(self):
return self.l_matches
def get_size(self):
return self.size
def get_unique_key(self):
if not self.unique_key_updated:
self.unique_key=""
for n_g in sorted(self.d_g.keys()):