Commit 7b4cf461 authored by Anton

Code & co

parent adb5dcac
# external imports
import sys
import os.path
#teste
import networkx as nx
# internal imports
import shared_functions
import RIN
import RIN_management
import subgraph
import processing
#configuration
if len(sys.argv) > 1:
MODE=int(sys.argv[1])
else:
print("Need to specify the mode as an argument in the command line")
sys.exit(0)
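#Example invocation (script name hypothetical, mode 1 is the only mode handled below):
#   python3 clean_collection.py 1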
if MODE == 1:#Cleaning a new collection with the support of an old one
old_d_RINS_file='./CaRNAval_1.0_with_SSEs_distributions.nxpickled'
# new_d_RINs_file='./mpi_thesis_test_rsr_v3.1_000513_00005057.nxpickled'
# new_d_RINs_file='./mpi_thesis_test_rsr_v3.2_plusMfiltering_000513_00005081.nxpickled'
# new_d_RINs_file='./CaRNAval_2.0_soft_cleaning_no_duplicates_alt_000557_00007709.nxpickled'
# new_d_RINs_file='./CaRNAval_2.0_raw_000660_00007916.nxpickled'
# new_d_RINs_file='./CaRNAval_2.0_soft_cleaning_no_duplicates_alt_000557_00007709.nxpickled'
new_d_RINs_file='./CaRNAval_2.0_raw_000660_00007916.nxpickled'
new_d_RINs_file='./Local2020_002185_00015042.nxpickled'
RNA_graphs_file='../data/graphs_2.92_nx3_with_SSEs.pickle'
# PATH_output_file_prefixe='./CaRNAval_2.0_scwa_'
# PATH_output_file_prefixe='./CaRNAval_2.0_TEST660_'
PATH_output_file_prefixe='./Local2020_v1_'
PATH_output_file_suffixe='.nxpickled'
print("Cleaning the new collection:",new_d_RINs_file)
print("Old collection to use as reference for IDs:",old_d_RINS_file)
old_d_RINs = shared_functions.load_data(old_d_RINS_file)
new_d_RINs = shared_functions.load_data(new_d_RINs_file)
data = shared_functions.load_data(RNA_graphs_file)
#First, let's remove mode duplicates
print('\t','1/4 Removing mode duplicates')
new_d_RINs=RIN_management.merge_mode_duplications(new_d_RINs)
# print(RIN_management.count_RINs_and_occurrences_in_dict(new_d_RINs))
# sys.exit(0)
#Second, let's fill the gaps in the collection
print('\t','2/4 Completing the collection')
new_d_RINs=processing.clean_collection(new_d_RINs)
#Third, let's set the ID so they respect the old ones
print('\t','3/4 Fixing RIN IDs')
# new_d_RINs=RIN_management.set_RIN_IDs_from_old_collection(old_d_RINs,new_d_RINs)
new_d_RINs=RIN_management.set_RIN_IDs_from_ID_x(new_d_RINs,1)
#Fourth, let's compute the SSE distributions
print('\t','4/4 Computing SSE distributions for each new RIN')
for RIN in RIN_management.d_RINs_to_sorted_list(new_d_RINs):
tmp=RIN.get_SSEs_distrib(data)
#Finally, let's dump the result
nb_RINs,nb_occurrences=RIN_management.count_RINs_and_occurrences_in_dict(new_d_RINs)
PATH_output_file=PATH_output_file_prefixe+str(nb_RINs).zfill(6)+'_'+str(nb_occurrences).zfill(8)+PATH_output_file_suffixe
shared_functions.dump_data(new_d_RINs,PATH_output_file)
print('Output written in:',PATH_output_file)
#JUST IN CASE
else:
print("Unknown mode")
# external imports
import networkx as nx
import queue
# internal imports
import shared_functions
import RIN
import matching
import isomorphism
import subgraph
import json
# import draw_graph
def edge_composition_compatibility(RIN_1,RIN_2):
for category,count in RIN_1.d_edges_by_category.items():
if RIN_2.d_edges_by_category.get(category,0) < count:
return False
return True
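#Minimal sketch of the compatibility check above, assuming d_edges_by_category maps an
#edge category to the number of edges of that category in the RIN:
#   RIN_1.d_edges_by_category = {'CWW': 2, 'THH': 1}
#   RIN_2.d_edges_by_category = {'CWW': 3, 'THH': 1, 'TSS': 2}
#   edge_composition_compatibility(RIN_1, RIN_2)  # True : every count of RIN_1 fits into RIN_2
#   edge_composition_compatibility(RIN_2, RIN_1)  # False: RIN_2 has 'TSS' edges that RIN_1 lacks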
def are_isomorfic(RIN_1,RIN_2):
if RIN_1.order == RIN_2.order and RIN_1.size == RIN_2.size:
if RIN_1.secondary_key == RIN_2.secondary_key:
return isomorphism.isomorphism(RIN_1.graph,RIN_2.graph)
#if either key differs, there is no way the two are isomorphic
return (False,None)
def is_subgraph(RIN_1,RIN_2):
#is RIN_1 a subgraph of RIN_2 ?
if RIN_1.order <= RIN_2.order and RIN_1.size < RIN_2.size:
if edge_composition_compatibility(RIN_1,RIN_2):
return subgraph.subgraph(RIN_1.graph,RIN_2.graph)
return (False,None)
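#Both helpers return a pair (boolean, matching): the boolean says whether the test succeeded
#and the matching object (or None) carries the node correspondence (its d_g/d_h dictionaries
#are used by merge_2_RINs below). A typical call site is therefore:
#   ok, m = are_isomorfic(RIN_1, RIN_2)
#   if ok:
#       merged = merge_2_RINs(RIN_1, RIN_2, m)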
def compare_RNA_modes (name_1,name_2):
#We want to prevent graphs with a mode from reaching the front
#As a consequence we are tweaking the ordering to their disadvantage
PDB_ID_1,chain_1,mode_1=name_1
PDB_ID_2,chain_2,mode_2=name_2
if (not mode_1) == (not mode_2):
#Case 1 : both have a mode or both don't
#just go with the 'lower'
return (name_1 < name_2)
else:
#Case 2 : only one has a mode
#We pick the one without a mode (equivalent to (not mode_1) )
return (not mode_1)
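#Sketch of the resulting order on hypothetical (PDB_ID, chain, mode) tuples:
#   compare_RNA_modes(('1Y27', 'X', ''), ('4RGE', 'B', ''))      # True : neither has a mode, plain tuple order decides
#   compare_RNA_modes(('4RGE', 'B', ''), ('1Y27', 'X', '456+'))  # True : only the first is mode-free, so it wins
#   compare_RNA_modes(('1Y27', 'X', '456+'), ('4RGE', 'B', ''))  # False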
def merge_2_RINs(RIN_1,RIN_2,m):
if RIN_1.representing_occurrence[0]==RIN_2.representing_occurrence[0]:
if RIN_1.representing_occurrence[1] < RIN_2.representing_occurrence[1]:
RIN_to_return=RIN_1
RIN_to_merge=RIN_2
d_nodes=m.d_h
else:
RIN_to_return=RIN_2
RIN_to_merge=RIN_1
d_nodes=m.d_g
else:
if compare_RNA_modes(RIN_1.representing_occurrence[0],RIN_2.representing_occurrence[0]):
RIN_to_return=RIN_1
RIN_to_merge=RIN_2
d_nodes=m.d_h
else:
RIN_to_return=RIN_2
RIN_to_merge=RIN_1
d_nodes=m.d_g
for graph,d_occurrences in RIN_to_merge.d_occurrences.items():
for key,occurrence in d_occurrences.items():
if RIN_to_return.d_occurrences.get(graph,None) != None:
if RIN_to_return.d_occurrences[graph].get(key,None) != None:
continue
else:
RIN_to_return.d_occurrences[graph]={}
#Now <graph> is in d_occurrences but not <graph>[<key>]
updated_occurrence={}
for node_in_RIN_to_merge,node_in_graph in occurrence.items():
updated_occurrence[d_nodes[node_in_RIN_to_merge]]=node_in_graph
RIN_to_return.d_occurrences[graph][key]=updated_occurrence
RIN_to_return.nb_occurrences+=1
return RIN_to_return
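#The kept RIN is the one whose representing occurrence sorts first (same RNA: lower key wins,
#different RNAs: compare_RNA_modes decides); every occurrence of the discarded RIN is then
#re-keyed through the node mapping of the matching. Sketch with a hypothetical 2-node mapping:
#   d_nodes    = {'a': 'x', 'b': 'y'}    # node of RIN_to_merge -> node of RIN_to_return
#   occurrence = {'a': 12, 'b': 13}      # node of RIN_to_merge -> node of the RNA graph
#   updated_occurrence == {'x': 12, 'y': 13}   # now keyed by RIN_to_return's nodes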
def merge_protoRINs(d_protoRINs):
d_RINs={}
for primary_key,d_prime in d_protoRINs.items():
d_RINs[primary_key]={}
for secondary_key,l_second in d_prime.items():
d_RINs[primary_key][secondary_key]=[]
fifo=queue.Queue()
for pRIN in l_second:
fifo.put(pRIN)
while not fifo.empty():
pRIN_1=fifo.get()
next_fifo=queue.Queue()
while not fifo.empty():
pRIN_2=fifo.get()
are_isomorphic,matching=are_isomorfic(pRIN_1,pRIN_2)
if are_isomorphic:
pRIN_1=merge_2_RINs(pRIN_1,pRIN_2,matching)
else:
next_fifo.put(pRIN_2)
d_RINs[primary_key][secondary_key].append(pRIN_1)
fifo=next_fifo
return d_RINs
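#The FIFO pass above follows a "pick a representative, absorb its duplicates, recurse on the rest"
#scheme: the head of the queue absorbs every isomorphic proto-RIN it meets, the survivors are pushed
#to next_fifo, and the loop repeats until the queue is empty, so each (primary_key, secondary_key)
#bucket ends up holding pairwise non-isomorphic RINs only.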
def merge_d_RINs(d_RINs_1,d_RINs_2):
#We are going to add the content of d_RINs_2 to d_RINs_1
#First, check for common primary_key/secondary_key
#For any common pair of key, merge
for primary_key,d_prime in d_RINs_1.items():
if d_RINs_2.get(primary_key,None)!= None:
#primary_key is in d_RINs_2
for secondary_key,l_second in d_prime.items():
if d_RINs_2[primary_key].get(secondary_key,None)!= None:
l_RINs_1=d_RINs_1[primary_key][secondary_key]
l_RINs_2=d_RINs_2[primary_key][secondary_key]
d_RINs_1[primary_key][secondary_key]=[]
d_invalid_RIN_2={}
for RIN_1 in l_RINs_1:
for RIN_2 in l_RINs_2:
if not d_invalid_RIN_2.get(RIN_2,False):
are_isomorphic,matching=are_isomorfic(RIN_1,RIN_2)
if are_isomorphic:
RIN_1=merge_2_RINs(RIN_1,RIN_2,matching)
d_invalid_RIN_2[RIN_2]=True
break #2 RINs of l_RINs_[1|2] can't be isomorphic so there's no point looking further
d_RINs_1[primary_key][secondary_key].append(RIN_1)
for RIN_2 in l_RINs_2:
if not d_invalid_RIN_2.get(RIN_2,False):
d_RINs_1[primary_key][secondary_key].append(RIN_2)
#now we copy the RINs specific to d_RINs_2
for primary_key,d_prime in d_RINs_2.items():
if d_RINs_1.get(primary_key,None)== None:
d_RINs_1[primary_key]={}
for secondary_key,l_second in d_prime.items():
if d_RINs_1[primary_key].get(secondary_key,None)== None:
d_RINs_1[primary_key][secondary_key]=[]
for RIN_2 in l_second:
d_RINs_1[primary_key][secondary_key].append(RIN_2)
return d_RINs_1
def merge_mode_duplications(d_RINs_1):
#the method used to cover exceptions may generate duplicates
#this function will remove such duplicates from the collection d_RINs_1
#for each RIN in the collection
for RIN in d_RINs_to_sorted_list(d_RINs_1):#not efficient but easier to read
#first we transform the collection of occurrences
d_RNA_occ_mode={}
for key,d_occ in RIN.d_occurrences.items():
pdb_ID,chain,mode=key
if not d_RNA_occ_mode.get((pdb_ID,chain),False):
d_RNA_occ_mode[(pdb_ID,chain)]={}
for key_occ,mapping in d_occ.items():
better_key_occ = [] #better_key_occ lists the graph nodes in sorted order of the representative's node names instead of the lexicographic order on the node names of the occurrence
for n in sorted(mapping.keys()):
better_key_occ.append(mapping[n])
better_key_occ=json.dumps(better_key_occ)
if not d_RNA_occ_mode[(pdb_ID,chain)].get(better_key_occ,False):
d_RNA_occ_mode[(pdb_ID,chain)][better_key_occ]={}
d_RNA_occ_mode[(pdb_ID,chain)][better_key_occ][mode]=key_occ
#second we identify potential duplicates
nb_occ_removed=0
keys_to_check={}
for (pdb_ID,chain),d_second in d_RNA_occ_mode.items():
for better_key_occ,d_modes in d_second.items():
if len(d_modes) > 1:
#there are duplicates
#we keep the least modified one (which is the first mode in lexicographic order, thanks to the way we designed modes)
i=0# to skip the first step
for mode in sorted(d_modes.keys()):
if i > 0:
key_occ = d_modes[mode]
del RIN.d_occurrences[pdb_ID,chain,mode][key_occ]
nb_occ_removed+=1
keys_to_check[(pdb_ID,chain,mode)]=True
i+=1
for key in keys_to_check.keys():
if RIN.d_occurrences[key] == {}:
del RIN.d_occurrences[key]
RIN.nb_occurrences=RIN.nb_occurrences - nb_occ_removed
return d_RINs_1
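#Sketch of the dedup key, with hypothetical node names: if an occurrence maps the representative's
#nodes as {'a': 101, 'b': 102}, then better_key_occ == json.dumps([101, 102]) (graph nodes listed
#following the sorted node names of the representative). Two occurrences of the same (pdb_ID, chain)
#that yield the same better_key_occ under different modes map onto the same graph nodes, so only the
#lexicographically first mode is kept.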
def count_RINs_and_occurrences_in_dict(d_RINs_1):
nb_RINs=0
nb_occurrences=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN_1 in l_second:
nb_RINs+=1
nb_occurrences+=RIN_1.nb_occurrences
return (nb_RINs,nb_occurrences)
def count_RINs_in_dict (d_RINs_1):
count=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
count+=len(l_second)
return count
def count_occurrences_in_dict (d_RINs_1):
count=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN in l_second:
count+=RIN.nb_occurrences
return count
def d_RINs_to_sorted_list(d_RINs_1):
l_RINs=[]
for primary_key in sorted(d_RINs_1.keys()):
for secondary_key in sorted(d_RINs_1[primary_key].keys()):
l_RINs.extend(d_RINs_1[primary_key][secondary_key])
return l_RINs
def d_RINs_to_d_ID(d_RINs_1):
d_ID={}
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN in l_second:
d_ID[RIN.ID]=RIN
return d_ID
def set_RIN_IDs_from_ID_x(new_d_RINs,x):
#x is included (i.e. a new RIN may receive x as its ID)
#We go through the new collection and set all missing IDs
#NOTE : default ID is -1
l_new_RINs=d_RINs_to_sorted_list(new_d_RINs)
for new_RIN in l_new_RINs:
if new_RIN.ID < 0:
new_RIN.ID=x
x+=1
return new_d_RINs
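#e.g. set_RIN_IDs_from_ID_x(new_d_RINs, 1) numbers every RIN whose ID is still the default -1
#with 1, 2, 3, ... in the traversal order of d_RINs_to_sorted_list, and leaves RINs that already
#carry a non-negative ID untouched.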
def set_RIN_IDs_from_old_collection(old_d_RINs,new_d_RINs):
#First we transmit the IDs of the old collection to the new
for primary_key,d_prime in old_d_RINs.items():
if new_d_RINs.get(primary_key,False):
#primary_key is in new_d_RINs
for secondary_key,old_l_second in d_prime.items():
if new_d_RINs[primary_key].get(secondary_key,False):
#secondary_key is also in new_d_RINs
new_l_second=new_d_RINs[primary_key][secondary_key]
for old_RIN in old_l_second:
found=False
for new_RIN in new_l_second:
found=are_isomorfic(old_RIN,new_RIN)[0]
if found:
new_RIN.ID=old_RIN.ID
break
# if found:
# break
#Now the only new RINs without an ID are truly new RINs
#First we need to find the largest ID in the old collection
l_old_RINs=d_RINs_to_sorted_list(old_d_RINs)
max_ID = -1
for old_RIN in l_old_RINs:
if old_RIN.ID > max_ID:
max_ID=old_RIN.ID
#Now we go through the new collection and set all missing IDs using set_RIN_IDs_from_ID_x
return set_RIN_IDs_from_ID_x(new_d_RINs,max_ID+1)
###########Test tools##########################################################
def d_RINs_1_included_in_d_RINs_2(d_RINs_1,d_RINs_2):#compare RINs, not their occurrences
l=[]
for primary_key,d_prime in d_RINs_1.items():
if d_RINs_2.get(primary_key,None)== None:
return False
else:
for secondary_key,l_second in d_prime.items():
if d_RINs_2[primary_key].get(secondary_key,None)== None:
return False
else:
for RIN_1 in l_second:
missing=True
for RIN_2 in d_RINs_2[primary_key][secondary_key]:
if are_isomorfic(RIN_1,RIN_2)[0]:
missing=False
break
if missing:
return False
return True
def diff_d_RINs_1_minus_RINs_2(d_RINs_1,d_RINs_2):
#we assume d_RINs_2 is included in d_RINs_1
d_3={}
for primary_key,d_prime in d_RINs_1.items():
if d_RINs_2.get(primary_key,None)== None:
d_3[primary_key]=d_prime
else:
for secondary_key,l_second in d_prime.items():
if d_RINs_2[primary_key].get(secondary_key,None)== None:
tmp=d_3.get(primary_key,{})
tmp[secondary_key]=l_second
d_3[primary_key]=tmp
else:
for RIN_1 in l_second:
missing=True
for RIN_2 in d_RINs_2[primary_key][secondary_key]:
if are_isomorfic(RIN_1,RIN_2)[0]:
missing=False
break
if missing:
tmp_1=d_3.get(primary_key,{})
tmp_2=tmp_1.get(secondary_key,[])
tmp_2.append(RIN_1)
tmp_1[secondary_key]=tmp_2
d_3[primary_key]=tmp_1
return d_3
def count_duplicates_in_d_RINs_1(d_RINs_1):
n=0
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN_1 in l_second:
unique=True
for RIN_2 in l_second:
if RIN_1 != RIN_2:
if are_isomorfic(RIN_1,RIN_2)[0]:
unique=False
break
if not unique:
n+=1
return n
def get_collection_occurrence_graph(d_RINs_1,data):
l=[]
for primary_key,d_prime in d_RINs_1.items():
for secondary_key,l_second in d_prime.items():
for RIN_1 in l_second:
l_g = RIN_1.create_occurrence_graphs(data)
l.append((RIN_1,l_g))
return l
################Hierarchy construction ########################################
def build_hierarchy(d_RINs):
hierarchy={}
return hierarchy
#external imports
import sys
import json #to hash data of edges (dictionaries)
import queue
# internal imports
import matching
import shared_functions
# Convention :
# G,H are the two graphs received as inputs
# g,h are generic graphs
#Algorithm testing if two graphs G and H admitting a proper edge-coloring are isomorphic
def elligible_neighbours(d_n_g,d_n_h):#tested
#given two matched nodes n_g and n_h, resp. from g and h
#return the list of pairs of nodes from their neighbourhoods that may be matched together according to edge categories
common_categories=list(set(d_n_g.keys()) & set(d_n_h.keys()))
if len(common_categories)!=len(d_n_g.keys()) or len(common_categories)!=len(d_n_h.keys()): #stop condition
return None
l_candidates=[]
for category in common_categories:
l_candidates.append((d_n_g[category][0],d_n_h[category][0]))#TODO deal with triangle double interaction
return l_candidates
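#Minimal sketch of the expected inputs (structure assumed from the way launcher builds d_g/d_h):
#d_n_g and d_n_h map an edge category to the list of neighbours reached through that category, e.g.
#   d_n_g = {'{"label": "CWW"}': ['u2'], '{"label": "TSH"}': ['u3']}
#   d_n_h = {'{"label": "CWW"}': ['v7'], '{"label": "TSH"}': ['v9']}
#   elligible_neighbours(d_n_g, d_n_h)   # -> [('u2', 'v7'), ('u3', 'v9')] (pair order may vary)
#If the category sets differ, the proper edge-coloring cannot be respected for this pair and the
#function returns None, which makes extender abort.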
#Used in the "graph matching" algorithms (list of maximal sub-isomorphism / isomorphism / sub-isomorphism)
def get_single_dict_of_starting_edges_least_populated(g,f,main=False):
list_of_not_oriented_labels=['CWW','TWW','CSS','TSS','CHH','THH']
d_edges_by_category={}
for edge in g.edges():
# if g.edges[edge]['long_range']:
category=json.dumps(f(g.edges[edge]), sort_keys=True)
d_edges_by_category[category]=d_edges_by_category.get(category,[])
d_edges_by_category[category].append(edge)
least_populated_category=None
numbers=g.size()*2+1
for category in sorted(d_edges_by_category.keys()):
list_of_edges=d_edges_by_category[category]
label=g.edges[list_of_edges[0]]['label']
if label in list_of_not_oriented_labels:
if len(list_of_edges)*2 < numbers:
least_populated_category=category
numbers=len(list_of_edges)
elif len(list_of_edges) < numbers:
least_populated_category=category
numbers=len(list_of_edges)
if main:
label=g.edges[d_edges_by_category[least_populated_category][0]]['label']
if label in list_of_not_oriented_labels:
l_symetric_edges=[]
for edge in d_edges_by_category[least_populated_category]:
l_symetric_edges.append((edge[1],edge[0]))
d_edges_by_category[least_populated_category].extend(l_symetric_edges)
return {least_populated_category:d_edges_by_category[least_populated_category]}
def get_dicts_of_starting_edges(g,h,d_options):
#all
#provide a dictionary {(category):edge} of edges eligible as starting points
#we distinguish oriented interactions such as TWS from non-oriented ones such as TWW
#this distinction comes from the fact that nodes of two non-oriented edges (n1_g,n2_g) and (n1_h,n2_h)
# can be matched in two ways: ((n1_g,n1_h),(n2_g,n2_h)) and ((n2_g,n1_h),(n1_g,n2_h))
# to cover this we add (n1_h,n2_h), (n1_g,n2_g) and (n2_g,n1_g) to the output
# distinguishing g from h is covered by the optional 'main' parameter
#least_populated
# provide a dictionary {(category):edge} of edges eligible as starting points
# we are looking for the least populated category
return (get_single_dict_of_starting_edges_least_populated(g,d_options['hashing_filter'],True),get_single_dict_of_starting_edges_least_populated(h,d_options['hashing_filter']))
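#Sketch of the symmetric-edge trick for non-oriented labels (hypothetical edges): if the least
#populated category of G is CWW with the single edge ('u1', 'u2'), the main=True call also registers
#('u2', 'u1'); the launcher below can then seed a CWW edge ('v1', 'v2') of H either as the pairs
#((u1,v1),(u2,v2)) or as ((u2,v1),(u1,v2)).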
def extender(d_g,d_h,matching,d_options):
m=matching
fifo_pair=queue.Queue()
for pair in m.get_list_of_matched_pairs():
fifo_pair.put(pair)
while not fifo_pair.empty():
n_g,n_h=fifo_pair.get()
l_candidates=elligible_neighbours(d_g[n_g],d_h[n_h])
if l_candidates==None:
return (False,m)
for pair in l_candidates:
case,extra=m.test_candidate_pair(pair)
if case == 'accepted':
fifo_pair.put(pair)
elif case == 'rejected':
continue
elif case == 'conflict':
return (False,m)
if m.get_size()==len(d_g) and m.get_size()==len(d_h):
return (True,m)
return (False,m)
def launcher(G,H,d_options):
d_G=shared_functions.dict_from_graph(G,d_options['hashing_filter'])
d_H=shared_functions.dict_from_graph(H,d_options['hashing_filter'])
d_start_G,d_start_H=get_dicts_of_starting_edges(G,H,d_options)
elligible_categories = list(set(d_start_G.keys()) & set(d_start_H.keys()))
# print('1',elligible_categories,set(d_start_G.keys()),set(d_start_H.keys()))
for category in elligible_categories:
for e_G in d_start_G[category]:
for e_H in d_start_H[category]:
# print("launcher",category,e_G,e_H)
n1_G,n2_G=e_G
n1_H,n2_H=e_H
pair1=(n1_G,n1_H)
pair2=(n2_G,n2_H)
success,m=extender(d_G,d_H,matching.matching((pair1,pair2)),d_options)
if success:
return (True,m)
return (False,None)
#Tests if G and H are isomorphic
def isomorphism(G,H):
d_options={}
d_options['hashing_filter']=shared_functions.identity
return launcher(G,H,d_options)
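#Hedged usage sketch (graph type and the 'label' edge attribute are assumed from
#get_single_dict_of_starting_edges_least_populated above):
#   import networkx as nx
#   G = nx.DiGraph(); G.add_edge('a', 'b', label='CWW')
#   H = nx.DiGraph(); H.add_edge('x', 'y', label='CWW')
#   ok, m = isomorphism(G, H)   # ok is a boolean, m the matching object (or None on failure)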
# external imports
import sys
import os.path
#WARNING : we import MPI (mpi4py) later in this code
#It impacts the clarity of the code but gives us the opportunity to segregate the pre-processing step
#teste
import networkx as nx
# internal imports
import shared_functions
from processing import pre_processing_raw_data as pre_processing_raw_data
from processing import post_processing_l_matchings as post_processing_l_matchings
import modular_maximal_common_subgraph as maximal_common_subgraph
import RIN_management
#configuration
data_folder='../data'
raw_data_file='graphs_2.92_nx3_with_SSEs.pickle'
pre_processed_data_folder='../pre_processed_data'
pre_processed_data_file='graphs_2.92_nx3_modular_distributed_2.pickle'
# pre_processed_data_file='graphs_2.92_nx3_modular_ROMAN.pickle'
# pre_processed_data_file='INCORRECT_graphs_2.92_nx3.pickle' #faster -> good for tests
# file_list_pdb_chain='./list_pdb_chain_2.92.nxpickled'
# PATH_output_file_prefixe='../mpi_thesis_full_modular_ROMAN_'
PATH_output_file_prefixe='../caRNAval_2_fmd_van_1_'
PATH_output_file_suffixe='.nxpickled'
PATH_test_output_file='./test_output.nxpickled'
#create paths from configuration
PATH_raw_data_file=data_folder+'/'+raw_data_file
PATH_pre_processed_data_file=pre_processed_data_folder+'/'+pre_processed_data_file
def make_data(data):
# pseudo_data={('1Y27', 'X'):data[('1Y27', 'X')],('4RGE', 'B'):data[('4RGE', 'B')]}
# pseudo_data={('4V88', 'A6'):data[('4V88', 'A6')],('3JCS', '1'):data[('3JCS', '1')]}
# pseudo_data={('5J7L', 'DA'):data[('5J7L', 'DA')],('5FDU', '1A'):data[('5FDU', '1A')]}
# ('5J7L', 'DA') 1579
# ('5FDU', '1A') 1556
# ('4V88', 'A5') 1344
# ('5DM6', 'X') 1343
# ('1FJG', 'A') 771
# ('5J5B', 'BA') 740
# ('4V88', 'A6') 680
# ('3JCS', '1') 417
# data={('4V9F', '0', ''):data[('4V9F', '0', '')],('5FDU', '1A', '456+'):data[('5FDU', '1A', '456+')]}
# data={('1L2X', 'A', ''):data[('1L2X', 'A', '')],('3FU2', 'B', ''):data[('3FU2', 'B', '')]}
#we should sort so that the big pairs come first and the small ones last
list_of_tasks=[]
for (g_PDB_ID,g_chain,g_format,g_wcc_ID),g in data.items():
for (h_PDB_ID,h_chain,h_format,h_wcc_ID),h in data.items():
if (g_PDB_ID,g_chain) < (h_PDB_ID,h_chain):
list_of_tasks.append(((g.size()*h.size()),((g_PDB_ID,g_chain,g_format,g_wcc_ID),(h_PDB_ID,h_chain,h_format,h_wcc_ID))))
for key,task in sorted(list_of_tasks, key=lambda x: x[0],reverse=True):
yield(task)
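#make_data yields each pair of graphs coming from two different (PDB_ID, chain) entries exactly once
#(the strict '<' test skips self-pairs and symmetric duplicates), ordered by decreasing
#g.size()*h.size() so that the most expensive comparisons come first.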
# Do we need to pre-process the data ?
if not os.path.isfile(PATH_pre_processed_data_file):
data = pre_processing_raw_data(shared_functions.load_data(PATH_raw_data_file))
shared_functions.dump_data(data,PATH_pre_processed_data_file)
print("pre_processing done, please launch again")
sys.exit(0)
else:
#We do not
data = shared_functions.load_data(PATH_pre_processed_data_file)