# Commit d0fbcc0c, authored by Carlos GO
#
# post ged
#
# parent 5d200298
"""
Convert GED output to distance matrix.
"""
import os
import pickle
import itertools
from scipy.spatial.distance import jaccard
import networkx as nx
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')
def distance_matrix(pickle_path):
    """
    Build a symmetric distance matrix from pickled GED results.

    The pickle at `pickle_path` must hold an iterable of 4-tuples
    `(dist, nodemap, paths, time)`, where `paths[0]` and `paths[1]` identify
    the two graphs that were compared. An entry whose `time` equals -2 has
    its distance recorded as NaN (presumably a failed or timed-out run --
    confirm against the GED producer).

    Returns:
        - DM: square numpy array with a zero diagonal; rows and columns are
          ordered like `graphlist`.
        - graphlist: sorted list of every graph identifier seen.

    Raises:
        KeyError: if a pair of graphs has no entry in either orientation.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pickle_path, 'rb') as handle:
        geds = pickle.load(handle)

    ged_dict = {}
    graphs = set()
    for dist, nodemap, paths, time in geds:
        pair = (paths[0], paths[1])
        ged_dict[pair] = (np.nan if time == -2 else dist, nodemap, time)
        graphs.update(pair)

    graphlist = sorted(graphs)

    # itertools.combinations over the sorted list yields pairs in the same
    # row-major order as np.triu_indices(n, 1), so the flat list of distances
    # can be written directly into the upper triangle.
    distances = []
    for g1, g2 in itertools.combinations(graphlist, 2):
        try:
            entry = ged_dict[(g1, g2)]
        except KeyError:
            # The comparison may have been stored in the opposite orientation.
            entry = ged_dict[(g2, g1)]
        distances.append(entry[0])

    size = len(graphlist)
    DM = np.zeros((size, size))
    DM[np.triu_indices(size, 1)] = distances
    # Mirror the upper triangle to obtain the full symmetric matrix.
    DM = DM + DM.T
    return DM, graphlist
def no_redundants(DM, L, with_ss=False, ss=None):
    """
    Drop entries that duplicate an earlier one.

    Entry `i` is redundant when some already-kept entry `u` satisfies both
    `DM[u][i] == 0` and `jaccard(L[u], L[i]) == 0`. The first entry is always
    kept. A NaN distance never compares equal to 0, so such a pair is not
    treated as redundant.

    Args:
        DM: square distance matrix (numpy array).
        L: numpy array with one fingerprint per row of `DM`.
        with_ss, ss: unused; kept so existing callers keep working.

    Returns:
        - DM restricted to the kept rows and columns
        - L restricted to the kept rows
        - list of kept indices into the original DM / L
    """
    count = DM.shape[0]
    # An empty matrix has no entry 0 to seed the kept list with.
    uniques = [0] if count else []
    for i in range(1, count):
        duplicate = any(
            DM[u][i] == 0 and jaccard(L[u], L[i]) == 0 for u in uniques
        )
        if not duplicate:
            uniques.append(i)
    # An explicit integer index array keeps empty selections well-typed.
    keep = np.asarray(uniques, dtype=int)
    return DM[keep, :][:, keep], L[keep], uniques
def prepare_data(geds_path, fps_path, non_redundant=True):
    """
    Convert GEDs to a distance matrix with corresponding ligand fingerprints.

    Args:
        geds_path: pickle of GED results, as read by `distance_matrix`.
        fps_path: pickle of a mapping from ligand ID to fingerprint; each
            value must support `.astype(int)`.
        non_redundant: when True, filter the result through `no_redundants`.

    Returns:
        - Distance matrix restricted to graphs with a known fingerprint
          (and to non-redundant entries when `non_redundant` is True)
        - Fingerprint matrix, one row per row of the distance matrix
        - List of all graph IDs, exactly as returned by `distance_matrix`
    """
    DM, graphlist = distance_matrix(geds_path)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(fps_path, 'rb') as handle:
        fps = pickle.load(handle)

    # The ligand ID is the second "_"-separated token of each graph file name.
    ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist]
    # Positions (into graphlist / DM) of graphs whose ligand has a fingerprint.
    found_fps = [i for i, lig in enumerate(ligand_ids) if lig in fps]
    print(f"skipipng {len(ligand_ids) - len(found_fps)} missing fingerprints.")

    L = np.array([fps[ligand_ids[i]].astype(int) for i in found_fps])
    print(L)
    print(f"num of ligands by id: {len(set(ligand_ids))}")

    DM = DM[found_fps, :][:, found_fps]
    print(L.shape)
    print(f"Unique fingerprints: {len(np.unique(L, axis=0))}")
    print(DM.shape)

    if non_redundant:
        print(DM, L)
        DM, L, _kept = no_redundants(DM, L)

    # NOTE(review): graphlist is returned unfiltered, so its positions do not
    # line up with the rows of DM / L once graphs have been dropped above.
    # Confirm what callers expect before changing it.
    return (DM, L, graphlist)
if __name__ == "__main__":
    # Script entry point: run the pipeline on the default data files.
    geds_pickle = '../data/geds_delta.pickle'
    fingerprints_pickle = '../data/all_rna_ligands_fingerprints.pickle'
    prepare_data(geds_pickle, fingerprints_pickle)
# Markdown is supported
# 0% or .
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or to comment