Commit d0fbcc0c by Carlos GO

### post ged

parent 5d200298
 """ Convert GED output to distance matrix. """ import os import pickle import itertools from scipy.spatial.distance import jaccard import networkx as nx import numpy as np import seaborn as sns import matplotlib.pyplot as plt # plt.style.use('fivethirtyeight') def distance_matrix(pickle_path): geds = pickle.load(open(pickle_path, 'rb')) ged_dict = {} graphs = set() for g in geds: dist, nodemap, paths, time = g if time == -2: ged_dict[(paths[0], paths[1])] = (np.nan, nodemap, time) else: ged_dict[(paths[0], paths[1])] = (dist, nodemap, time) graphs.add(paths[0]) graphs.add(paths[1]) distances = [] graphlist = sorted(list(graphs)) for g in itertools.combinations(graphlist, 2): g1, g2 = g try: distances.append(ged_dict[g][0]) except KeyError: distances.append(ged_dict[(g2,g1)][0]) pass DM = np.zeros((len(graphs), len(graphs))) DM[np.triu_indices(len(graphs), 1)] = distances DM = DM + DM.T return DM,graphlist def no_redundants(DM, L, with_ss=False, ss=None): fams = {} uniques = [0] redundants = set([]) for i in range(1,DM.shape[0]): for u in uniques: if (DM[u][i] == 0) and (jaccard(L[u], L[i])) == 0: redundants.add(i) break else: uniques.append(i) return DM[uniques,:][:, uniques], L[uniques], uniques def prepare_data(geds_path, fps_path, non_redundant=True): """ Convert GEDS to distance matrix with corresponding ligand fingerprints. 
Returns: - Distance matrix of graphs - Fingerprint matrix - List of graph IDs - Skeleton distance (optional) """ DM, graphlist = distance_matrix(geds_path) e = pickle.load(open(geds_path, 'rb')) fps = pickle.load(open(fps_path, 'rb')) #get ligand fingerprints ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist] found_fps= [i for i,lig in enumerate(ligand_ids) if lig in fps] print(f"skipipng {len(ligand_ids) - len(found_fps)} missing fingerprints.") L = np.array([fps[lig].astype(int) for ind, lig in enumerate(ligand_ids) if ind in found_fps]) print(L) print(f"num of ligands by id: {len(set(ligand_ids))}") DM = DM[found_fps,:][:, found_fps] print(L.shape) print(f"Unique fingerprints: {len(np.unique(L, axis=0))}") print(DM.shape) if non_redundant: print(DM, L) DM, L, ind = no_redundants(DM, L) return (DM, L, graphlist) if __name__ == "__main__": prepare_data('../data/geds_delta.pickle', '../data/all_rna_ligands_fingerprints.pickle') pass
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment