Commit 545c9830 authored by Carlos GO's avatar Carlos GO
Browse files

readme note

parent 743c5992
......@@ -11,6 +11,8 @@ This package includes tools for:
Data used for training, models and binding pocket visualizations available [here](http://csb.cs.mcgill.ca/RNAmigos/).
NOTE: The user-friendly API is currently being built and tested. The sample usage described below is still being improved.
## Requirements
* Python 3.6+
......@@ -21,8 +23,25 @@ Data used for training, models and binding pocket visualizations available [here
## Building Pocket Graph Dataset
### Extracting binding pockets
The `get_binding` function extracts all binding pockets from a PDB and writes a new PDB for each ligand in the PDB.
```python
>>> from RNAmigos.binding_pockets import get_binding
>>> pdb_path = '/path/to/pdbs/'
>>> get_binding(pdb_path, '1aju.pdb')
```
### Building Graph Representations of Binding pockets
The function `build_graph` from `pocket_graph` builds a networkx graph from a CSV of binding pocket residues.
## Pocket Graph Drawing
From a graph object or a pickled graph, we can draw the graph.
```python
>>> G = nx.read_gpickle('../Data/sample_graphs/3ds7_GNG_P.nxpickle')
......@@ -75,16 +94,35 @@ You can convert this output to a distance matrix and a list indicating the graph
>>> DM, L, graphlist = data_prepare(geds, fps)
```
The distance matrix can be passed to a prototype selector.
The distance matrix can be passed to a prototype selector to get the indices in the DM selected as prototypes.
```python
>>> from RNAmigos.dissimilarity_embed import prototype_select
>>> prototypes = prototype_select(DM, 20)
>>> prototypes = prototype_select(DM, 20, heuristic='spanning')
```
### Embedding a full dataset
To embed the graphs used in the GED comparisons and select prototypes in one call, use `full_embed`. It returns a matrix of size N x r, where N is the number of graphs in DM and r is the number of prototypes used.
## Fingerprint Prediction
```python
>>> from RNAmigos.dissimilarity_embed import full_embed
>>> embedding = full_embed(graphlist, 20, DM=DM, dist_mat=True, heuristic='k-centers')
```
The vector embedding of the third graph in our dataset:
```python
>>> embedding[2]
array([ 8., 10., 10., 8., 22., 10., 6., 8., 8., 18., 20., 8., 10.,
10., 24., 16., 6., 16., 12., 18.])
```
### Embedding a single graph
User-friendly API coming soon.
## Fingerprint Prediction
User-friendly API coming soon.
......@@ -14,14 +14,14 @@ import cProfile
import pstats
import uuid
import random
import pickle
import networkx as nx
import numpy as np
from numpy.linalg import eig
from spectral_distance import *
from rna_ged import ged
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
......@@ -42,10 +42,6 @@ def dummy(args):
cProfile.runctx('row_compute(args)', globals(), locals(),\
f'prof_{random.randint(0,20)}.prof')
def graph_eigen(G):
    """Return the eigendecomposition of the Laplacian of graph ``G``.

    The Laplacian is obtained from ``graph_laplacian(G)`` and handed to
    ``numpy.linalg.eig``; whatever that call returns is passed back to the
    caller unchanged.
    """
    return eig(graph_laplacian(G))
def distance_matrix_para(D):
eigens = []
#compute list of eigendecompositions for each graph.
......@@ -196,7 +192,7 @@ def prototypes(DM,m, heuristic='spanning'):
return random_proto(DM, m)
pass
def graph_embed(args):
def graph_embed(G, prototypes):
"""
Embed graph G given prototype set P.
......@@ -204,8 +200,14 @@ def graph_embed(args):
`array`: numpy array representing embedding vector.
"""
i,G,P, = args
return (i, np.array([graph_distance(G, p) for p in P]))
embedding = np.zeros(len(prototypes))
for p in prototypes:
g = pickle.load(open(G, 'rb'))
p = pickle.load(open(p, 'rb'))
ops,_,_ = ged((g1,p), source_only=True)
embedding.append(ops.cost)
return embedding
def full_embed(D, m, DM=None, dist_mat=None, heuristic='spanning'):
"""
For a dataset of graphs D, perform prototype selection and embedding.
......@@ -215,22 +217,12 @@ def full_embed(D, m, DM=None, dist_mat=None, heuristic='spanning'):
"""
if dist_mat == None:
DM = distance_matrix_para(D)
P_idx = prototypes(D, m, DM, heuristic=heuristic)
print(P_idx)
P_idx = prototypes(DM,m, heuristic=heuristic)
logging.info("Embedding graphs.")
# embeddings = [graph_embed(g, P) for g in D]
# embeddings = [graph_embed(g, P) for g in D]
# todo = ((i,g,P) for i,g in enumerate(D))
embeddings = np.zeros((len(D), m))
for i, g in enumerate(D):
embeddings[i] = DM[i][P_idx]
# with multiprocessing.Pool(PROCS) as pool:
# for e in pool.map(graph_embed, todo):
# i,embed = e
# embeddings[i] = embed
return embeddings
if __name__ == "__main__":
......
......@@ -89,6 +89,7 @@ def prepare_data(geds_path, fps_path, non_redundant=True):
if non_redundant:
print(DM, L)
DM, L, ind = no_redundants(DM, L)
return (DM, L, [g for i, g in enumerate(graphlist) if i in ind])
return (DM, L, graphlist)
if __name__ == "__main__":
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment