Commit 0fbd98fe authored by Carlos GO

package path

parent a21026c0
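The commit switches the scripts' sibling imports from absolute to explicit relative form so they resolve when the directory is loaded as a package. A minimal sketch of the assumed layout (the package name is hypothetical; only the module names appear in the hunks below):

# Assumed layout (package name "rna_pockets" is hypothetical):
#   rna_pockets/
#       __init__.py
#       sdf_parse.py
#       ligand_knn.py
#       dissimilarity_embed.py
#       rna_ged.py
# With explicit relative imports, the modules are run through the package,
# e.g. `python -m rna_pockets.ligand_knn`, rather than as loose scripts
# from inside the directory.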
@@ -10,7 +10,7 @@ import numpy as np
import pandas as pd
from Bio.PDB import *
-from sdf_parse import ligand_dict as sdf_info
+from .sdf_parse import ligand_dict as sdf_info
# PDB_PATH = os.path.join("..", "Data", "rna_ligand_full_structures")
PDB_PATH = os.path.join("..", "Data", "rna_ligand_with_protein")
@@ -4,8 +4,8 @@ Embed graphs from a GED run.
import pickle
-from ligand_knn import prepare_data
-from dissimilarity_embed import full_embed
+from .ligand_knn import prepare_data
+from .dissimilarity_embed import full_embed
def get_DM(ged_pickle):
@@ -21,7 +21,7 @@ import networkx as nx
import numpy as np
from numpy.linalg import eig
-from rna_ged import ged
+from .rna_ged import ged
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -44,7 +44,6 @@ def weighted_vote(labels, distances):
consensus.append(max(weight_count, key=weight_count.get))
return consensus
def knn(ind, distances, k=1):
"""
Compute indices of k-nearest neighbours to ind.
@@ -233,297 +232,6 @@ def lig_kill(DM, L, graphlist, lig):
inds = [i for i,n in enumerate(graphlist) if lig not in n]
return DM[inds,:][:,inds], L[inds], [g for i,g in enumerate(graphlist)
if i in inds]
def embed_DM(embeddings):
"""
Build DM for vector embeddings.
"""
DM = [[jaccard(embeddings[i], embeddings[j]) for j in range(len(embeddings))]
for i in range(len(embeddings))]
return DM
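# For reference, the same matrix can be built with scipy (a sketch, not part
# of the original pipeline); pdist returns the condensed upper triangle and
# squareform mirrors it into a full symmetric matrix with a zero diagonal:
#   from scipy.spatial.distance import pdist, squareform
#   DM = squareform(pdist(np.asarray(embeddings, dtype=bool), metric='jaccard'))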
def grid_accuracy(e, ms, ks):
acc_dict = {label:[] for label in ['regular', 'nosize', 'size', 'swap', 'embed']}
fps = pickle.load(open(f"rna_smiles_fp_{e}ent_np_maccs.pickle", "rb"))
fps_prot = pickle.load(open("rna_with_prot_maccs_fp.pickle", "rb"))
fps = {**fps, **fps_prot}
# DM, L, graphlist = prepare_data('geds_alpha_t30.pickle',
# DM, L, graphlist = prepare_data('geds_beta1_max20.pickle',
# fps, non_redundant=True)
# DM, L, graphlist = prepare_data('geds_gamma.pickle',
# fps, non_redundant=True)
DM, L, graphlist = prepare_data('geds_delta.pickle',
fps, non_redundant=True)
print("data prepared")
# DM, L, graphlist = lig_kill(DM, L, graphlist, 'EPE')
# DM_ss_only, L_ss_only, graphlist_ss = prepare_data('geds_alphanei_skeleton.pickle', fps)
#get non-redundant graph+ss matrices
# DMs, DM_ss, L = prepare_data('geds_alpha_t30.pickle', fps,
# with_ss=True, ss_geds='geds_alphanei_skeleton.pickle')
for m in ms:
print(f"m: {m}")
ligand_N = len(set(tuple(l) for l in L))
print(f'no red: {DM.shape}')
print(f'unique ligands: {ligand_N}')
#regular
# acc = knn_test(DM, L,ks, m, graphlist)
# # plt.plot(acc, label='normal')
# # print('regular')
# print(acc)
# acc_dict['regular'].append(acc)
#embedding
# embeddings = pickle.load(open("test_embeddings.pickle", "rb"))
# acc = knn_test(embed_DM(embeddings), L, ks, m, graphlist)
# print('embed')
# print(acc)
# acc_dict['embed'].append(acc)
#ss
# print(DM_ss_only.shape)
# acc = knn_test(DM_ss_only, L_ss_only,ks, m)
# plt.plot(acc, label='ss only')
# print('ss only')
# print(acc)
# acc_dict['ss_only'] = acc
#ss
# acc = knn_test(DMs + DM_ss, L_ss,ks, m)
# plt.plot(acc, label='ss')
# print('ss')
# print(acc)
# acc_dict['ss'] = acc
#size distance
# print("size diff")
# sDM = size_DM(graphlist, root="../Data/pockets_gamma")
# # # sDM = np.delete(sDM, i, 0)
# # # sDM = np.delete(sDM, i, 1)
# acc = knn_test(sDM, L, ks, m, graphlist)
# acc_dict['size'].append(acc)
# print(f"size: {acc}")
# sdm = np.ravel(sDM)
# dm = np.ravel(DM)
# sdm = (sdm - np.mean(sdm)) / np.std(sdm)
# dm = (dm - np.mean(dm)) / np.std(dm)
# acc = knn_test(DM- (2* sDM), L, ks, m, graphlist)
# acc_dict['nosize'].append(acc)
# print(f"remove size: {acc}")
# # sns.distplot(sdm, label='sdm')
# sns.distplot(dm, label='dm')
# plt.legend()
# plt.show()
# plt.plot(acc, label='size difference')
# print(acc)
#pair shuffle
acc = knn_test(DM, L, ks, m, graphlist, swap=True)
# plt.plot(acc, label='pair shuffle')
acc_dict['swap'].append(acc)
print(f"swap: {acc}")
# print('pair shuffle')
# random ligand
# rand_ligs = np.array([np.random.randint(2, size=L.shape[1])
# for _ in range(len(L))])
# # print(rand_ligs.shape)
# acc = knn_test(DM, rand_ligs, ks, m, graphlist)
# # acc_dict['random_ligand'] = acc
# # plt.plot(acc,label='random ligand')
# print('random ligand')
# print(acc)
print(acc_dict)
#ligand shuffle
# L_shuff = np.copy(L)
# for i in range(L_shuff.shape[0]):
# np.random.shuffle(L_shuff[i])
# acc = knn_test(DM, L_shuff, ks, m)
# # plt.plot(acc, label='ligand shuffle')
# print("ligand shuffle")
# acc_dict['ligand_shuffle'] = acc
# print(acc)
#DM shuffle
# distances = DM[np.triu_indices(len(DM), 1)]
# np.random.shuffle(distances)
# DM_shuff = np.zeros(DM.shape)
# DM_shuff[np.triu_indices(len(DM),1)] = distances
# DM_shuff = DM_shuff + DM_shuff.T
# acc = knn_test(DM_shuff, L, ks, m)
# acc_dict['distance_matrix_shuffle'] = acc
# plt.plot(acc, label='distance shuffle')
# print("dm shuffle")
# print(acc)
# pickle.dump(acc_dict, open(f'knn_acc_maccs_m{m}_alpha_red.pickle', 'wb'))
# plt.title("k-NN ranks (m = 10)")
# plt.xlabel("Number of neighbours (k)")
# plt.ylabel("Recall")
# plt.xticks(range(len(ks)), ks)
# # plt.ylim([0.5,0.85])
# plt.legend(loc=0)
# # plt.savefig('../Figures/alphatrain_knn_uniqueligand.pdf', format='pdf')
# plt.show()
return acc_dict
def prepare_data(geds, fps, ss_geds=None, with_ss=False, non_redundant=True):
"""
Convert GEDs to a distance matrix with corresponding ligand fingerprints.
Returns:
- Distance matrix of graphs
- Fingerprint matrix
- List of graph IDs
- Skeleton distance (optional)
"""
DM, graphlist = distance_matrix(geds)
e = pickle.load(open(geds, 'rb'))
#get ligand fingerprints
ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist]
found_fps = [i for i, lig in enumerate(ligand_ids)
             if lig in fps]
print(f"skipping {len(ligand_ids) - len(found_fps)} missing fingerprints.")
L = np.array([fps[lig].astype(int) for ind, lig in enumerate(ligand_ids)
if ind in found_fps])
print(L)
print(f"num of ligands by id: {len(set(ligand_ids))}")
DM = DM[found_fps,:][:, found_fps]
#keep the graph list aligned with the filtered matrices
graphlist = [graphlist[i] for i in found_fps]
print(L.shape)
print(f"Unique fingerprints: {len(np.unique(L, axis=0))}")
print(DM.shape)
if non_redundant and not with_ss:
DM, L, ind = no_redundants(DM, L)
return (DM, L, [g for i,g in enumerate(graphlist) if i in ind])
elif with_ss:
#build ss distance matrix from pockets
pockets = [os.path.basename(p) for p in graphlist]
ss_DM_dirty, ss_list = distance_matrix(ss_geds)
ss_list = [os.path.basename(p) for p in ss_list]
#get list of distances, skipping pairs missing from the skeleton GED run
dists = []
found = set()
for g1, g2 in itertools.combinations(pockets, 2):
    #index 0 is a valid position, so catch ValueError rather than
    #testing the indices for truthiness
    try:
        ss1 = ss_list.index(g1)
        ss2 = ss_list.index(g2)
    except ValueError:
        continue
    dists.append(ss_DM_dirty[ss1][ss2])
    found.add(g1)
    found.add(g2)
#indices (not names) of the pockets that have a skeleton distance
found = sorted(pockets.index(g) for g in found)
ss_DM = np.zeros((len(found), len(found)))
ss_DM[np.triu_indices(len(found), 1)] = dists
#make symmetric
ss_DM = ss_DM + ss_DM.T
DM = DM[found,:][:,found]
#keep fingerprints aligned with the filtered distance matrix
L = L[found]
DM, L, inds = no_redundants(DM, L, with_ss=True, ss=ss_DM)
return DM, L, ss_DM[inds,:][:,inds]
else:
return (DM, L, graphlist)
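# Usage sketch, matching the call in grid_accuracy above:
#   DM, L, graphlist = prepare_data('geds_delta.pickle', fps, non_redundant=True)
# DM is the NxN GED matrix over pockets, L the matching NxF fingerprint
# matrix, and graphlist the pocket paths kept after dropping redundant rows.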
def ss_gridsearch():
DM, graphlist_1 = distance_matrix("geds_alpha_t30.pickle")
del_ind = graphlist_1.index(
'../Data/pocket_pickles_interchain_5A_bfs2_nostack/4nya_2QB_S.nxpickle')
DM = np.delete(DM, del_ind, 0)
DM = np.delete(DM, del_ind, 1)
DM_ss, graphlist = distance_matrix("geds_alphanei_skeleton.pickle")
# fps = pickle.load(open(f"rna_smiles_fp_04ent_np.pickle", "rb"))
fps = pickle.load(open("rna_smiles_fp_0ent_np_maccs.pickle", "rb"))
ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist_1]
L = np.array([fps[i].astype(int) for i in ligand_ids])
print(L.shape)
twins = 0
ok = 0
for i in range(len(DM)):
for j in range(i + 1, len(DM)):
if DM[i][j] == 0 and jaccard(L[i], L[j]) == 0:
twins += 1
print("twinzies")
else:
ok += 1
print(ok, twins)
# clean_DM, clean_L = unique_ligands(DM, L)
# clean_DM_ss, clean_L = unique_ligands(DM_ss, L)
# print(clean_L.shape)
# sys.exit()
# DM = clean_DM
# L = clean_L
# DM_ss = clean_DM_ss
weights = np.arange(1, 10, 0.5)
# weights = [0, 0.5, 5]
accs = []
c = 0
for i, w1 in enumerate(weights):
row = []
for j,w2 in enumerate(weights):
row.append(knn_validate((w1 * DM) +
(w2 * DM_ss), L, k=5, m=20))
# row.append(c)
c += 1
accs.append(row)
print(w1, row)
pickle.dump(accs, open('knn_ss_weights.pickle', 'wb'))
sns.heatmap(accs, cmap='Blues')
plt.xlabel("ss weight")
plt.ylabel("pocket weight")
plt.xticks(range(len(weights)), weights)
plt.yticks(range(len(weights)), weights)
plt.tight_layout()
# plt.savefig("../Figures/knn_ss_weights_hmp.pdf", format="pdf")
plt.show()
if __name__ == "__main__":
# grid_accuracy(['02', '04', '08'], [5, 10, 15], np.arange(1, 21, 1))
accs = grid_accuracy('0', np.arange(10, 200, 10), [1,3,5,7])
print(accs)
pickle.dump(accs, open('delta_accs_post_roman.pickle', 'wb'))
# grid_accuracy(['0'], [10], np.arange(1, 10, 1))
# ss_gridsearch()
# DM, graphlist = distance_matrix("geds_alpha_t30.pickle")
# fps = pickle.load(open(f"rna_smiles_fp_0ent_np_maccs.pickle", "rb"))
# ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist]
# L = np.array([fps[i].astype(int) for i in ligand_ids])
# preds = predictions(DM, L, graphlist, k=5)
# pickle.dump(preds, open('preds_test.pickle', 'wb'))
# acc = knn_test(DM, L, ks, m, graphlist)
"""
Predict ligand vectors.
"""
import sys
import os
import pickle
import multiprocessing
import numpy as np
from scipy.stats import entropy
from pystruct.learners import NSlackSSVM
from pystruct.models import MultiLabelClf
from pystruct.plot_learning import plot_learning
from scipy.spatial.distance import jaccard
from scipy.spatial.distance import pdist
from scipy.spatial.distance import euclidean
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import MDS
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import (MultiOutputRegressor, MultiOutputClassifier,
                                 ClassifierChain, RegressorChain)
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.utils import shuffle
from data_embed import *
from ligand_cluster import l_cluster
def train_model(G, L, algo="random_forest", regress=False):
"""
Train decision tree multi-output regressor on graph vectors in G and
ligand vectors in L.
"""
single_dim = False
if len(L.shape) == 1:
single_dim = True
if algo == "random_forest":
# model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100))
if regress:
print("random forest regressor")
model = RandomForestRegressor(n_estimators=100)
else:
print("random forest classifier")
if single_dim:
model = RandomForestClassifier(n_estimators=100)
else:
# model = ClassifierChain(RandomForestClassifier(n_estimators=100))
model = RandomForestClassifier(n_estimators=100)
if algo == "decision_tree":
model = DecisionTreeClassifier()
if algo == "svm":
if regress:
    print("regression svm")
    model = RegressorChain(SVR())
else:
    print("classifier svm")
    model = ClassifierChain(SVC())
if algo == "ssvm":
model = NSlackSSVM(MultiLabelClf(), verbose=1, max_iter=50)
if algo == "mlp":
if regress:
print("mlp regressor")
model = MLPRegressor(max_iter=2000)
else:
print("mlp classifier")
if single_dim:
model = MLPClassifier(max_iter=2000)
else:
# model = ClassifierChain(MLPClassifier(max_iter=2000))
model = MLPClassifier(max_iter=2000)
print("fitting")
model.fit(G, L)
print("fitted")
return model
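# Usage sketch (G_train/G_test and L_train are hypothetical splits of the
# graph-embedding and fingerprint matrices):
#   model = train_model(G_train, L_train, algo="random_forest", regress=False)
#   y_pred = model.predict(G_test)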
def output_MDS(L, n_components=10):
dists = pdist(L, metric='euclidean')
L_DM = np.zeros((len(L), len(L)))
L_DM[np.triu_indices(len(L), 1)] = dists
L_DM = L_DM + L_DM.T
mds = MDS(n_components=n_components, dissimilarity="precomputed")
L_mds = mds.fit_transform(L_DM)
return np.array(L_mds), mds
def output_PCA(L, n_components=10):
    #PCA itself is commented out for now; this just standard-scales the outputs
    # pca = PCA(n_components=n_components)
    # pca.fit(L)
    # L_pca = pca.fit_transform(L)
scal = StandardScaler()
L_t = scal.fit_transform(L)
# print(L_t)
print(L)
print(scal.inverse_transform(L_t))
return np.array(L_t), scal
def entropy_cutoff(L, cutoff):
"""
Cut out columns of ligand matrix by entropy cutoff
"""
labels = L.T
entropies = []
for l in labels:
_, counts = np.unique(l, return_counts=True)
entropies.append(entropy(counts, base=2))
return L[:, [i for i, e in enumerate(entropies) if e > cutoff]]
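# Worked example (a sketch): the column entropies below are 0, 1.0 and
# ~0.81 bits, so a 0.4 cutoff drops only the constant first column:
#   L = np.array([[0, 0, 1],
#                 [0, 1, 1],
#                 [0, 1, 0],
#                 [0, 0, 1]])
#   entropy_cutoff(L, 0.4)  # keeps columns 1 and 2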
def output_preprocess(L, dim_reduce=None, entropy=0.4):
if dim_reduce == "mds":
L, mds = output_MDS(L, n_components=4)
return L, mds
elif dim_reduce == "pca":
print("PCA")
L, pca = output_PCA(L, n_components=3)
return L, pca
elif dim_reduce == "entropy":
L_reduced = entropy_cutoff(L, entropy)
return L_reduced, L
else:
return L
def input_scale(G):
scaler = StandardScaler()
return scaler.fit_transform(G)
def enrichment(pred, true, L_unique, true_ind=-1, k=5):
"""
Compute enrichment factor of predicted ligand.
"""
#compute distance from prediction to every other ligand
if true_ind > -1:
distances = [euclidean(pred, l) for l in L_unique]
else:
distances = [jaccard(pred, l) for l in L_unique]
# print(distances)
#list of indices in L
inds = list(range(len(L_unique)))
#sort ligands by proximity to predicted
L_sort = [l for l,_,_ in sorted(zip(L_unique, distances, inds), key=lambda x:x[1])]
if true_ind > -1:
neighbour_inds = [ind for _, ind in L_sort[:k]]
# print(f"searching for {true_ind} in {neighbour_inds}")
return true_ind in neighbour_inds
else:
# for v,_ in L_sort[:k]:
for v in L_sort[:k]:
if (v == true).all():
return 1
return 0
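# Usage sketch: with binary fingerprints, a prediction counts as a hit when
# the true ligand is among its k nearest unique ligands by Jaccard distance
# (y_pred and test_index as produced in k_fold below):
#   L_unique = np.unique(L, axis=0)
#   hit = enrichment(y_pred[0], L[test_index[0]], L_unique, k=5)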
def ligand_screen(y_pred, y_pred_inds, L, graphlist, ind_map, m=20, transformed=False):
"""
Perform ligand screen with predicted fingerprints with nearest neighbour search.
"""
enrich = []
positives = []
negatives = []
L_unique = np.unique(L, axis=0)
for pred, i in zip(y_pred, y_pred_inds):
tru = L[i]
if transformed:
en = enrichment(pred, tru, L_unique, k=m, true_ind=i)
else:
en = enrichment(pred, tru, L_unique, k=m)
name = os.path.basename(graphlist[ind_map[i]]).replace(".nxpickle", "")
if en == 1:
positives.append(name)
else:
negatives.append(name)
enrich.append(en)
return np.mean(enrich), positives, negatives
def k_fold(G, L, graphlist, algo="random_forest",
splits=10, screen=True, scale=True, dim_reduce=None,
entropy=0.4, exp='exp'):
kf = KFold(n_splits=splits)
G = [(g, i) for i, g in enumerate(G)]
L = [(l, i) for i, l in enumerate(L)]
G, L = shuffle(G, L)
#store the original index of each point
ind_map = {i:g[1] for i, g in enumerate(G)}
#remove the indices from data matrices
G = [g for g,_ in G]
L = [l for l,_ in L]
G = np.array(G)
L = np.array(L)
if scale:
G = input_scale(G)
regress = False
if dim_reduce:
L, space = output_preprocess(L, dim_reduce=dim_reduce, entropy=entropy)
if dim_reduce in ['mds', 'pca']:
print("doing regression")
regress = True
print(f"new output dimension: {L.shape}")
ms = np.arange(10, 200, 10)
if screen:
accs = {m:[] for m in ms}
poss = {m:[] for m in ms}
negs = {m:[] for m in ms}
else:
accs = []
poss = []
negs = []
fold_count = 0
models = []
for train_index, test_index in kf.split(G):
X_train, X_test = G[train_index], G[test_index]
y_train, y_test = L[train_index], L[test_index]
model = train_model(X_train, y_train, algo=algo, regress=regress)
y_pred = model.predict(X_test)
fold_data = (model, y_pred, test_index, G[test_index], L[test_index])
models.append(fold_data)
pickle.dump(fold_data, open(f'../Data/{exp}_{algo}_{fold_count}_models.pickle', 'wb'))
if screen:
for m in ms:
if dim_reduce in ['pca', 'mds']:
acc, pos, neg = ligand_screen(y_pred, test_index, L, graphlist,
ind_map, transformed=True, m=m)
else:
acc, pos, neg = ligand_screen(y_pred, test_index, L, graphlist,
                              ind_map, m=m)
accs[m].append(acc)
poss[m].append(pos)
negs[m].append(neg)
print(f"fold {fold_count + 1} of {splits}")
fold_count += 1
else:
#clustering
sc = model.score(X_test, y_test)
accs.append(sc)
if screen:
    return ([(np.mean(accs[m]), np.std(accs[m])) for m in ms],
            models, ind_map, poss, negs)
else:
    return ((np.mean(accs), np.std(accs)), models, ind_map)
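# Usage sketch, unpacking the screen=True return value:
#   accs, models, ind_map, poss, negs = k_fold(G, L, graphlist,
#                                              algo="random_forest", splits=10)
#   # accs: [(mean, std) per screening size m]; poss/negs: hit/miss pockets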
def embed_size_run(args):
(embedding_size, embedding, DM, graphlist, L, task, splits, algo,
 output_clusters, swap, graphlist) = args
G = embed(DM, graphlist, embedding_size, method=embedding)