Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Carlos GO
RNAmigos
Commits
0fbd98fe
Commit
0fbd98fe
authored
Nov 20, 2018
by
Carlos GO
Browse files
package path
parent
a21026c0
Changes
7
Hide whitespace changes
Inline
Side-by-side
RNAmigos/binding_pockets.py
View file @
0fbd98fe
...
...
@@ -10,7 +10,7 @@ import numpy as np
import
pandas
as
pd
from
Bio.PDB
import
*
from
sdf_parse
import
ligand_dict
as
sdf_info
from
.
sdf_parse
import
ligand_dict
as
sdf_info
# PDB_PATH = os.path.join("..", "Data", "rna_ligand_full_structures")
PDB_PATH
=
os
.
path
.
join
(
".."
,
"Data"
,
"rna_ligand_with_protein"
)
...
...
RNAmigos/data_embed.py
View file @
0fbd98fe
...
...
@@ -4,8 +4,8 @@ Embed graphs from a GED run.
import
pickle
from
ligand_knn
import
prepare_data
from
dissimilarity_embed
import
full_embed
from
.
ligand_knn
import
prepare_data
from
.
dissimilarity_embed
import
full_embed
def
get_DM
(
ged_pickle
):
...
...
RNAmigos/dissimilarity_embed.py
View file @
0fbd98fe
...
...
@@ -21,7 +21,7 @@ import networkx as nx
import
numpy
as
np
from
numpy.linalg
import
eig
from
rna_ged
import
ged
from
.
rna_ged
import
ged
logging
.
basicConfig
(
stream
=
sys
.
stdout
,
level
=
logging
.
DEBUG
)
...
...
RNAmigos/ligand_knn.py
View file @
0fbd98fe
...
...
@@ -44,7 +44,6 @@ def weighted_vote(labels, distances):
consensus
.
append
(
max
(
weight_count
,
key
=
weight_count
.
get
))
return
consensus
def
knn
(
ind
,
distances
,
k
=
1
):
"""
Compute indices of k-nearest neighbours to ind.
...
...
@@ -233,297 +232,6 @@ def lig_kill(DM, L, graphlist, lig):
inds
=
[
i
for
i
,
n
in
enumerate
(
graphlist
)
if
lig
not
in
n
]
return
DM
[
inds
,:][:,
inds
],
L
[
inds
],
[
g
for
i
,
g
in
enumerate
(
graphlist
)
if
i
in
inds
]
def
embed_DM
(
embeddings
):
"""
Build DM for vector embeddings.
"""
DM
=
[[
jaccard
(
embeddings
[
i
],
embeddings
[
j
])
for
j
in
range
(
len
(
embeddings
))]
for
i
in
range
(
len
(
embeddings
))]
return
DM
def
grid_accuracy
(
e
,
ms
,
ks
):
acc_dict
=
{
label
:[]
for
label
in
[
'regular'
,
'nosize'
,
'size'
,
'swap'
,
'embed'
]}
fps
=
pickle
.
load
(
open
(
f
"rna_smiles_fp_
{
e
}
ent_np_maccs.pickle"
,
"rb"
))
fps_prot
=
pickle
.
load
(
open
(
f
"rna_with_prot_maccs_fp.pickle"
,
"rb"
))
fps
=
{
**
fps
,
**
fps_prot
}
# DM, L, graphlist = prepare_data('geds_alpha_t30.pickle',
# DM, L, graphlist = prepare_data('geds_beta1_max20.pickle',
# fps, non_redundant=True)
# DM, L, graphlist = prepare_data('geds_gamma.pickle',
# fps, non_redundant=True)
DM
,
L
,
graphlist
=
prepare_data
(
'geds_delta.pickle'
,
fps
,
non_redundant
=
True
)
print
(
"data prepared"
)
# DM, L, graphlist = lig_kill(DM, L, graphlist, 'EPE')
# DM_ss_only, L_ss_only, graphlist_ss = prepare_data('geds_alphanei_skeleton.pickle', fps)
#get non-redundant graph+ss matrices
# DMs, DM_ss, L = prepare_data('geds_alpha_t30.pickle', fps,
# with_ss=True, ss_geds='geds_alphanei_skeleton.pickle')
for
m
in
ms
:
print
(
f
"m:
{
m
}
"
)
ligand_N
=
len
(
set
(
tuple
(
l
)
for
l
in
L
))
print
(
f
'no red:
{
DM
.
shape
}
'
)
print
(
len
(
np
.
unique
(
L
,
axis
=
0
)))
#regular
# acc = knn_test(DM, L,ks, m, graphlist)
# # plt.plot(acc, label='normal')
# # print('regular')
# print(acc)
# acc_dict['regular'].append(acc)
#embedding
# embeddings = pickle.load(open("test_embeddings.pickle", "rb"))
# acc = knn_test(embed_DM(embeddings), L, ks, m, graphlist)
# print('embed')
# print(acc)
# acc_dict['embed'].append(acc)
#ss
# print(DM_ss_only.shape)
# acc = knn_test(DM_ss_only, L_ss_only,ks, m)
# plt.plot(acc, label='ss only')
# print('ss only')
# print(acc)
# acc_dict['ss_only'] = acc
#ss
# acc = knn_test(DMs + DM_ss, L_ss,ks, m)
# plt.plot(acc, label='ss')
# print('ss')
# print(acc)
# acc_dict['ss'] = acc
#size distance
# print("size diff")
# sDM = size_DM(graphlist, root="../Data/pockets_gamma")
# # # sDM = np.delete(sDM, i, 0)
# # # sDM = np.delete(sDM, i, 1)
# acc = knn_test(sDM, L, ks, m, graphlist)
# acc_dict['size'].append(acc)
# print(f"size: {acc}")
# sdm = np.ravel(sDM)
# dm = np.ravel(DM)
# sdm = (sdm - np.mean(sdm)) / np.std(sdm)
# dm = (dm - np.mean(dm)) / np.std(dm)
# acc = knn_test(DM- (2* sDM), L, ks, m, graphlist)
# acc_dict['nosize'].append(acc)
# print(f"remove size: {acc}")
# # sns.distplot(sdm, label='sdm')
# sns.distplot(dm, label='dm')
# plt.legend()
# plt.show()
# plt.plot(acc, label='size difference')
# print(acc)
#pair shuffle
acc
=
knn_test
(
DM
,
L
,
ks
,
m
,
graphlist
,
swap
=
True
)
# plt.plot(acc, label='pair shuffle')
acc_dict
[
'swap'
].
append
(
acc
)
print
(
f
"swap:
{
acc
}
"
)
# print('pair shuffle')
# random ligand
# rand_ligs = np.array([np.random.randint(2, size=L.shape[1])
# for _ in range(len(L))])
# # print(rand_ligs.shape)
# acc = knn_test(DM, rand_ligs, ks, m, graphlist)
# # acc_dict['random_ligand'] = acc
# # plt.plot(acc,label='random ligand')
# print('random ligand')
# print(acc)
print
(
acc_dict
)
#ligand shuffle
# L_shuff = np.copy(L)
# for i in range(L_shuff.shape[0]):
# np.random.shuffle(L_shuff[i])
# acc = knn_test(DM, L_shuff, ks, m)
# # plt.plot(acc, label='ligand shuffle')
# print("ligand shuffle")
# acc_dict['ligand_shuffle'] = acc
# print(acc)
#DM shuffle
# distances = DM[np.triu_indices(len(DM), 1)]
# np.random.shuffle(distances)
# DM_shuff = np.zeros(DM.shape)
# DM_shuff[np.triu_indices(len(DM),1)] = distances
# DM_shuff = DM_shuff + DM_shuff.T
# acc = knn_test(DM_shuff, L, ks, m)
# acc_dict['distance_matrix_shuffle'] = acc
# plt.plot(acc, label='distance shuffle')
# print("dm shuffle")
# print(acc)
# pickle.dump(acc_dict, open(f'knn_acc_maccs_m{m}_alpha_red.pickle', 'wb'))
# plt.title("k-NN ranks (m = 10)")
# plt.xlabel("Number of neighbours (k)")
# plt.ylabel("Recall")
# plt.xticks(range(len(ks)), ks)
# # plt.ylim([0.5,0.85])
# plt.legend(loc=0)
# # plt.savefig('../Figures/alphatrain_knn_uniqueligand.pdf', format='pdf')
# plt.show()
return
acc_dict
def
prepare_data
(
geds
,
fps
,
ss_geds
=
None
,
with_ss
=
False
,
non_redundant
=
True
):
"""
Convert GEDS to distance matrix with corresponding ligand fingerprints.
Returns:
- Distance matrix of graphs
- Fingerprint matrix
- List of graph IDs
- Skeleton distance (optional)
"""
DM
,
graphlist
=
distance_matrix
(
geds
)
e
=
pickle
.
load
(
open
(
geds
,
'rb'
))
#get ligand fingerprints
ligand_ids
=
[
os
.
path
.
basename
(
p
).
split
(
"_"
)[
1
]
for
p
in
graphlist
]
found_fps
=
[
i
for
i
,
lig
in
enumerate
(
ligand_ids
)
if
lig
in
fps
]
print
(
f
"skipipng
{
len
(
ligand_ids
)
-
len
(
found_fps
)
}
missing fingerprints."
)
L
=
np
.
array
([
fps
[
lig
].
astype
(
int
)
for
ind
,
lig
in
enumerate
(
ligand_ids
)
if
ind
in
found_fps
])
print
(
L
)
print
(
f
"num of ligands by id:
{
len
(
set
(
ligand_ids
))
}
"
)
DM
=
DM
[
found_fps
,:][:,
found_fps
]
print
(
L
.
shape
)
print
(
f
"Unique fingerprints:
{
len
(
np
.
unique
(
L
,
axis
=
0
))
}
"
)
print
(
DM
.
shape
)
if
non_redundant
and
not
with_ss
:
DM
,
L
,
ind
=
no_redundants
(
DM
,
L
)
return
(
DM
,
L
,
[
g
for
i
,
g
in
enumerate
(
graphlist
)
if
i
in
ind
])
elif
with_ss
:
#build ss distance matrix from pockets
pockets
=
[
os
.
path
.
basename
(
p
)
for
p
in
graphlist
]
ss_DM_dirty
,
ss_list
=
distance_matrix
(
ss_geds
)
ss_list
=
[
os
.
path
.
basename
(
p
)
for
p
in
ss_list
]
ss_DM
=
np
.
zeros
((
len
(
DM
),
len
(
DM
)))
#get list of distances
dists
=
[]
found
=
set
()
for
g1
,
g2
in
itertools
.
combinations
(
pockets
,
2
):
ss1
,
ss2
=
(
False
,
False
)
try
:
ss1
=
ss_list
.
index
(
g1
)
except
:
pass
try
:
ss2
=
ss_list
.
index
(
g2
)
except
:
pass
if
ss1
and
ss2
:
dists
.
append
(
ss_DM_dirty
[
ss_list
.
index
(
g1
)])
found
.
add
(
g1
)
found
.
add
(
g2
)
found
=
sorted
(
list
(
found
))
ss_DM
[
np
.
triu_indices
(
len
(
found
),
1
)]
=
dists
#make symmetric
ss_DM
=
ss_DM
+
ss_DM
.
T
DM
=
DM
[
found
,:][:,
found
]
DM
,
L
,
inds
=
no_redundants
(
DM
,
L
,
with_ss
=
True
,
ss
=
ss_DM
)
return
DM
,
L
,
ss_DM
[
inds
,:][:,
inds
]
else
:
return
(
DM
,
L
,
graphlist
)
def
ss_gridsearch
():
DM
,
graphlist_1
=
distance_matrix
(
"geds_alpha_t30.pickle"
)
del_ind
=
graphlist_1
.
index
(
'../Data/pocket_pickles_interchain_5A_bfs2_nostack/4nya_2QB_S.nxpickle'
)
DM
=
np
.
delete
(
DM
,
del_ind
,
0
)
DM
=
np
.
delete
(
DM
,
del_ind
,
1
)
DM_ss
,
graphlist
=
distance_matrix
(
"geds_alphanei_skeleton.pickle"
)
# fps = pickle.load(open(f"rna_smiles_fp_04ent_np.pickle", "rb"))
fps
=
pickle
.
load
(
open
(
f
"rna_smiles_fp_0ent_np_maccs.pickle"
,
"rb"
))
ligand_ids
=
[
os
.
path
.
basename
(
p
).
split
(
"_"
)[
1
]
for
p
in
graphlist_1
]
L
=
np
.
array
([
fps
[
i
].
astype
(
int
)
for
i
in
ligand_ids
])
print
(
L
.
shape
)
twins
=
0
ok
=
0
for
i
in
range
(
len
(
DM
)):
for
j
in
range
(
i
,
len
(
DM
)):
if
i
==
j
:
continue
if
DM
[
i
][
j
]
==
0
and
jaccard
(
L
[
i
],
L
[
j
])
==
0
:
twins
+=
1
print
(
"twinzies"
)
else
:
ok
+=
1
print
(
ok
,
twins
)
# clean_DM, clean_L = unique_ligands(DM, L)
# clean_DM_ss, clean_L = unique_ligands(DM_ss, L)
# print(clean_L.shape)
# sys.exit()
# DM = clean_DM
# L = clean_L
# DM_ss = clean_DM_ss
weights
=
np
.
arange
(
1
,
10
,
0.5
)
# weights = [0, 0.5, 5]
accs
=
[]
c
=
0
for
i
,
w1
in
enumerate
(
weights
):
row
=
[]
for
j
,
w2
in
enumerate
(
weights
):
row
.
append
(
knn_validate
((
w1
*
DM
)
+
(
w2
*
DM_ss
),
L
,
k
=
5
,
m
=
20
))
# row.append(c)
c
+=
1
accs
.
append
(
row
)
print
(
w1
,
row
)
pickle
.
dump
(
accs
,
open
(
'knn_ss_weights.pickle'
,
'wb'
))
sns
.
heatmap
(
accs
,
cmap
=
'Blues'
)
plt
.
xlabel
(
"ss weight"
)
plt
.
ylabel
(
"pocket weight"
)
plt
.
xticks
(
range
(
len
(
weights
)),
weights
)
plt
.
yticks
(
range
(
len
(
weights
)),
weights
)
plt
.
tight_layout
()
# plt.savefig("../Figures/knn_ss_weights_hmp.pdf", format="pdf")
plt
.
show
()
if
__name__
==
"__main__"
:
# grid_accuracy(['02', '04', '08'], [5, 10, 15], np.arange(1, 21, 1))
accs
=
grid_accuracy
(
'0'
,
np
.
arange
(
10
,
200
,
10
),
[
1
,
3
,
5
,
7
])
print
(
accs
)
pickle
.
dump
(
accs
,
open
(
'delta_accs_post_roman.pickle'
,
'wb'
))
# grid_accuracy(['0'], [10], np.arange(1, 10, 1))
# ss_gridsearch()
# DM, graphlist = distance_matrix("geds_alpha_t30.pickle")
# fps = pickle.load(open(f"rna_smiles_fp_0ent_np_maccs.pickle", "rb"))
# ligand_ids = [os.path.basename(p).split("_")[1] for p in graphlist]
# L = np.array([fps[i].astype(int) for i in ligand_ids])
# preds = predictions(DM, L, graphlist, k=5)
# pickle.dump(preds, open('preds_test.pickle', 'wb'))
acc
=
knn_test
(
DM
,
L
,
ks
,
m
,
graphlist
)
RNAmigos/ligand_predict_models.py
deleted
100644 → 0
View file @
a21026c0
"""
Predict ligand vectors.
"""
import
sys
import
os
import
pickle
import
multiprocessing
import
numpy
as
np
from
scipy.stats
import
entropy
from
pystruct.learners
import
NSlackSSVM
from
pystruct.models
import
MultiLabelClf
from
pystruct.plot_learning
import
plot_learning
from
scipy.spatial.distance
import
jaccard
from
scipy.spatial.distance
import
pdist
from
scipy.spatial.distance
import
euclidean
from
sklearn.model_selection
import
KFold
from
sklearn.decomposition
import
PCA
from
sklearn.preprocessing
import
StandardScaler
from
sklearn.manifold
import
MDS
from
sklearn.svm
import
SVC
from
sklearn.svm
import
SVR
from
sklearn.tree
import
DecisionTreeRegressor
from
sklearn.tree
import
DecisionTreeClassifier
from
sklearn.ensemble
import
RandomForestRegressor
from
sklearn.ensemble
import
RandomForestClassifier
from
sklearn.multioutput
import
*
from
sklearn.neural_network
import
MLPClassifier
from
sklearn.neural_network
import
MLPRegressor
from
sklearn.utils
import
shuffle
from
data_embed
import
*
from
ligand_cluster
import
l_cluster
def
train_model
(
G
,
L
,
algo
=
"random_forest"
,
regress
=
False
):
"""
Train decision tree multi-output regressor on graph vectors in G and
ligand vectors in L.
"""
single_dim
=
False
if
len
(
L
.
shape
)
==
1
:
single_dim
=
True
if
algo
==
"random_forest"
:
# model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100))
if
regress
:
print
(
"random forest regressor"
)
model
=
RandomForestRegressor
(
n_estimators
=
100
)
else
:
print
(
"random forest classifier"
)
if
single_dim
:
model
=
RandomForestClassifier
(
n_estimators
=
100
)
else
:
# model = ClassifierChain(RandomForestClassifier(n_estimators=100))
model
=
RandomForestClassifier
(
n_estimators
=
100
)
if
algo
==
"decision_tree"
:
model
=
DecisionTreeClassifier
()
if
algo
==
"svm"
:
if
regress
:
print
(
"regression svm"
)
model
=
RegressorChain
(
MultiOutputRegressor
(
SVR
()))
else
:
print
(
"classifier svm"
)
model
=
ClassifierChain
(
MultiOutputClassifier
(
SVC
()))
if
algo
==
"ssvm"
:
model
=
NSlackSSVM
(
MultiLabelClf
(),
verbose
=
1
,
max_iter
=
50
)
if
algo
==
"mlp"
:
if
regress
:
print
(
"mlp regressor"
)
model
=
MLPRegressor
(
max_iter
=
2000
)
else
:
print
(
"mlp classifier"
)
if
single_dim
:
model
=
MLPClassifier
(
max_iter
=
2000
)
else
:
# model = ClassifierChain(MLPClassifier(max_iter=2000))
model
=
MLPClassifier
(
max_iter
=
2000
)
print
(
"fitting"
)
model
.
fit
(
G
,
L
)
print
(
"fitted"
)
return
model
def
output_MDS
(
L
,
n_components
=
10
):
dists
=
pdist
(
L
,
metric
=
'euclidean'
)
L_DM
=
np
.
zeros
((
len
(
L
),
len
(
L
)))
L_DM
[
np
.
triu_indices
(
len
(
L
),
1
)]
=
dists
L_DM
=
L_DM
+
L_DM
.
T
mds
=
MDS
(
n_components
=
n_components
,
dissimilarity
=
"precomputed"
)
mds
.
fit
(
L_DM
)
L_mds
=
mds
.
fit_transform
(
L_DM
)
return
np
.
array
(
L_mds
),
mds
def
output_PCA
(
L
,
n_components
=
10
):
# pca = PCA(n_components=n_components)
# pca.fit(L)
# L_pca = pca.fit_transform(L)
scal
=
StandardScaler
()
L_t
=
scal
.
fit_transform
(
L
)
# print(L_t)
print
(
L
)
print
(
scal
.
inverse_transform
(
L_t
))
return
np
.
array
(
L_t
),
scal
def
entropy_cutoff
(
L
,
cutoff
):
"""
Cut out columns of ligand matrix by entropy cutoff
"""
labels
=
L
.
T
entropies
=
[]
for
l
in
labels
:
value
,
counts
=
np
.
unique
(
l
,
return_counts
=
True
)
entropies
.
append
(
entropy
(
counts
,
base
=
2
))
return
L
[:,
[
i
for
i
,
e
in
enumerate
(
entropies
)
if
e
>
cutoff
]]
def
output_preprocess
(
L
,
dim_reduce
=
None
,
entropy
=
0.4
):
if
dim_reduce
==
"mds"
:
L
,
mds
=
output_MDS
(
L
,
n_components
=
4
)
return
L
,
mds
elif
dim_reduce
==
"pca"
:
print
(
"PCA"
)
L
,
pca
=
output_PCA
(
L
,
n_components
=
3
)
return
L
,
pca
elif
dim_reduce
==
"entropy"
:
L_reduced
=
entropy_cutoff
(
L
,
entropy
)
return
L_reduced
,
L
else
:
return
L
def
input_scale
(
G
):
scaler
=
StandardScaler
()
return
scaler
.
fit_transform
(
G
)
def
enrichment
(
pred
,
true
,
L_unique
,
true_ind
=-
1
,
k
=
5
):
"""
Compute enrichment factor of predicted ligand.
"""
#compute distance from prediction to every other ligand
if
true_ind
>
-
1
:
distances
=
[
euclidean
(
pred
,
l
)
for
l
in
L_unique
]
else
:
distances
=
[
jaccard
(
pred
,
l
)
for
l
in
L_unique
]
# print(distances)
#list of indices in L
inds
=
list
(
range
(
len
(
L_unique
)))
#sort ligands by proximity to predicted
L_sort
=
[
l
for
l
,
_
,
_
in
sorted
(
zip
(
L_unique
,
distances
,
inds
),
key
=
lambda
x
:
x
[
1
])]
if
true_ind
>
-
1
:
neighbour_inds
=
[
ind
for
_
,
ind
in
L_sort
[:
k
]]
# print(f"searching for {true_ind} in {neighbour_inds}")
return
true_ind
in
neighbour_inds
else
:
# for v,_ in L_sort[:k]:
for
v
in
L_sort
[:
k
]:
if
(
v
==
true
).
all
():
return
1
return
0
def
ligand_screen
(
y_pred
,
y_pred_inds
,
L
,
graphlist
,
ind_map
,
m
=
20
,
transformed
=
False
):
"""
Perform ligand screen with predicted fingerprints with nearest neighbour search.
"""
preds
=
[]
enrich
=
[]
positives
=
[]
negatives
=
[]
L_unique
=
np
.
unique
(
L
,
axis
=
0
)
for
pred
,
i
in
zip
(
y_pred
,
y_pred_inds
):
tru
=
L
[
i
]
if
transformed
:
en
=
enrichment
(
pred
,
tru
,
L_unique
,
k
=
m
,
true_ind
=
i
)
else
:
en
=
enrichment
(
pred
,
tru
,
L_unique
,
k
=
m
)
name
=
os
.
path
.
basename
(
graphlist
[
ind_map
[
i
]]).
rstrip
(
".nxpickle"
)
if
en
==
1
:
positives
.
append
(
name
)
else
:
negatives
.
append
(
name
)
enrich
.
append
(
en
)
return
np
.
mean
(
enrich
),
positives
,
negatives
def
k_fold
(
G
,
L
,
graphlist
,
algo
=
"random_forest"
,
splits
=
10
,
screen
=
True
,
scale
=
True
,
dim_reduce
=
None
,
entropy
=
0.4
,
exp
=
'exp'
):
kf
=
KFold
(
n_splits
=
splits
)
G
=
[(
g
,
i
)
for
i
,
g
in
enumerate
(
G
)]
L
=
[(
l
,
i
)
for
i
,
l
in
enumerate
(
L
)]
G
,
L
=
shuffle
(
G
,
L
)
#store the original index of each point
ind_map
=
{
i
:
g
[
1
]
for
i
,
g
in
enumerate
(
G
)}
#remove the indices from data matrices
G
=
[
g
for
g
,
_
in
G
]
L
=
[
l
for
l
,
_
in
L
]
G
=
np
.
array
(
G
)
L
=
np
.
array
(
L
)
if
scale
:
G
=
input_scale
(
G
)
regress
=
False
if
dim_reduce
:
L
,
space
=
output_preprocess
(
L
,
dim_reduce
=
dim_reduce
,
entropy
=
entropy
)
if
dim_reduce
in
[
'mds'
,
'pca'
]:
print
(
"doing regression"
)
regress
=
True
print
(
f
"new output dimension:
{
L
.
shape
}
"
)
ms
=
np
.
arange
(
10
,
200
,
10
)
if
screen
:
accs
=
{
m
:[]
for
m
in
ms
}
poss
=
{
m
:[]
for
m
in
ms
}
negs
=
{
m
:[]
for
m
in
ms
}
else
:
accs
=
[]
poss
=
[]
negs
=
[]
fold_count
=
0
models
=
[]
for
train_index
,
test_index
in
kf
.
split
(
G
):
X_train
,
X_test
=
G
[
train_index
],
G
[
test_index
]
y_train
,
y_test
=
L
[
train_index
],
L
[
test_index
]
model
=
train_model
(
X_train
,
y_train
,
algo
=
algo
,
regress
=
regress
)
y_pred
=
model
.
predict
(
X_test
)
models
=
(
model
,
y_pred
,
test_index
,
G
[
test_index
],
L
[
test_index
])
pickle
.
dump
(
models
,
open
(
f
'../Data/
{
exp
}
_
{
algo
}
_
{
fold_count
}
_models.pickle'
,
'wb'
))
if
screen
:
for
m
in
ms
:
if
dim_reduce
in
[
'pca'
,
'mds'
]:
acc
,
pos
,
neg
=
ligand_screen
(
y_pred
,
test_index
,
L
,
graphlist
,
ind_map
,
transformed
=
True
,
m
=
m
)
else
:
acc
,
pos
,
neg
=
ligand_screen
(
y_pred
,
test_index
,
L
,
graphlist
,
ind_map
,
m
=
m
)
accs
[
m
].
append
(
acc
)
poss
[
m
].
append
(
pos
)
negs
[
m
].
append
(
neg
)
print
(
f
"fold
{
fold_count
+
1
}
of
{
splits
}
"
)
fold_count
+=
1
else
:
#clustering
sc
=
model
.
score
(
X_test
,
y_test
)
accs
.
append
(
sc
)
if
screen
:
return
([(
np
.
mean
(
accs
[
m
]),
np
.
std
(
accs
[
m
]))
for
m
in
ms
],
models
,
ind_map
,
poss
,
negs
)
else
:
return
((
np
.
mean
(
accs
),
np
.
std
(
accs
)),
models
,
indices
)
def
embed_size_run
(
args
):
(
embedding_size
,
embedding
,
DM
,
graphlist
,
L
,
task
,
splits
,
algo
,
output_clusters
,
swap
,
graphlist
)
=
args
G
=
embed
(
DM
,
graphlist
,
embedding_size
,
method
=
embedding
)