Commit 2779ff55 by Carlos GO

first commit

parent 04a39122
 """ Perform graph dissimilarity embedding of graphs. 1. Select a subset size N of graphs from full set to be 'prototype' graph set P. 2. For each new graph, embedding vector v is a real vector of size N where each entry v_i in v is the distance from G to P_i """ import sys import time import logging from random import randint import multiprocessing import cProfile import pstats import uuid import random import networkx as nx import numpy as np from numpy.linalg import eig from spectral_distance import * logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) PROCS = 15 def row_compute(args): """ Compute distances """ D, i = args logging.info(f"Computing distance for graph {i+1} of {len(D)}") dist = np.zeros((len(D))) for j in range(i, len(D)): d = graph_distance_evec(D[i], D[j]) dist[j] = d return dist def dummy(args): cProfile.runctx('row_compute(args)', globals(), locals(),\ f'prof_{random.randint(0,20)}.prof') def graph_eigen(G): L = graph_laplacian(G) return eig(L) def distance_matrix_para(D): eigens = [] #compute list of eigendecompositions for each graph. print("diagonalizing laplacians") with multiprocessing.Pool(PROCS) as pool: for i, v in enumerate(pool.map(graph_eigen, D)): eigens.append(v) #fill distance matrix dist = np.zeros((len(D), len(D))) todo = ((eigens, i) for i in range(dist.shape[0])) with multiprocessing.Pool(PROCS) as pool: for i, v in enumerate(pool.map(row_compute, todo)): dist[i] = v #make symmetric i_lower = np.tril_indices(dist.shape[0], -1) dist[i_lower] = dist.T[i_lower] return dist def distance_matrix(D): """ Compute distance matrix for list of graphs in D. """ dist = np.zeros((len(D), len(D))) for i in range(dist.shape[0]): logging.info(f"Computing distance for graph {i+1} of {len(D)}") for j in range(i, dist.shape[0]): d = graph_distance(D[i], D[j]) dist[i][j] = d dist[j][i] = d return dist def median_graph(DM): """ Rkketurn median graph as: median(D) = argmin_g1 \sum_{g2} d(g1, g2). Graph whose distance to all other graphs is minimal. """ return np.argmin(np.sum(DM, axis=1)) def center_graph(DM, mask=None, masked=False): """ Return center graph as; center(D) = argmin_g1 max_g2 d(g1,g2) """ if masked: mask = mask else: mask = np.zeros(len(DM)) return np.argmin(np.ma.array([max([DM[i][j] for i in range(len(DM))]) for j in range(len(DM))], mask=mask)) def spanning_selection(DM, m): median = median_graph(DM) proto_indices = [median] tot_ind = list(np.arange(len(DM))) d_indices = set(tot_ind) - {median} #get point furtherst from prototype set. while len(proto_indices) < m: proto = np.argmax( np.ma.array([min([DM[i][p] for p in proto_indices]) for i in tot_ind], mask=np.isin(tot_ind, proto_indices) )) proto_indices.append(proto) return proto_indices def random_proto(DM, k): """ Random prototype selection. """ return np.random.choice(list(range(len(DM))), size=k, replace=False) def k_centers(DM, k, return_assignments=False): """ k-centers selection algorithm. """ protos = spanning_selection(DM, k) print(f"protos: {protos}") print(DM) inds = list(range(len(DM))) protos = np.random.choice(inds, size=k, replace=False) while True: #find the closest graph to a center and add to centers centers = [{p} for p in protos] for g in range(len(DM)): if g in protos: continue nearest_proto_ind = np.argmin([DM[g][c] for c in protos]) centers[nearest_proto_ind].add(g) # print(f"current protos: {protos}") # print(f"current centers: {centers}") num_changed = 0 for i,cs in enumerate(centers): # clean_D = DM[indices,:][:, indices] # print(DM[list(c):,][:,list(c)]) mask = np.logical_not(np.isin(inds, list(cs))) # print(f"cs: {cs}") # print(f"mask: {mask}") center = center_graph(DM, masked=True, mask=mask) # print(f"center graph {i}: {center}") if center != protos[i]: num_changed += 1 protos[i] = center print(f"changed: {num_changed} of {len(protos)}") if num_changed == 0: if return_assignments: return protos, centers else: return protos # return protos def prototypes(D, m, DM, heuristic='sphere'): """ Compute set of m prototype graphs. Input: list (D): list of original graphs int (m): number of prototypes to select. np array (DM): Distance matrix. Returns: list: list with nx graphs forming prototype set. """ logging.info(f"Using {heuristic} heuristic") if heuristic == 'sphere': """ Select prototypes from a sphere induced on the dataset. """ prototypes = [] logging.info("Computing distance matrix...") logging.info("Finding center of graph set") #get center graph distances = np.sum(DM, axis=1) center_index = np.argmin(distances) center = D[center_index] #get graph furthest from center border_index = np.argmax(DM[center_index]) border = D[border_index] radius = DM[center_index][border_index] #define interval along radius interval = radius / m proto_indices = [] prototypes += [center, border] proto_indices += [center_index, border_index] mask = np.zeros(DM.shape[0]) mask[border_index] = 1 mask[center_index] = 1 center_ref = np.ma.MaskedArray(DM[center_index], mask) logging.info("Obtaining prototype graphs...") for i in range(m-2): border_dist = abs(center_ref - (i*interval)) dist_mask = np.ma.MaskedArray(border_dist, mask) proto_index = dist_mask.argmin() proto_indices.append(proto_index) prototypes.append(D[proto_index]) #mask the prototype we selected mask[proto_index] = 1 return proto_indices if heuristic == 'spanning': return spanning_selection(DM, m) if heuristic == "k-centers": return k_centers(DM, m) if heuristic == "random": return random_proto(DM, m) pass def graph_embed(args): """ Embed graph G given prototype set P. Returns: array: numpy array representing embedding vector. """ i,G,P, = args return (i, np.array([graph_distance(G, p) for p in P])) def full_embed(D, m, DM=None, dist_mat=None, heuristic='spanning'): """ For a dataset of graphs D, perform prototype selection and embedding. Returns: matrix: numpy matrix where each row is the embedding of a graph from D. """ if dist_mat == None: DM = distance_matrix_para(D) P_idx = prototypes(D, m, DM, heuristic=heuristic) print(P_idx) logging.info("Embedding graphs.") # embeddings = [graph_embed(g, P) for g in D] # embeddings = [graph_embed(g, P) for g in D] # todo = ((i,g,P) for i,g in enumerate(D)) embeddings = np.zeros((len(D), m)) for i, g in enumerate(D): embeddings[i] = DM[i][P_idx] # with multiprocessing.Pool(PROCS) as pool: # for e in pool.map(graph_embed, todo): # i,embed = e # embeddings[i] = embed return embeddings if __name__ == "__main__": logging.info("Generating random graphs.") D = [nx.erdos_renyi_graph(randint(10, 20), .7) for _ in range(5)] # print(full_embed(D, 5)) cProfile.run('full_embed(D, 5)', 'runstats') p = pstats.Stats('runstats') p.sort_stats('time').print_stats(10) pass