Commit 443bf790 authored by Roman Sarrazin-Gendron's avatar Roman Sarrazin-Gendron
Browse files

data analysis backup

parent 959ec15b
This diff is collapsed.
......@@ -85,17 +85,11 @@ plt.show()
"""
"""
"""
to_sort = []
motifes = []
a = pickle.load(open("longseqs_crossval.cPickle","rb"))
......@@ -131,9 +125,51 @@ fig, ax = plt.subplots()
"""
print("-----------------------------------------------")
print("JEFFREY RESULTS")
to_sort = []
motifes = []
a = pickle.load(open("jeffrey_results.cPickle","rb"))
x_jeff = []
y_jeff = []
long_motifs = []
to_sort = []
for motif in a:
motif_scores = []
for sequence in a[motif]:
seq_scores = []
for result in sequence:
score = result[0]
seq_scores.append(score)
if len(seq_scores)==0:
seq_scores = [0]
motif_scores.append(max(seq_scores))
if len(motif_scores) > 0 :
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean>0.35 and len(motif_scores)>3:
print(motif, "~",str(len(motif_scores)), " : ", str(motif_mean))
#interesting_motifs.append(motif)
y_jeff.append(motif_mean)
x_jeff.append(len(motif_scores))
to_sort.append((len(motif_scores),motif_mean,motif))
long_motifs.append(motif)
final_long_scores = sorted(to_sort,key=lambda tup: tup[0], reverse=True)
fig, ax = plt.subplots()
motifs = plt.bar(range(len(x_jeff)),y_jeff)
ax.set_xticks(range(len(x_jeff)))
labelles = x_jeff
ax.set_xticklabels(labelles)
ax.set_xlabel("Number of sequences in test set")
ax.set_ylabel("Mean proportion of correctly predicted base pairs")
ax.set_title("Mean prediction score, leave-one-out cross-val, jeffrey data")
plt.show()
exit()
"""
interesting_motifs = []
to_sort = []
......@@ -157,7 +193,7 @@ for motif in a:
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean>0.35 and len(motif_scores)>3:
if motif_mean>0.65 and len(motif_scores)>3:
print(motif, "~",str(len(motif_scores)), " : ", str(motif_mean))
interesting_motifs.append(motif)
y_long.append(motif_mean)
......@@ -169,9 +205,12 @@ final_long_scores = sorted(to_sort,key=lambda tup: tup[0], reverse=True)
to_sort = []
motifes = []
a = pickle.load(open("short_only_seqs_crossval.cPickle","rb"))
a = pickle.load(open("short_crossval2.cPickle","rb"))
x = []
y = []
to_sort = []
......@@ -191,7 +230,7 @@ for index,motif in enumerate(a):
motif_mean = np.mean(motif_scores)
else:
motif_mean = 0
if motif_mean>0.35 and len(motif_scores)>4:
if motif_mean>0.3 and len(motif_scores)>3:
if index not in interesting_motifs:
interesting_motifs.append(index)
#if index in long_motifs:
......@@ -200,6 +239,8 @@ for index,motif in enumerate(a):
x.append(len(motif_scores))
to_sort.append((len(motif_scores),motif_mean,index))
final_scores = sorted(to_sort,key=lambda tup: tup[0], reverse=True)
#print(final_scores)
z = [z[0] for z in final_scores]
......
import pickle
import numpy as np
import matplotlib.pyplot as plt
from altair import *
import matplotlib.patches as mpatches
import networkx as nx
import numpy as np
convert = {}
with open('seqs.fa') as f:
lines = f.readlines()
for line in lines:
if line[0] == ">":
enst, ig, chrom, ensg, biotype, trans = line[1:].split(" ")
convert[ensg.split(":")[1].split(".")[0]] = enst
print(convert)
results_per_gene = {}
x_vector = []
y_vector = []
modules = [0, 5, 6, 135, 8, 18, 19, 146, 21, 24, 25, 26, 162, 39, 168, 53, 54, 58, 191, 194, 198, 216, 119, 122]
a = pickle.load(open("ircm_more.cPickle","rb"))
print(len(a))
module_results = {}
for index,i in enumerate(modules):
module_results[i] = []
for j in a:
if len(a[j])>0:
score = 0
if len(a[j][i])> 0:
score =a[j][i][0]
else:
score = 0
score = max(score,0)
gene = j
#gene = j.split(".")[0]
if gene not in results_per_gene:
results_per_gene[gene] = [score]
else:
results_per_gene[gene].append(score)
module_results[i].append(score)
y_vector.append(score)
x_vector.append(index)
for i in results_per_gene:
print (i)
means = []
for i in module_results:
print(i)
m = np.max(module_results[i])
x= np.mean(module_results[i])
means.append(x)
print(m,x)
print(sorted(module_results[0], reverse=True))
plt.scatter(x_vector,y_vector, alpha = 0.3)
plt.xticks(range(len(modules)))
plt.ylabel("Module likelihood at most likely position")
plt.xlabel("module ID")
plt.title("Bayespairing score for 20 modules for 800 sequences from IRCM data")
plt.show()
with open('localization.tsv') as f:
lines = f.readlines()
for line in lines[1:] :
#print(line)
#if " \" " in line:
# elements = str(line.split("\t")[0]) + str(line.split("\"")[2])
## elements = str(line.split("\t")[0]) + str(line.split("]")[1])
elements = line.split("\t")
gene_id,gene_name, gene_biotype, description, longest_isoform_length, cyto_A, cyto_B, insol_A, insol_B, membr_A, membr_B, nucl_A, nucl_B = elements
if gene_id in convert:
gene_format = convert[gene_id]
print(gene_format)
else:
gene_format = ""
if gene_format in results_per_gene:
print("FOUND")
print(gene_format)
for zone in [cyto_A, cyto_B, insol_A, insol_B, membr_A, membr_B, nucl_A, nucl_B]:
results_per_gene[gene_format].append(zone)
results_per_gene[gene_format].append(gene_name)
results_per_gene[gene_format].append(gene_id)
results_per_gene[gene_format].append(gene_biotype)
pickle.dump(results_per_gene,open("gene_data.cPickle","wb"))
This diff is collapsed.
import pickle
import numpy as np
import matplotlib.pyplot as plt
from altair import *
import matplotlib.patches as mpatches
import networkx as nx
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
vector = pickle.load(open("gene_data.cPickle","rb"))
v = {}
v_strong = {}
v_weak = {}
for gene in vector:
#print(len(vector[gene]))
if len(vector[gene])>24:
print(vector[gene])
if sum([float(i) for i in vector[gene][24:32]])>50:
v[gene]=vector[gene]
v_strong[gene]=vector[gene]
if sum([float(i) for i in vector[gene][24:32]])>0 and sum([float(i) for i in vector[gene][24:32]])<5:
v_weak[gene] = vector[gene]
"""
print(len([v_strong[k][14] for k in v_strong]))
print(len([v_weak[k][14] for k in v_weak]))
plt.hist([v_strong[k][14] for k in v_strong],16, color="g",alpha=0.4,normed=True)
plt.hist([v_weak[k][14] for k in v_weak], 16, color="r",alpha=0.4, normed=True)
rp = mpatches.Patch(label="not localized", color="r")
gp = mpatches.Patch(label = "localized", color = "g")
plt.xlabel("Predicted module score")
plt.ylabel("Normalized frequency")
plt.title("Distribution of scores, localized vs non-localized, module 54 ")
plt.legend(handles = [rp,gp])
plt.show()
fig = plt.figure(figsize=plt.figaspect(2.))
ax = fig.add_subplot(111, projection='3d')
print([v[i][0] for i in v], [v[i][25] for i in v], [v[i][27] for i in v])
ax.scatter([float(v[i][7]) for i in v], [float(v[i][29]) for i in v], [float(v[i][31]) for i in v])
plt.show()
"""
for i in v:
# print((float(v[i][24])+float(v[i][25])),(float(v[i][28])+float(v[i][29])),(float(v[i][30])+float(v[i][31])))
a,b,c =float(v[i][24])+float(v[i][25]),float(v[i][28])+float(v[i][29]),float(v[i][30])+float(v[i][31])
if v[i][14]>15 and max(a,b,c)>3000:
if max(a,b,c)==a:
print(v[i][32],v[i][33],v[i][34], "cyto")
if max(a, b, c) == b:
print(v[i][32], v[i][33], v[i][34], "memb")
else:
print(v[i][32], v[i][33], v[i][34], "nuc")
exit()
for t in range(1):
#plt.subplot(4,1,1)
plt.title("localization vs prediction,nucleus, module "+str(t))
rp = mpatches.Patch(label = "insol", color="r")
# bp = mpatches.Patch(label = "cyto", color = "b")
#plt.scatter([v[i][t] for i in v], [min(float(v[i][28])+float(v[i][28]),4999) for i in v], color ="b", alpha=0.25)
plt.scatter([v[i][t] for i in v], [min(float(v[i][26])+float(v[i][27]),4999) for i in v], color ="r", alpha=0.25)
#plt.legend(handles = [rp,bp])
#plt.title("Cytosol localization frequency of gene given module prediction score")
plt.xlabel("prediction score")
plt.ylabel("localization frequency")
##plt.ylim([0,505])
#plt.subplot(4,1,2)
#plt.scatter([v[i][t] for i in v], [min(float(v[i][26])+float(v[i][27]),499) for i in v], alpha=0.5)
#plt.title("Insol localization frequency of gene given module prediction score")
#plt.xlabel("prediction score")
#plt.ylabel("localization frequency")
#plt.ylim([0,505])
##plt.subplot(4,1,3)
#plt.scatter([v[i][t] for i in v], [min(float(v[i][28])+float(v[i][29]),499) for i in v], alpha=0.5)
#plt.title("Membrane localization frequency of gene given module prediction score")
#plt.xlabel("prediction score")
#plt.ylabel("localization frequency")
#plt.ylim([0,505])
#plt.subplot(4,1,4)
#plt.scatter([v[i][t] for i in v], [min(float(v[i][30])+float(v[i][31]),499) for i in v], alpha=0.5)
##plt.title("Nucleus localization frequency of gene given module prediction score")
#plt.xlabel("prediction score")
#plt.ylabel("localization frequency")
#plt.ylim([0,505])
plt.show()
gene_labels = []
for gene in v:
localizations = [v[gene][24],v[gene][25],v[gene][26],v[gene][27],v[gene][28],v[gene][29],v[gene][30],v[gene][31]]
label = np.argmax(localizations)
gene_labels.append(label)
genes_by_label = {}
for i in range(8):
genes_by_label[i] = ([],[])
for index, gene in enumerate(v):
mod_scores = v[gene][0:24]
for j in range(24):
genes_by_label[gene_labels[index]][0].append(j)
for j in mod_scores:
genes_by_label[gene_labels[index]][1].append(j)
colors = ["r","r","r","r","b","b","b","b","b"]
#colors = ["r","b","b","c","g","y","orange","maroon"
"""
for index,label in enumerate(genes_by_label.keys()):
if index in [3,5]:
print(len(genes_by_label[label][0]))
x = genes_by_label[label][0]
y = genes_by_label[label][1]
# print(x)
# print(y)
plt.scatter(x,y,color=colors[index],alpha=0.1)
plt.ylim([0,26])
plt.show()
"""
y = []
x= []
for module in range(0,24):
for index,label in enumerate(genes_by_label.keys()):
for ind,i in enumerate(genes_by_label[label][1]):
# print(genes_by_label[label][0][ind])
if genes_by_label[label][0][ind]==module:
y.append(i)
x.append(label)
print(np.mean(y))
#print(np.std(y))
print("-----")
#print(y)
#print(x)
plt.scatter(x,y,alpha=0.1, marker="p")
plt.ylim([10,25])
plt.title("score prediction distribution for one module given localization")
plt.xlabel("localization area (discrete)")
plt.ylabel("predicted scores for module")
plt.show()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment