Commit 50fe5983 authored by Yixiong Sun
Browse files

Changed module_from_desc to json output

parent 24acde24
......@@ -7,6 +7,8 @@ import operator
import os.path
import pickle
import argparse
import json
from Bio import SeqIO
......@@ -236,7 +238,7 @@ def create_module(desc,fasta,model_name, list_of_nodes, PDBs=[]):
else:
new_aln[position]= [position]
if PDBs == []:
PDBs = ["None" for x in graph_list]
......@@ -254,6 +256,115 @@ def create_module(desc,fasta,model_name, list_of_nodes, PDBs=[]):
print("YOUR MODULE WAS ADDED TO DATASET ", model_name, "AND RECEIVED THE NUMBERS",len(graphs)-1-len(moduleSignatures),"TO", len(graphs)-1, " WITH ",len(sequences[0])," SEQUENCES")
return graphs,aln,sequences
def create_module(desc, fasta, model_name, list_of_nodes, PDBs=None, pdb_info_file="", full_seq_file="", atlas_name=""):
    """Add one module per signature to the JSON dataset ``../models/<model_name>.json``.

    The dataset file is loaded if it already exists, one entry is appended for
    every signature returned by ``get_all_signatures_from_graphs(list_of_nodes)``,
    and the whole dataset is written back as JSON.

    Args:
        desc: Path to the graph description. If the path contains ``"desc"``
            (case-insensitive) it is parsed with ``make_graph``; otherwise it
            is loaded as a pickle.
        fasta: Passed through unchanged to ``make_new_graph_examples``.
        model_name: Dataset name; selects ``../models/<model_name>.json``.
        list_of_nodes: List of node lists; used to derive the signatures and,
            per index, as ``seq_pos`` of the training sequences.
        PDBs: Accepted for backward compatibility with existing callers. The
            JSON output does not store it (the ``"PDBs"`` field comes from
            ``pdb_info_file``).
        pdb_info_file: Optional pickle holding a ``(pdb_aln, pdb_seq)`` tuple.
            Ignored when empty, ``None`` or not an existing file.
        full_seq_file: Optional FASTA file of full sequences. Ignored when
            empty, ``None`` or not an existing file.
        atlas_name: Optional motif name stored under ``"atlas_name"``.

    Returns:
        None. The result is the JSON file written to disk.
    """
    dataset_name = "../models/" + model_name + ".json"
    dataset = {}
    # load the dataset if it exists
    if os.path.isfile(dataset_name):
        with open(dataset_name) as f:
            dataset = json.load(f)

    # load in the graph
    if 'desc' in desc.lower():
        source_graph = make_graph(desc)
    else:
        # NOTE(review): pickle executes arbitrary code on load; only use
        # trusted files here.
        with open(desc, 'rb') as f:
            source_graph = pickle.load(f)

    pdb_aln = []
    pdb_seq = []
    # Try to load PDB_info. The truthiness guard matters: argparse passes None
    # when the option is omitted and os.path.isfile(None) raises TypeError.
    if pdb_info_file and os.path.isfile(pdb_info_file):
        # NOTE(review): same pickle trust caveat as above.
        with open(pdb_info_file, 'rb') as f:
            pdb_aln, pdb_seq = pickle.load(f)

    # try to load full_seq_file (same None guard as above)
    full_records = []
    if full_seq_file and os.path.isfile(full_seq_file):
        full_records = list(SeqIO.parse(full_seq_file, "fasta"))

    # module signatures from list of nodes
    module_signatures = get_all_signatures_from_graphs(list_of_nodes)
    print("THERE ARE", len(module_signatures), "MODULES SIGNATURES:")
    print(module_signatures)

    dataset_length = len(dataset)
    # Stays None when there are no signatures, so the summary print below
    # does not fail with a NameError.
    sequences = None

    # for each signature, add the module
    for sig in module_signatures:
        print("sig is", sig)
        # Always pass the original source graph: the per-example loop below
        # uses its own variable name so it cannot clobber it.
        graph_list, sequences = make_new_graph_examples(source_graph, fasta, sig)
        master_graph = graph_list[0]

        # this gives alignment of graphs: every node of the first graph is
        # listed once per example graph
        new_aln = {}
        for _ in graph_list:
            for position in list(master_graph.nodes()):
                new_aln.setdefault(position, []).append(position)

        data = {}
        data["master_graph"] = {
            "nodes": list(master_graph.nodes(data=True)),
            "edges": list(master_graph.edges(data=True)),
        }
        data["graphs"] = {
            "nodes": [list(example.nodes(data=True)) for example in graph_list],
            "edges": [list(example.edges(data=True)) for example in graph_list],
        }
        # alignment data of graphs
        data["aln"] = new_aln
        # PDB names and positions of the motif
        data["PDBs"] = dict(pdb_aln)
        # PDB names and the sequences of the motif
        data["subsequences"] = dict(pdb_seq)

        # training sequences: only filled in when there is exactly one full
        # record per node list, so they can be paired by index
        data["training_set"] = []
        if len(list_of_nodes) == len(full_records):
            for index, record in enumerate(full_records):
                data["training_set"].append({
                    "source": record.id,
                    "seq": str(record.seq),
                    "source_pos": [],
                    "seq_pos": list_of_nodes[index],
                })

        # siblings: all module numbers created by this call, minus this one
        sib_sigs = list(range(dataset_length, dataset_length + len(module_signatures)))
        module_num = len(dataset)
        sib_sigs.remove(module_num)
        data["siblings"] = sib_sigs

        # add atlas name (empty string when none was given)
        data["atlas_name"] = atlas_name if atlas_name else ""
        # general motif name, we will populate this later
        data["general_name"] = ""

        dataset[module_num] = data
        print("len", len(dataset))

    # dump the json
    with open(dataset_name, 'w') as out:
        json.dump(dataset, out)

    sequence_count = len(sequences[0]) if sequences else 0
    print("YOUR MODULE WAS ADDED TO DATASET ", model_name, "AND RECEIVED THE NUMBERS", len(dataset) - len(module_signatures), "TO", len(dataset) - 1, " WITH ", sequence_count, " SEQUENCES")
def fuse_existing_databases(new_dataset,name1,name2,l1,l2):
one_of_each_graph = []
......@@ -318,6 +429,11 @@ if __name__ == "__main__":
parser.add_argument("-nodes", help="List of lsit of nodes as a string", required=True)
parser.add_argument('-pdb', nargs='*', help='PDBs in which input is found')
parser.add_argument('-pdb_info', help='Pickle file with tuple of PDB module positions and nucleotides for each column')
parser.add_argument("-full_seq", help="full sequences, FASTA format")
parser.add_argument("-atlas_name", help="name of the motif in the 3dMotifAtlas")
args = parser.parse_args()
mod = create_module(args.g,args.seq, args.n, ast.literal_eval(args.nodes), args.pdb)
create_module(args.g,args.seq, args.n, ast.literal_eval(args.nodes), args.pdb, pdb_info_file=args.pdb_info, full_seq_file=args.full_seq, atlas_name=args.atlas_name)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment