Commit a37a45a9 authored by Jerome Waldispuhl's avatar Jerome Waldispuhl
Browse files

modifs

parent 3611347e
......@@ -4,13 +4,16 @@ import csv
import matplotlib.pyplot as plt
import distance
from random import randint
from altschulEriksonDinuclShuffle import dinuclShuffle
import subprocess
# List diretory
path = "../Data/RNAmuts_mfe_csv/"
dirs = os.listdir( path )
seqlen=50
list_gcr = [10,30,50,70,90]
#list_gcr = [50]
#list_gcr = [10,30,50,70,90]
list_gcr = [50]
n_samples = 10
################
### utils
......@@ -27,16 +30,14 @@ def has_ml(ssa):
elif counter == 0:
hairpin = False
else:
print 'Corrupted structure:' + ssa
sys.exit(1)
raise Exception('Corrupted structure:',ssa)
counter += 1
elif x==')':
hairpin=True
counter -= 1
else:
if not x == '.':
print 'Corrupted structure:' + ssa
sys.exit(1)
raise Exception('Corrupted structure:',ssa)
return False
################################
......@@ -52,8 +53,9 @@ for file in dirs:
# parse files
mystats = {10:{},30:{},50:{},70:{},90:{}}
mydata = {10:{},30:{},50:{},70:{},90:{}}
seqdists = {}
rnafold_re = re.compile('(?P<iseq>[ACGU]+)\n(?P<issa>[.()]+)\ \([\ +-]+(?P<imfe>\d+\.\d+)\)\n')
for gcr in seqfiles.keys():
if not gcr in list_gcr:
......@@ -65,10 +67,10 @@ for gcr in seqfiles.keys():
for filename in seqfiles[gcr]:
print filename
idfile = len(mystats[gcr])
idfile = len(mydata[gcr])
# init data structure
mystats[gcr][idfile] = {}
mydata[gcr][idfile] = {}
# read file
f = open(filename)
......@@ -78,51 +80,47 @@ for gcr in seqfiles.keys():
seqs = {}
for row in reader:
mmfe = float(row[1])
nmut = int(row[2])
mseq = row[3].upper()
is_ml_ssa = has_ml(row[4])
mssa = row[4]
is_ml_ssa = has_ml(mssa)
if is_ml_ssa:
accept_rate = 0
else:
accept_rate = 99
accept_rate = 999
if randint(0,accept_rate) == 0:
if not seqs.has_key(nmut):
seqs[nmut] = []
if not hdist_list.has_key(nmut):
hdist_list[nmut] = {'HH':[],'HM':[],'MM':[]}
for prevseq,prevml in seqs[nmut]:
hdist = distance.hamming(prevseq,mseq)
if is_ml_ssa:
if prevml:
hdist_list[nmut]['MM'].append(hdist)
else:
hdist_list[nmut]['HM'].append(hdist)
else:
if prevml:
hdist_list[nmut]['HM'].append(hdist)
else:
hdist_list[nmut]['HH'].append(hdist)
seqs[nmut].append((mseq,is_ml_ssa))
for nmut in range(seqlen+1):
avg_dist = {'HH':0.,'HM':0.,'MM':0.}
if hdist_list.has_key(nmut):
if len(hdist_list[nmut]['HH']) > 0:
avg_dist['HH'] = float(sum(hdist_list[nmut]['HH'])) / len(hdist_list[nmut]['HH'])
if len(hdist_list[nmut]['HM']) > 0:
avg_dist['HM'] = float(sum(hdist_list[nmut]['HM'])) / len(hdist_list[nmut]['HM'])
if len(hdist_list[nmut]['MM']) > 0:
avg_dist['MM'] = float(sum(hdist_list[nmut]['MM'])) / len(hdist_list[nmut]['MM'])
mystats[gcr][idfile][nmut] = avg_dist
decoy_list = []
for k in range(n_samples):
decoy_list.append(dinuclShuffle(mseq))
if not mydata[gcr][idfile].has_key(nmut):
mydata[gcr][idfile][nmut] = {'ml':{'rm_mfe':[],'decoy_ml':[],'decoy_mfe':[]},'noml':{'rm_mfe':[],'decoy_ml':[],'decoy_mfe':[]}}
# vienna RNA
ml_decoy_count = 0
decoy_mfe_list = []
for k in range(n_samples):
decoy = dinuclShuffle(mseq)
rnafold_proc = subprocess.Popen(['RNAfold','--noPS'],stdout=subprocess.PIPE,stdin=subprocess.PIPE)
rnafold_out = rnafold_proc.communicate(input=decoy)[0]
rnafold_parsed = rnafold_re.match(rnafold_out)
if has_ml(rnafold_parsed.group('issa')):
ml_decoy_count += 1
decoy_mfe_list.append(float(rnafold_parsed.group('imfe')))
avg_ml_decoy = float(ml_decoy_count) / n_samples
avg_mfe_decoy = float(sum(decoy_mfe_list)) / n_samples
if is_ml_ssa:
mydata[gcr][idfile][nmut]['ml']['rm_mfe'].append(mmfe)
mydata[gcr][idfile][nmut]['ml']['decoy_ml'].append(avg_ml_decoy)
mydata[gcr][idfile][nmut]['ml']['decoy_mfe'].append(avg_mfe_decoy)
else:
mydata[gcr][idfile][nmut]['noml']['rm_mfe'].append(mmfe)
mydata[gcr][idfile][nmut]['noml']['decoy_ml'].append(avg_ml_decoy)
mydata[gcr][idfile][nmut]['noml']['decoy_mfe'].append(avg_mfe_decoy)
f.close()
......@@ -135,32 +133,29 @@ if True:
xvalues = range(seqlen+1)
yvalues = {}
for gcr in list_gcr:
yvalues[gcr] = {'HH':[],'HM':[],'MM':[]}
for idf in mystats[gcr].keys():
tmp = {}
for nm in xvalues:
if not tmp.has_key(nm):
tmp[nm] = {'HH':[],'HM':[],'MM':[]}
if mystats[gcr][idfile].has_key(nm):
tmp[nm]['HH'].append(mystats[gcr][idf][nm]['HH'])
tmp[nm]['HM'].append(mystats[gcr][idf][nm]['HM'])
tmp[nm]['MM'].append(mystats[gcr][idf][nm]['MM'])
yvalues[gcr] = {'ml':[],'noml':[]}
for nm in xvalues:
tmp = {'ml':[],'noml':[]}
for idf in mydata[gcr].keys():
if mydata[gcr][idf].has_key(nm):
# ML
if mydata[gcr][idf][nm].has_key('ml') and len(mydata[gcr][idf][nm]['ml']['decoy_ml'])>0:
tmp['ml'].append(float(sum(mydata[gcr][idf][nm]['ml']['decoy_ml'])) / len(mydata[gcr][idf][nm]['ml']['decoy_ml']))
else:
tmp['ml'].append(0.)
# No ML
if mydata[gcr][idf][nm].has_key('noml') and len(mydata[gcr][idf][nm]['noml']['decoy_ml'])>0:
tmp['noml'].append(float(sum(mydata[gcr][idf][nm]['noml']['decoy_ml'])) / len(mydata[gcr][idf][nm]['noml']['decoy_ml']))
else:
tmp['noml'].append(0.)
else:
tmp[nm]['HH'].append(0.)
tmp[nm]['HM'].append(0.)
tmp[nm]['MM'].append(0.)
tmp['ml'].append(0.)
tmp['noml'].append(0.)
yvalues[gcr]['ml'].append(float(sum(tmp['ml'])) / len(tmp['ml']))
yvalues[gcr]['noml'].append(float(sum(tmp['noml'])) / len(tmp['noml']))
for nm in xvalues:
avgHH = float(sum(tmp[nm]['HH'])) / len(tmp[nm]['HH'])
avgHM = float(sum(tmp[nm]['HM'])) / len(tmp[nm]['HM'])
avgMM = float(sum(tmp[nm]['MM'])) / len(tmp[nm]['MM'])
yvalues[gcr]['HH'].append(avgHH)
yvalues[gcr]['HM'].append(avgHM)
yvalues[gcr]['MM'].append(avgMM)
plt.plot(xvalues,yvalues[gcr]['HH'],label='HH (' + str(gcr) + '%)',ls='-',color=color_scheme[gcr])
#plt.plot(xvalues,yvalues[gcr]['HM'],label='HM (' + str(gcr) + '%)',ls='--',color=color_scheme[gcr])
plt.plot(xvalues,yvalues[gcr]['MM'],label='MM (' + str(gcr) + '%)',ls=':',color=color_scheme[gcr])
plt.plot(xvalues,yvalues[gcr]['ml'],label='ML (' + str(gcr) + '%)',ls='-',color=color_scheme[gcr])
plt.plot(xvalues,yvalues[gcr]['noml'],label='No ML (' + str(gcr) + '%)',ls=':',color=color_scheme[gcr])
plt.legend(loc=3)
......
......@@ -6,14 +6,17 @@ import distance
from random import randint
from altschulEriksonDinuclShuffle import dinuclShuffle
import subprocess
import random
from numpy.random import choice
from Bio.SeqUtils import GC
# List diretory
path = "../Data/RNAmuts_mfe_csv/"
dirs = os.listdir( path )
seqlen=50
#list_gcr = [10,30,50,70,90]
list_gcr = [90]
n_samples = 10
list_gcr = [10]
n_samples = 1
################
### utils
......@@ -40,6 +43,23 @@ def has_ml(ssa):
raise Exception('Corrupted structure:',ssa)
return False
mutlist = {'A':['C','G','U'],'C':['A','G','U'],'G':['A','C','U'],'U':['A','C','G']}
ntweights = {
0.1:{'A':[0.05,0.05,0.9],'C':[0.45,0.1,0.45],'G':[0.45,0.1,0.45],'U':[0.9,0.05,0.05]},
0.3:{'A':[0.15,0.15,0.7],'C':[0.35,0.3,0.35],'G':[0.35,0.3,0.35],'U':[0.7,0.15,0.15]},
0.5:{'A':[0.25,0.25,0.5],'C':[0.25,0.5,0.25],'G':[0.25,0.5,0.25],'U':[0.5,0.25,0.25]},
0.7:{'A':[0.35,0.35,0.3],'C':[0.15,0.7,0.15],'G':[0.15,0.7,0.15],'U':[0.3,0.35,0.35]},
0.9:{'A':[0.45,0.45,0.1],'C':[0.05,0.9,0.05],'G':[0.05,0.9,0.05],'U':[0.1,0.45,0.45]}}
def random_seq(seed,nmut,gcr):
mutpoints = random.sample(range(0, 50), nmut)
mutant = list(seed)
for k in mutpoints:
wnt = seed[k]
mutation = choice(mutlist[wnt], p=ntweights[gcr][wnt])
mutant[k] = mutation
return "".join(mutant)
################################
### Read RNAmutants data
################################
......@@ -64,13 +84,14 @@ for gcr in seqfiles.keys():
print gcr
# iterate RNAmutants files
for filename in seqfiles[gcr]:
for filename in seqfiles[gcr][:]:
print filename
idfile = len(mydata[gcr])
# init data structure
# init variables
mydata[gcr][idfile] = {}
seed = ''
# read file
f = open(filename)
......@@ -86,41 +107,54 @@ for gcr in seqfiles.keys():
mssa = row[4]
is_ml_ssa = has_ml(mssa)
if nmut==0:
if seed == '':
seed = mseq
else:
continue
if is_ml_ssa:
accept_rate = 0
else:
accept_rate = 999
accept_rate = 99
if randint(0,accept_rate) == 0:
decoy_list = []
gcr_norm = gcr / 100.
for k in range(n_samples):
decoy_list.append(dinuclShuffle(mseq))
decoy_list.append(random_seq(seed,nmut,gcr_norm))
if not mydata[gcr][idfile].has_key(nmut):
mydata[gcr][idfile][nmut] = {'ml':{'rm_mfe':[],'decoy_ml':[],'decoy_mfe':[]},'noml':{'rm_mfe':[],'decoy_ml':[],'decoy_mfe':[]}}
mydata[gcr][idfile][nmut] = {'rm':{'ml':[],'noml':[]},'random':{'ml':[],'noml':[],'ml_ratio':[]}}
# vienna RNA
ml_decoy_count = 0
decoy_mfe_list = []
decoy_mfe_list = {'ml':[],'noml':[]}
for k in range(n_samples):
decoy = dinuclShuffle(mseq)
rnafold_proc = subprocess.Popen(['RNAfold','--noPS'],stdout=subprocess.PIPE,stdin=subprocess.PIPE)
rnafold_out = rnafold_proc.communicate(input=decoy)[0]
rnafold_parsed = rnafold_re.match(rnafold_out)
if has_ml(rnafold_parsed.group('issa')):
is_decoy_ml = has_ml(rnafold_parsed.group('issa'))
if is_decoy_ml:
ml_decoy_count += 1
decoy_mfe_list.append(float(rnafold_parsed.group('imfe')))
decoy_mfe_list['ml'].append(float(rnafold_parsed.group('imfe')))
else:
decoy_mfe_list['noml'].append(float(rnafold_parsed.group('imfe')))
avg_ml_decoy = float(ml_decoy_count) / n_samples
avg_mfe_decoy = float(sum(decoy_mfe_list)) / n_samples
avg_mfe_decoy = {}
avg_mfe_decoy['ml'] = float(sum(decoy_mfe_list['ml'])) / n_samples
avg_mfe_decoy['noml'] = float(sum(decoy_mfe_list['noml'])) / n_samples
if is_ml_ssa:
mydata[gcr][idfile][nmut]['ml']['rm_mfe'].append(mmfe)
mydata[gcr][idfile][nmut]['ml']['decoy_ml'].append(avg_ml_decoy)
mydata[gcr][idfile][nmut]['ml']['decoy_mfe'].append(avg_mfe_decoy)
mydata[gcr][idfile][nmut]['rm']['ml'].append(mmfe)
else:
mydata[gcr][idfile][nmut]['noml']['rm_mfe'].append(mmfe)
mydata[gcr][idfile][nmut]['noml']['decoy_ml'].append(avg_ml_decoy)
mydata[gcr][idfile][nmut]['noml']['decoy_mfe'].append(avg_mfe_decoy)
mydata[gcr][idfile][nmut]['rm']['noml'].append(mmfe)
if is_decoy_ml:
mydata[gcr][idfile][nmut]['random']['ml'].append(avg_mfe_decoy['ml'])
else:
mydata[gcr][idfile][nmut]['random']['noml'].append(avg_mfe_decoy['noml'])
mydata[gcr][idfile][nmut]['random']['ml_ratio'].append(avg_ml_decoy)
f.close()
......@@ -133,40 +167,78 @@ if True:
xvalues = range(seqlen+1)
yvalues = {}
for gcr in list_gcr:
yvalues[gcr] = {'ml':{'rm':[],'decoy':[]},'noml':{'rm':[],'decoy':[]}}
yvalues[gcr] = {'rm':{'ml':[],'noml':[]},'random':{'ml':[],'noml':[],'ml_ratio':[]}}
for nm in xvalues:
tmp = {'ml':{'rm':[],'decoy':[]},'noml':{'rm':[],'decoy':[]}}
tmp = {'rm':{'ml':[],'noml':[]},'random':{'ml':[],'noml':[],'ml_ratio':[]}}
for idf in mydata[gcr].keys():
if mydata[gcr][idf].has_key(nm):
# ML
if mydata[gcr][idf][nm].has_key('ml') and len(mydata[gcr][idf][nm]['ml']['rm_mfe'])>0:
tmp['ml']['rm'].append(float(sum(mydata[gcr][idf][nm]['ml']['rm_mfe'])) / len(mydata[gcr][idf][nm]['ml']['rm_mfe']))
tmp['ml']['decoy'].append(float(sum(mydata[gcr][idf][nm]['ml']['decoy_mfe'])) / len(mydata[gcr][idf][nm]['ml']['decoy_mfe']))
# RNAmutants
if mydata[gcr][idf][nm].has_key('rm'):
if len(mydata[gcr][idf][nm]['rm']['ml'])>0:
tmp['rm']['ml'].append(float(sum(mydata[gcr][idf][nm]['rm']['ml'])) / len(mydata[gcr][idf][nm]['rm']['ml']))
else:
tmp['rm']['ml'].append(0.)
if len(mydata[gcr][idf][nm]['rm']['noml'])>0:
tmp['rm']['noml'].append(float(sum(mydata[gcr][idf][nm]['rm']['noml'])) / len(mydata[gcr][idf][nm]['rm']['noml']))
else:
tmp['rm']['noml'].append(0.)
else:
tmp['ml']['rm'].append(0.)
tmp['ml']['decoy'].append(0.)
# No ML
if mydata[gcr][idf][nm].has_key('noml') and len(mydata[gcr][idf][nm]['noml']['rm_mfe'])>0:
tmp['noml']['rm'].append(float(sum(mydata[gcr][idf][nm]['noml']['rm_mfe'])) / len(mydata[gcr][idf][nm]['noml']['rm_mfe']))
tmp['noml']['decoy'].append(float(sum(mydata[gcr][idf][nm]['noml']['decoy_mfe'])) / len(mydata[gcr][idf][nm]['noml']['decoy_mfe']))
tmp['rm']['ml'].append(0.)
tmp['rm']['noml'].append(0.)
# Random
if mydata[gcr][idf][nm].has_key('random'):
if len(mydata[gcr][idf][nm]['random']['ml'])>0:
tmp['random']['ml'].append(float(sum(mydata[gcr][idf][nm]['random']['ml'])) / len(mydata[gcr][idf][nm]['random']['ml']))
else:
tmp['random']['ml'].append(0.)
if len(mydata[gcr][idf][nm]['random']['noml'])>0:
tmp['random']['noml'].append(float(sum(mydata[gcr][idf][nm]['random']['noml'])) / len(mydata[gcr][idf][nm]['random']['noml']))
else:
tmp['random']['noml'].append(0.)
if len(mydata[gcr][idf][nm]['random']['ml_ratio'])>0:
tmp['random']['ml'].append(float(sum(mydata[gcr][idf][nm]['random']['ml_ratio'])) / len(mydata[gcr][idf][nm]['random']['ml_ratio']))
else:
tmp['random']['ml_ratio'].append(0.)
else:
tmp['noml']['rm'].append(0.)
tmp['noml']['decoy'].append(0.)
else:
tmp['ml']['rm'].append(0.)
tmp['noml']['decoy'].append(0.)
yvalues[gcr]['ml']['rm'].append(float(sum(tmp['ml']['rm'])) / len(tmp['ml']['rm']))
yvalues[gcr]['noml']['rm'].append(float(sum(tmp['noml']['rm'])) / len(tmp['noml']['rm']))
yvalues[gcr]['ml']['decoy'].append(float(sum(tmp['ml']['decoy'])) / len(tmp['ml']['decoy']))
yvalues[gcr]['noml']['decoy'].append(float(sum(tmp['noml']['decoy'])) / len(tmp['noml']['decoy']))
plt.plot(xvalues,yvalues[gcr]['ml']['rm'],label='ML (RNAmutants)',color='C0')
plt.plot(xvalues,yvalues[gcr]['noml']['rm'],label='No ML (RNAmutants)',color='C1')
plt.plot(xvalues,yvalues[gcr]['ml']['decoy'],label='ML (decoy)',ls=':',color='C0')
plt.plot(xvalues,yvalues[gcr]['noml']['decoy'],label='No ML (decoy)',ls=':',color='C1')
plt.legend(loc=4)
tmp['random']['ml'].append(0.)
tmp['random']['noml'].append(0.)
tmp['random']['ml_ratio'].append(0.)
if len(tmp['rm']['ml']) > 0:
yvalues[gcr]['rm']['ml'].append(float(sum(tmp['rm']['ml'])) / len(tmp['rm']['ml']))
else:
yvalues[gcr]['rm']['ml'].append(0.)
if len(tmp['rm']['noml']) > 0:
yvalues[gcr]['rm']['noml'].append(float(sum(tmp['rm']['noml'])) / len(tmp['rm']['noml']))
else:
yvalues[gcr]['rm']['noml'].append(0.)
if len(tmp['random']['ml']) > 0:
yvalues[gcr]['random']['ml'].append(float(sum(tmp['random']['ml'])) / len(tmp['random']['ml']))
else:
yvalues[gcr]['random']['ml'].append(0.)
if len(tmp['random']['noml']) > 0:
yvalues[gcr]['random']['noml'].append(float(sum(tmp['random']['noml'])) / len(tmp['random']['noml']))
else:
yvalues[gcr]['random']['noml'].append(0.)
if len(tmp['random']['ml_ratio']) > 0:
yvalues[gcr]['random']['ml_ratio'].append(float(sum(tmp['random']['ml_ratio'])) / len(tmp['random']['ml_ratio']))
else:
yvalues[gcr]['random']['ml_ratio'].append(0.)
plt.plot(xvalues,yvalues[gcr]['rm']['ml'],label='RNAmutants (ML)',color='C0')
plt.plot(xvalues,yvalues[gcr]['rm']['noml'],label='RNAmutants (Stem)',color='C1')
plt.plot(xvalues,yvalues[gcr]['random']['ml'],label='Random (ML)',ls=':',color='C0')
plt.plot(xvalues,yvalues[gcr]['random']['noml'],label='Random (Stem)',ls=':',color='C1')
# labels
plt.xlabel('number of mutations',fontsize=18)
plt.ylabel('energy (kcal/mol)',fontsize=18)
plt.legend(loc=4,fontsize=10)
# layout
#plt.locator_params(axis='y', nticks=5)
plt.tight_layout()
# show plot
plt.show()
......@@ -45,17 +45,18 @@ if False:
#yvalues = np.array([ scipy.misc.comb(50, n, exact=False) * math.pow(3,n) for n in xvalues])
#plt.plot(xvalues,yvalues)
gc_target_list = [0.1,0.3,0.5,0.7,0.9]
gc_labels = {0.1:'0.1',0.3:'0.3',0.5:'0.5',0.7:'0.7',0.9:'0.9'}
gc_target_list = [0.1,0.3,0.5]
#gc_target_list = [0.1,0.3,0.5,0.7,0.9]
gc_labels = {0.1:'0.1/0.9',0.3:'0.3/0.7',0.5:'0.5',0.7:'0.7',0.9:'0.9'}
gc_ls = {0.1:':',0.3:'--',0.5:'-',0.7:'--',0.9:':'}
gc_lc = {0.1:'blue',0.3:'green',0.5:'red',0.7:'magenta',0.9:'cyan'}
size_sequence = 50
for target_gc in gc_target_list:
print target_gc
# number of gc & au in seed
n_gc = int(50 * target_gc) # number of gc
n_au = 50 - n_gc # number of au
n_gc = int(size_sequence * target_gc) # number of gc
n_au = size_sequence - n_gc # number of au
# limit of modified gc in mutants
min_target_gc = target_gc - 0.1
max_target_gc = target_gc + 0.1
......@@ -85,10 +86,13 @@ for target_gc in gc_target_list:
plt.plot(np.array(xvalues),np.array(yvalues),label=gc_labels[target_gc])
# labels
plt.xlabel('number of mutations',fontsize=18)
plt.ylabel('normalized multi-loop frequency',fontsize=18)
#plt.xlabel('number of mutations',fontsize=18)
plt.xlabel('normalized multi-loop frequency',fontsize=18)
plt.ylabel('number of mutants',fontsize=18)
plt.legend(loc=3)
plt.semilogy()
plt.tight_layout()
# show plot
plt.show()
%!TEX root = main_maternal.tex
\section{Discussion}
We provided evidence that in the absence of selective pressure the structure of the \st{evolutionary} \hlt{mutational} landscape could have helped to promote the emergence of an RNA-based form of life. To support our hypothesis, we built a comprehensive representation of the \st{evolutionary} \hlt{mutational} landscape of RNA molecules, and investigated scenarios based on distinct hypotheses.
We provided evidence that in the absence of selective pressure the structure of the mutational landscape could have helped to promote the emergence of an RNA-based form of life. To support our hypothesis, we built a comprehensive representation of the mutational landscape of RNA molecules, and investigated scenarios based on distinct hypotheses.
Our results offer solid foundations to parsimonious evolutionary scenarios based on undirected molecular self-replications with occasional mutations. In these simple models, the GC content appears as a key feature to determine the probability of discovering stable multi-branched secondary structures. In particular, intermediate GC contents (i.e. 0.5) result in a drift of \hlt{randomly replicating} populations toward a sub-space of the evolutionary landscape \hlt{uncovered by \RNAmutants} that drastically increases the probability of discovering thermodynamically stable complex shapes essential for the emergence of life at the molecular level.
Our results offer solid foundations to parsimonious evolutionary scenarios based on undirected molecular self-replications with occasional mutations. In these simple models, the GC content appears as a key feature to determine the probability of discovering stable multi-branched secondary structures. In particular, intermediate GC contents (i.e. 0.5) result in a drift of randomly replicating populations toward a sub-space of the evolutionary landscape uncovered by \RNAmutants that drastically increases the probability of discovering thermodynamically stable complex shapes essential for the emergence of life at the molecular level.
The preservation of intermediate GC content values appeared to us as a reasonable assumption, which could reflect the availability of various nucleotides in the prebiotic milieu. This nucleotide composition bias can be interpreted as an intrinsic force that favoured the emergence of life. It also offers novel insights into fundamental properties of the genetic alphabet \citep{Gardner:2003aa}.
......@@ -14,7 +13,7 @@ Eventually, our results could be used to put in perspective earlier findings sug
Our analysis completes recent studies that aimed to characterize fundamental properties of genotype-phenotype maps \citep{Greenbury:2015aa,Manrubia:2017aa}, and showed that their structure may contribute to the emergence of functional molecules \citep{Dingle:2015aa}. It also emphasizes the relevance of theoretical models based on a thermodynamical view of prebiotic evolution \cite{Pascal:2013aa}.
The size of the RNA sequences considered in this study has been fixed at 50 nucleotides. This length appears to be the current upper limit for non-enzymatic synthesis \citep{Hill:1993aa}, and therefore maximizes the expressivity of our evolutionary scenario. Variations of the sizes of populations or lengths of RNA sequences \hlt{resulting from indels} could be eventually considered with the implementation of dedicated algorithms \citep{Waldispuhl:2002aa}. Although we do not expect any major impact on our conclusions.
The size of the RNA sequences considered in this study has been fixed at 50 nucleotides. This length appears to be the current upper limit for non-enzymatic synthesis \citep{Hill:1993aa}, and therefore maximizes the expressivity of our evolutionary scenario. Variations of the sizes of populations or lengths of RNA sequences resulting from indels could be eventually considered with the implementation of dedicated algorithms \citep{Waldispuhl:2002aa}. Although, if these variations remain modest, we do not expect any major impact on our conclusions.
The error rates considered in this study were chosen to match values used in previous related works (e.g \citep{manrubia2007modular}). This choice is also corroborated by recent experiments suggesting that early life scenarios could sustain high error rates \cite{Rajamani:2010aa}. Nevertheless, lower mutation rates would only increase the number of generations needed to reach the asymptotic behaviour (See \textbf{Fig.~\ref{fig:tamura}}), and thus would not affect our results.
......
......@@ -54,16 +54,16 @@ In the most commonly accepted scenarios, the establishment of a stable, autonomo
Interestingly, \textit{in vitro} experiments revealed the extreme versatility of random nucleic acids \citep{Beaudry:1992aa,Bartel:1993aa,Schultes:2005aa}. Other studies have also suggested that essential RNA molecules such as the hammerhead ribozyme have multiple origins \citep{Salehi-Ashtiani:2001aa}. All together, these observations reinforce the plausibility of a spontaneous emergence of multiple functional sub-units. But they also question us about the likelihood of such events and the existence of intrinsic forces promoting these phenomena.
% models boosting structural complexity
Various theoretical models have been proposed to highlight mechanisms that may have favoured the birth and growth of structural complexity from replications of small monomers. Computational studies have been of tremendous help to validate these theories and quantify their impact. In particular, numerical simulations enabled us to explore the effects of polymerization on mineral surfaces \citep{Szabo:2002aa,Briones:2009aa} or the importance of spatial distribution \citep{Shay:2015aa}. \hlt{Another important aspect} of early life models is the tradeoff between stability and structural complexity. Stable folds often lack the complexity necessary to support novel functions but are more resilient to harsh pre-cellular environments ~\cite{ivica2013paradox}. \todo{GC content?} Still, the debate about the necessity for such hypotheses remains open.
Various theoretical models have been proposed to highlight mechanisms that may have favoured the birth and growth of structural complexity from replications of small monomers. Computational studies have been of tremendous help to validate these theories and quantify their impact. In particular, numerical simulations enabled us to explore the effects of polymerization on mineral surfaces \citep{Szabo:2002aa,Briones:2009aa} or the importance of spatial distribution \citep{Shay:2015aa}. Another important aspect of early life models is the tradeoff between stability and structural complexity. Stable folds often lack the complexity necessary to support novel functions but are more resilient to harsh pre-cellular environments ~\cite{ivica2013paradox}. Often, the GC content of the sequences appears as a relevant parameter to balance these opposite properties. Still, the debate about the necessity for such mechanisms remains open.
%\subsection{Our contribution}
In this work, we show that structural complexity can naturally emerge without the help of any sophisticated molecular mechanisms. We reveal subtle topological features of RNA mutational networks that helped to promote the discovery of functional RNAs at the early stages of the RNA world hypothesis. We demonstrate that in the absence of selective pressure, self-replicating RNA populations naturally drift toward \st{a singular region} \hlt{regions} of the sequence landscape enriched in complex structures, allowing for the simultaneous discovery of all molecular components needed to form a complete functional system.
In this work, we show that structural complexity can naturally emerge without the help of any sophisticated molecular mechanisms. We reveal subtle topological features of RNA mutational networks that helped to promote the discovery of functional RNAs at the early stages of the RNA world hypothesis. We demonstrate that in the absence of selective pressure, self-replicating RNA populations naturally drift toward distinct regions of the sequence landscape enriched in complex structures, allowing for the simultaneous discovery of all molecular components needed to form a complete functional system.
For the first time, we apply customized algorithms to map secondary structures on all mutant sequences with $50$ nucleotides \citep{Waldispuhl:2008aa,waldispuhl2011unbiased}. This approach considerably expands the scope and significance of comprehensive RNA evolutionary studies that were previously limited to sequences with less than $20$ nucleotides \citep{Gruner:1996aa,Cowperthwaite:2008aa}, or restricted to explore a small fraction of the sequence landscape of sequences \citep{stich2008structural,Dingle:2015aa}. This technical breakthrough is essential to observe the formation of complex multi-branched structures often used to carry essential molecular functions that cannot be assembled on shorter sequences.
Our simulations reveal the unexpected presence of a large pool of remarkably stable multi-branched structures in
a region of the RNA mutational landscape characterized by an average distance of $30$ to $40$ mutations from a random sequence, and a balanced GC content (i.e. $0.5$). Strikingly, these multi-branched RNAs have similar energies ($-15 \pm 5\:\kcalmol$) to those observed in the Rfam database \citep{Nawrocki:2015aa} \hlt{on the same length scale} (See {\bf Fig.~\ref{subfig:rfam_stats}}).
a region of the RNA mutational landscape characterized by an average distance of $30$ to $40$ mutations from a random sequence, and a balanced GC content (i.e. $0.5$). Strikingly, these multi-branched RNAs have similar energies ($-15 \pm 5\:\kcalmol$) to those observed in the Rfam database \citep{Nawrocki:2015aa} on the same length scale (See {\bf Fig.~\ref{subfig:rfam_stats}}).
We compare these data to populations that evolved under a selective pressure eliciting stable structures. Although this evolutionary mechanism shows a remarkable capacity to quickly improve the stability of structures, it fails to reproduce the structural complexity observed in RNA families of similar lengths.
Finally, we show that a population of RNA molecules replicating itself \hlt{randomly} with random errors but preserving a balanced GC content \citep{Tamura:1992aa}, naturally evolves toward regions of the landscape enriched with multi-branched structures potentially capable of supporting essential biochemical functions. Our results argue for a simple scenario of the origin of life in which an initial pool of nucleic acids would irresistibly evolve to promote a spontaneous and simultaneous discovery of the basic bricks of life.
Finally, we show that a population of RNA molecules replicating itself randomly with accidental errors but preserving a balanced GC content \citep{Tamura:1992aa}, naturally evolves toward regions of the landscape enriched with multi-branched structures potentially capable of supporting essential biochemical functions. Our results argue for a simple scenario of the origin of life in which an initial pool of nucleic acids would irresistibly evolve to promote a spontaneous and simultaneous discovery of the basic bricks of life.
......@@ -58,7 +58,7 @@ $\\\small$^1$ School of Computer Science, McGill University, Montreal, Canada\\\
\begin{abstract}
The RNA world hypothesis relies on the ability of ribonucleic acids to replicate and spontaneously acquire complex structures capable of supporting essential biological functions. Multiple sophisticated evolutionary models have been proposed, but they often assume specific conditions.
%
In this work we explore a simple and parsimonious scenario describing the emergence of complex molecular structures at the early stages of life. We show that at specific GC-content regimes, an undirected replication model is sufficient to explain the apparition of multi-branched RNA secondary structures -- a structural signature of many essential ribozymes. We ran a large scale computational study to map energetically stable structures on complete mutational networks of 50-nucleotide-long RNA sequences. Our results reveal \st{a distinct region} \hlt{regions} of the sequence landscape enriched with multi-branched structures bearing strong similarities to those observed in databases. A random replication mechanism preserving a $50\%$ GC-content suffices to explain a natural drift of RNA populations toward \st{this particular region} \hlt{complex stable structures}.
In this work we explore a simple and parsimonious scenario describing the emergence of complex molecular structures at the early stages of life. We show that at specific GC-content regimes, an undirected replication model is sufficient to explain the apparition of multi-branched RNA secondary structures -- a structural signature of many essential ribozymes. We ran a large scale computational study to map energetically stable structures on complete mutational networks of 50-nucleotide-long RNA sequences. Our results reveal distinct regions of the sequence landscape enriched with multi-branched structures bearing strong similarities to those observed in databases. A random replication mechanism preserving a $50\%$ GC-content suffices to explain a natural drift of RNA populations toward complex stable structures.
\end{abstract}
......
......@@ -45,7 +45,7 @@ When a sequence is selected for replication, the child sequence is formed by cop
\subsubsection{Controlling population GC content}
\label{sec:gc_control}
There are two obstacles to maintaining evolving populations within the desired GC content range of $\pm 0.1$. First, an initial population of random sequences sampled uniformly from the full alphabet naturally tends converge to a GC content of $0.5$. To avoid this, we sample from the alphabet with probability of sampling GC and AU equal to the desired GC content. This way our initial population has the desired nucleotide distribution. Second, when running the simulation, random mutations are able to move replicating sequences outside of the desired range, \hlt{especially at extremes of mutation rate and GC content.} To avoid this drift, at the selection stage, we do not select mutations that would take the sequence outside of this range. Instead, if a mutation takes a replicating sequence outside the GC range, we simply repeat the mutation process on the sequence until the child sequence has the appropriate GC content (See {\bf Alg. ~\ref{alg:gc}}). Given that populations are initialized in the appropriate GC range, we are likely to find valid mutants relatively quickly and always avoid drifting away from the target GC.\\
There are two obstacles to maintaining evolving populations within the desired GC content range of $\pm 0.1$. First, an initial population of random sequences sampled uniformly from the full alphabet naturally tends converge to a GC content of $0.5$. To avoid this, we sample from the alphabet with probability of sampling GC and AU equal to the desired GC content. This way our initial population has the desired nucleotide distribution. Second, when running the simulation, random mutations are able to move replicating sequences outside of the desired range, especially at extremes of mutation rate and GC content. To avoid this drift, at the selection stage, we do not select mutations that would take the sequence outside of this range. Instead, if a mutation takes a replicating sequence outside the GC range, we simply repeat the mutation process on the sequence until the child sequence has the appropriate GC content (See {\bf Alg. ~\ref{alg:gc}}). Given that populations are initialized in the appropriate GC range, we are likely to find valid mutants relatively quickly and always avoid drifting away from the target GC.\\
\IncMargin{1em}
\begin{algorithm}[H]
......
......@@ -3,7 +3,7 @@
% Overview
\subsection{Our approach}
We apply two complementary mutation space search techniques to characterize the influence of \st{a selection} \hlt{sampling} process to the repertoire of shapes accessible from an initial pool of random sequences (See \textbf{Fig.~\ref{fig:summary}}). Importantly, our analysis explicitly models the impact of the GC content bias. Our first algorithm \RNAmutants enumerates all mutated sequences and samples the ones with the \emph{globally} lowest folding energy \citep{Waldispuhl:2008aa}. It enables us to calculate the structures accessible from a random replication process. \hlt{In contrast, } our other algorithm named \maternal, has been developed for this study to simulate the evolution of a population of RNA sequences that preferentially selects the most stable structures under nucleotide bias.
We apply two complementary mutation space search techniques to characterize the influence of sampling processes to the repertoire of shapes accessible from an initial pool of random sequences (See \textbf{Fig.~\ref{fig:summary}}). Importantly, our analysis explicitly models the impact of the GC content bias. Our first algorithm \RNAmutants enumerates all mutated sequences and samples the ones with the \emph{globally} lowest folding energy \citep{Waldispuhl:2008aa}. It enables us to calculate the structures accessible from a random replication process. In contrast, our other algorithm named \maternal, has been developed for this study to simulate the evolution of a population of RNA sequences that preferentially selects the most stable structures under nucleotide bias.
\begin{figure}
......@@ -15,8 +15,7 @@ We apply two complementary mutation space search techniques to characterize the
Using \RNAmutants, we compute for the very first time, the energy landscape of the complete mutational network of RNA sequences of length $50$ preserving the GC content (See {\bf Fig.~\ref{fig:energy}}). This result contrasts with previous studies that used brute-force approaches to calculate the complete sequence landscape, and were therefore limited to sizes of $20$ nucleotides. This technical breakthrough is essential because multi-branched structures occur only on RNAs with sizes above $40$ (See {\bf Fig.~\ref{fig:rfam}})
More precisely, given an initial sequence (i.e. the \emph{seed}), \RNAmutants calculates mutated sequences with the lowest folding energies among all possible secondary structures (i.e. no target secondary structure). Importantly, \RNAmutants allows us to control the GC content of mutated sequences \citep{waldispuhl2011unbiased} (fraction of bases in a sequence that are either G or C). We ran \RNAmutants on $20$ independently generated random sequences for each GC content regime in $\big\{ 0.1, 0.3, 0.5, 0.7, 0.9 \big\}$ ($\pm 0.1$). At each mutational distance (i.e. number of mutations from the seed) from 0 to 50, we sample from the Boltzmann distribution approximately \num{10000} sequence-structure pairs with the target GC content. \todo{word on sample size} It is worth mentioning that the secondary structures sampled by \RNAmutants are not necessarily MFE structures of the sequences associated with. However, our data shows that the vast majority of these sampled structures are very close to their MFE counterpart (See {\bf Supp. Fig.~\ref{fig:freq}}). This observation enables us to simplify analysis by retaining MFE structures and produce data that is directly comparable with \maternal.
More precisely, given an initial sequence (i.e. the \emph{seed}), \RNAmutants calculates mutated sequences with the lowest folding energies among all possible secondary structures (i.e. no target secondary structure). Importantly, \RNAmutants allows us to control the GC content of mutated sequences \citep{waldispuhl2011unbiased} (fraction of bases in a sequence that are either G or C). We ran \RNAmutants on $20$ independently generated random sequences for each GC content regime in $\big\{ 0.1, 0.3, 0.5, 0.7, 0.9 \big\}$ ($\pm 0.1$). At each mutational distance (i.e. number of mutations from the seed) from 0 to 50, we sample from the Boltzmann distribution approximately \num{10000} sequence-structure pairs with the target GC content (the sample size that has been empirically determined to guarantee the reproducibility of our results). It is worth mentioning that the secondary structures sampled by \RNAmutants are not necessarily MFE structures of the sequences associated with. However, our data shows that the vast majority of these sampled structures are very close to their MFE counterpart (See {\bf Supp. Fig.~\ref{fig:freq}}). This observation enables us to simplify analysis by retaining MFE structures and produce data that is directly comparable with \maternal.
\subsection{Energy landscape of RNA mutational networks}
......@@ -44,9 +43,9 @@ As expected, we observe that the nucleotide content is an important factor in de
%\subsubsection{Structural diversity}
Having established that mutational neighbourhoods of random sequences yield stable and low energy states, it is important to also understand the structural features of these states. This is of particular interest trying to provide an account of the emergence of functional and complex RNAs. While previous studies on the structure of short randomly sampled RNA sequences have shown that simple hairpin structures dominate the landscape, we find that for longer molecules the landscape of stable mutant neighbourhood reveals ensembles that are well populated with diverse structures (See {\bf Fig.~\ref{fig:struct_diversity}}). Interestingly, for all GC content sampling and particularly at low to intermediate GC contents, we find that after an initial stabilization period at short mutational distances ($\sim 10$ mutations) more complex structures begin to emerge. \todo{figure?} This sudden and unexpected change of regime seems to correlate with the decrease of the average folding energies observed in {\bf Fig.~\ref{fig:energy}}.
Having established that mutational neighbourhoods of random sequences yield stable and low energy states, it is important to also understand the structural features of these states. This is of particular interest trying to provide an account of the emergence of functional and complex RNAs. While previous studies on the structure of short randomly sampled RNA sequences have shown that simple hairpin structures dominate the landscape, we find that for longer molecules the landscape of stable mutant neighbourhood reveals ensembles that are well populated with diverse structures (See {\bf Fig.~\ref{fig:struct_diversity}}). Interestingly, for all GC content sampling and particularly at low to intermediate GC contents, we find that after an initial stabilization period at short mutational distances ($\sim 10$ mutations) more complex structures begin to emerge. This sudden and unexpected change of regime seems to correlate with the decrease of the average folding energies observed in {\bf Fig.~\ref{fig:energy}}.
We can observe this change as an increase in the number of structures with internal loops ({\bf Fig.~\ref{fig:rnamuts_internal}}) and remarkably, multi-loops ({\bf Fig.~\ref{fig:rnamuts_multi}}). This finding is in good qualitative agreement with databases of evolved structures which contain structures with more diverse motifs than has been previously estimated from pools of random RNAs. \todo{size difference}
We can observe this change as an increase in the number of structures with internal loops ({\bf Fig.~\ref{fig:rnamuts_internal}}) and remarkably, multi-loops ({\bf Fig.~\ref{fig:rnamuts_multi}}). This finding is in good qualitative agreement with databases of evolved structures which contain structures with more diverse motifs than what has been estimated from pools of random RNAs (See {\bf Fig.~\ref{fig:ML_distribs}}, or the work of \citet{stich2008structural} on shorter sequences).
These secondary structure features (i.e. internal and multi-loops) are key components of RNAs that allow for higher order interactions that can support catalytic function. Although multi-loops occur at relatively low frequencies in the \RNAmutants mutational landscapes (at most 0.01), we focus on them for their high degree of structural complexity and importance to many functional RNAs observed in vivo such as the hammerhead ribozyme.
......@@ -99,7 +98,7 @@ More precisely, we observe that at short mutational distances, evolutionary appr
Interestingly, we find that varying the mutation rate does not have a strong effect on the energy of populations obtained (See {\bf Fig.~\ref{fig:mat_muts}}). We see that the average population energy remains within $\pm 5\:\kcalmol$ for all mutation rates except the highest of $0.1$. This apparent property of energy based evolutionary models suggests that the search for stable structures is flexible to external conditions such as varying mutation rates and may provide a mechanism for better exploring phenotype space without sacrificing stability.
%\subsubsection{Structural diversity}
While we observe very efficient energy optimization from the natural selection process,