Commit 5b5211e5 authored by Jerome Waldispuhl's avatar Jerome Waldispuhl
Browse files

Merge branch 'master' of jwgitlab.cs.mcgill.ca:vreinharz/maternal_all

merge with appeal
parents 2642b2c6 a2037b01
import sys
from math import log;
def loop_counter(structure):
......@@ -189,3 +190,5 @@ def compute_entropy(listseqs):
outdic['avg'] /= lenseq;
return outdic['avg'];
if __name__ == "__main__":
    # Script entry point: the first command-line argument is handed to
    # loop_counter and the returned value is printed.
    print(loop_counter(sys.argv[1]))
import numpy as np
import pandas as pd
def count_stacks(ss):
    """Count contiguous stacks found by an inward two-cursor scan of ``ss``.

    One cursor starts at the first character and one at the last. While
    the left cursor sees ``"("`` and the right cursor sees ``")"`` both
    move inward and the current run grows. A run of more than one such
    match is a stack; it is counted once the run is interrupted or the
    cursors meet.

    ss: dot-bracket string made of ``"("``, ``")"`` and ``"."``.
    return: number of stacks (runs of length > 1) encountered.
    """
    stacks = 0
    run_length = 0  # consecutive "(" / ")" matches in the current run
    in_stack = False
    left = 0
    right = len(ss) - 1
    while left < right:
        opening = ss[left]
        closing = ss[right]
        if opening == "(" and closing == ")":
            run_length += 1
            left += 1
            right -= 1
            if run_length > 1:
                in_stack = True
            continue

        # The run is interrupted: close any stack that was in progress.
        if in_stack:
            stacks += 1
            in_stack = False
        run_length = 0

        # Skip unpaired positions on whichever side shows a dot.
        if opening == "(" and closing == ".":
            right -= 1
        elif opening == "." and closing == ")":
            left += 1
        elif opening == "." and closing == ".":
            left += 1
            right -= 1
        else:
            # Neither cursor can advance (e.g. ")" on the left or "(" on
            # the right). Stop here instead of looping forever.
            break

    # A stack that was still open when the cursors met (e.g. "(())") has
    # not been counted yet.
    if in_stack:
        stacks += 1
    return stacks
def contiguity(stem):
    """
    return: contiguity score for stem

    The score is ``log((len(stem) - n_open) / n_stacks)`` where ``n_open``
    is the number of ``"("`` characters in ``stem`` and ``n_stacks`` is the
    value returned by ``count_stacks(stem)``.

    raises: ZeroDivisionError when ``count_stacks`` finds no stack.
    """
    stem_length = len(stem)
    bps = stem.count('(')
    # count number of contiguous stacks
    stacks = count_stacks(stem)
    if stacks == 0:
        # Same exception type the bare division would raise, with a
        # message that points at the actual cause.
        raise ZeroDivisionError(
            "contiguity is undefined for a stem without any stack"
        )
    # NOTE(review): stem_length - bps still includes the closing brackets;
    # if the numerator is meant to be the unpaired count it should be
    # stem_length - 2 * bps -- confirm with the author.
    return np.log((stem_length - bps) / stacks)
def stem_find(ss):
    """
    return: list containing start and end indices of all stems in RNA

    A stem here is a maximal top-level balanced region: it starts at a
    ``"("`` seen while the nesting depth is zero and ends at the ``")"``
    that brings the depth back to zero. Indices are inclusive.

    If the brackets are unbalanced (a ``")"`` with nothing open, or
    brackets left open at the end) ``"UNBALANCED!"`` is printed once and
    the stems that were completed are still returned.
    """
    depth = 0  # number of currently open brackets
    stem_start = None
    unbalanced = False
    stem_indices = []
    for i, b in enumerate(ss):
        if b == "(":
            if depth == 0:
                stem_start = i
            depth += 1
        elif b == ")":
            if depth == 0:
                # Stray closing bracket: report it below instead of
                # failing on an empty stack.
                unbalanced = True
                continue
            depth -= 1
            if depth == 0:
                stem_indices.append((stem_start, i))
        # Any other character (e.g. ".") changes neither depth nor stems.
    if unbalanced or depth != 0:
        print("UNBALANCED!")
    return stem_indices
def mean_contig(ss, stems):
    """
    return: mean over all stems of contiguity(stem) * float(start - end)

    ss: full dot-bracket string.
    stems: iterable of (start, end) inclusive index pairs into ``ss``,
           in the form produced by ``stem_find``.
    """
    # NOTE(review): start - end is negative for any stem longer than one
    # character, so every weight is negative. If the weight is meant to be
    # the stem length this should be end - start (+ 1) -- confirm before
    # relying on the sign of the result.
    contigs = [
        contiguity(ss[start:end + 1]) * float(start - end)
        for start, end in stems
    ]
    return np.mean(contigs)
def ml_contiguity():
    """
    Read the multiloop CSV and print the stems of its first structure.

    Loads "../Data/rnamuts_multiloops.csv" (path relative to the working
    directory), takes the 'structure' column, and prints the first entry
    together with the output of ``stem_find`` for it.
    """
    df = pd.read_csv("../Data/rnamuts_multiloops.csv")
    sss = df['structure']
    for ss in sss:
        stems = stem_find(ss)
        print(ss)
        print(stems)
        # print(mean_contig(ss, stems))
        # Only the first structure is inspected.
        break
def contigs(ss):
    """
    return: number of runs of more than one "(" that are ended by a
            character other than "(" or ")"

    The string is scanned left to right. Each "(" extends the current
    run; ")" is ignored and leaves the run untouched; any other character
    closes the run, which is counted if it held more than one "(".
    A run that is still open at the end of the string is not counted.
    """
    open_run = 0  # "(" characters seen since the last reset
    stacks = 0
    for s in ss:
        if s == "(":
            open_run += 1
        elif s == ")":
            continue
        else:
            if open_run > 1:
                stacks += 1
            open_run = 0
    return stacks
if __name__ == "__main__":
    # Quick manual check of contigs() on a small hard-coded structure.
    # ml_contiguity()
    ss = "((((..)).))"
    print(contigs(ss))
......@@ -1445,3 +1445,13 @@ CONCLUSION: Adaptation time of molecular quasispecies to a given environment is
Publisher = {Oxford Univ Press},
Title = {{NNDB}: the nearest neighbor parameter database for predicting stability of nucleic acid secondary structure},
Year = {2009}}
@article{ivica2013paradox,
title={The paradox of dual roles in the {RNA} world: resolving the conflict between stable folding and templating ability},
author={Ivica, Nikola A and Obermayer, Benedikt and Campbell, Gregory W and Rajamani, Sudha and Gerland, Ulrich and Chen, Irene A},
journal={Journal of molecular evolution},
volume={77},
number={3},
pages={55--63},
year={2013},
publisher={Springer}
}
%!TEX root = main_maternal.tex
\section{Discussion}
We provided evidence that in the absence of selective pressure the structure of the evolutionary landscape could have helped to promote the emergence of an RNA-based form of life. To support our hypothesis, we built a comprehensive representation of the evolutionary landscape of RNA molecules, and investigated scenarios based on distinct hypotheses.
We provided evidence that in the absence of selective pressure the structure of the \st{evolutionary} \hlt{mutational} landscape could have helped to promote the emergence of an RNA-based form of life. To support our hypothesis, we built a comprehensive representation of the \st{evolutionary} \hlt{mutational} landscape of RNA molecules, and investigated scenarios based on distinct hypotheses.
Our results offer solid foundations to parsimonious evolutionary scenarios based on undirected molecular self-replications with occasional mutations. In these simple models, the GC content appears as a key feature to determine the probability of discovering stable multi-branched secondary structures. In particular, intermediate GC contents (i.e. 0.5) result in a drift of the population toward a sub-space of the evolutionary landscape that drastically increases the probability of discovering thermodynamically stable complex shapes essential for the emergence of life at the molecular level.
Our results offer solid foundations to parsimonious evolutionary scenarios based on undirected molecular self-replications with occasional mutations. In these simple models, the GC content appears as a key feature to determine the probability of discovering stable multi-branched secondary structures. In particular, intermediate GC contents (i.e. 0.5) result in a drift of \hlt{randomly replicating} populations toward a sub-space of the evolutionary landscape \hlt{uncovered by \RNAmutants} that drastically increases the probability of discovering thermodynamically stable complex shapes essential for the emergence of life at the molecular level.
The preservation of intermediate GC content values appeared to us as a reasonable assumption, which could reflect the availability of various nucleotides in the prebiotic milieu. This nucleotide composition bias can be interpreted as an intrinsic force that favoured the emergence of life. It also offers novel insights into fundamental properties of the genetic alphabet \citep{Gardner:2003aa}.
......@@ -13,7 +14,7 @@ Eventually, our results could be used to put in perspective earlier findings sug
Our analysis completes recent studies that aimed to characterize fundamental properties of genotype-phenotype maps \citep{Greenbury:2015aa,Manrubia:2017aa}, and showed that their structure may contribute to the emergence of functional molecules \citep{Dingle:2015aa}. It also emphasizes the relevance of theoretical models based on a thermodynamical view of prebiotic evolution \cite{Pascal:2013aa}.
The size of the RNA sequences considered in this study has been fixed at 50 nucleotides. This length appears to be the current upper limit for non-enzymatic synthesis \citep{Hill:1993aa}, and therefore maximizes the expressivity of our evolutionary scenario. Variations of the sizes of populations or lengths of RNA sequences could be eventually considered with the implementation of dedicated algorithms \citep{Waldispuhl:2002aa}. Although we do not expect any major impact on our conclusions.
The size of the RNA sequences considered in this study has been fixed at 50 nucleotides. This length appears to be the current upper limit for non-enzymatic synthesis \citep{Hill:1993aa}, and therefore maximizes the expressivity of our evolutionary scenario. Variations of the sizes of populations or lengths of RNA sequences \hlt{resulting from indels} could eventually be considered with the implementation of dedicated algorithms \citep{Waldispuhl:2002aa}, although we do not expect any major impact on our conclusions.
The error rates considered in this study were chosen to match values used in previous related works (e.g \citep{manrubia2007modular}). This choice is also corroborated by recent experiments suggesting that early life scenarios could sustain high error rates \cite{Rajamani:2010aa}. Nevertheless, lower mutation rates would only increase the number of generations needed to reach the asymptotic behaviour (See \textbf{Fig.~\ref{fig:tamura}}), and thus would not affect our results.
......
......@@ -54,16 +54,16 @@ In the most commonly accepted scenarios, the establishment of a stable, autonomo
Interestingly, \textit{in vitro} experiments revealed the extreme versatility of random nucleic acids \citep{Beaudry:1992aa,Bartel:1993aa,Schultes:2005aa}. Other studies have also suggested that essential RNA molecules such as the hammerhead ribozyme have multiple origins \citep{Salehi-Ashtiani:2001aa}. All together, these observations reinforce the plausibility of a spontaneous emergence of multiple functional sub-units. But they also question us about the likelihood of such events and the existence of intrinsic forces promoting these phenomena.
% models boosting structural complexity
Various theoretical models have been proposed to highlight mechanisms that may have favoured the birth and growth of structural complexity from replications of small monomers. Computational studies have been of tremendous help to validate these theories and quantify their impact. In particular, numerical simulations enabled us to explore the effects of polymerization on mineral surfaces \citep{Szabo:2002aa,Briones:2009aa} or the importance of spatial distribution \citep{Shay:2015aa}. Still, the debate about the necessity for such hypothesis remains open.
Various theoretical models have been proposed to highlight mechanisms that may have favoured the birth and growth of structural complexity from replications of small monomers. Computational studies have been of tremendous help to validate these theories and quantify their impact. In particular, numerical simulations enabled us to explore the effects of polymerization on mineral surfaces \citep{Szabo:2002aa,Briones:2009aa} or the importance of spatial distribution \citep{Shay:2015aa}. \hlt{Another important aspect} of early life models is the tradeoff between stability and structural complexity. Stable folds often lack the complexity necessary to support novel functions but are more resilient to harsh pre-cellular environments~\cite{ivica2013paradox}. \todo{GC content?} Still, the debate about the necessity for such hypotheses remains open.
%\subsection{Our contribution}
In this work, we show that structural complexity can naturally emerge without the help of any sophisticated molecular mechanisms. We reveal subtle topological features of RNA mutational networks that helped to promote the discovery of functional RNAs at the early stages of the RNA world hypothesis. We demonstrate that in the absence of selective pressure, self-replicating RNA populations naturally drift toward a singular region of the sequence landscape enriched in complex structures, allowing for the simultaneous discovery of all molecular components needed to form a complete functional system.
In this work, we show that structural complexity can naturally emerge without the help of any sophisticated molecular mechanisms. We reveal subtle topological features of RNA mutational networks that helped to promote the discovery of functional RNAs at the early stages of the RNA world hypothesis. We demonstrate that in the absence of selective pressure, self-replicating RNA populations naturally drift toward \st{a singular region} \hlt{regions} of the sequence landscape enriched in complex structures, allowing for the simultaneous discovery of all molecular components needed to form a complete functional system.
For the first time, we apply customized algorithms to map secondary structures on all mutant sequences with $50$ nucleotides \citep{Waldispuhl:2008aa,waldispuhl2011unbiased}. This approach considerably expands the scope and significance of comprehensive RNA evolutionary studies that were previously limited to sequences with fewer than $20$ nucleotides \citep{Gruner:1996aa,Cowperthwaite:2008aa}, or restricted to exploring a small fraction of the sequence landscape \citep{stich2008structural,Dingle:2015aa}. This technical breakthrough is essential to observe the formation of complex multi-branched structures often used to carry essential molecular functions that cannot be assembled on shorter sequences.
Our simulations reveal the unexpected presence of a large pool of remarkably stable multi-branched structures in
a region of the RNA mutational landscape characterized by an average distance of $30$ to $40$ mutations from a random sequence, and a balanced GC content (i.e. $0.5$). Strikingly, these multi-branched RNAs have similar energies ($-15 \pm 5\:\kcalmol$) to those observed in the Rfam database \citep{Nawrocki:2015aa} (See {\bf Fig.~\ref{subfig:rfam_stats}}).
a region of the RNA mutational landscape characterized by an average distance of $30$ to $40$ mutations from a random sequence, and a balanced GC content (i.e. $0.5$). Strikingly, these multi-branched RNAs have similar energies ($-15 \pm 5\:\kcalmol$) to those observed in the Rfam database \citep{Nawrocki:2015aa} \hlt{on the same length scale} (See {\bf Fig.~\ref{subfig:rfam_stats}}).
We compare these data to populations that evolved under a selective pressure eliciting stable structures. Although this evolutionary mechanism shows a remarkable capacity to quickly improve the stability of structures, it fails to reproduce the structural complexity observed in RNA families of similar lengths.
Finally, we show that a population of RNA molecules replicating itself with random errors but preserving a balanced GC content \citep{Tamura:1992aa}, naturally evolves toward regions of the landscape enriched with multi-branched structures potentially capable of supporting essential biochemical functions. Our results argue for a simple scenario of the origin of life in which an initial pool of nucleic acids would irresistibly evolve to promote a spontaneous and simultaneous discovery of the basic bricks of life.
Finally, we show that a population of RNA molecules replicating itself \hlt{randomly} with random errors but preserving a balanced GC content \citep{Tamura:1992aa}, naturally evolves toward regions of the landscape enriched with multi-branched structures potentially capable of supporting essential biochemical functions. Our results argue for a simple scenario of the origin of life in which an initial pool of nucleic acids would irresistibly evolve to promote a spontaneous and simultaneous discovery of the basic bricks of life.
......@@ -12,7 +12,9 @@
\usepackage{amsmath}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{todonotes}
\usepackage{tikz}
\usepackage{soul}
\usetikzlibrary{shapes,arrows}
\usepackage[]{algorithm2e}
\usepackage{siunitx}
......@@ -40,6 +42,8 @@
\newcommand{\kcalmol}{\si{\kilo\calorie\per\mol}}
\newcommand{\kcalmolk}{\si{\kilo\calorie\per\mol\per\kelvin}}
\newcommand{\hlt}[1]{\colorbox{pink}{#1}}
\begin{document}
......@@ -53,7 +57,9 @@ $\\\small$^1$ School of Computer Science, McGill University, Montreal, Canada\\\
\begin{abstract}
The RNA world hypothesis relies on the ability of ribonucleic acids to replicate and spontaneously acquire complex structures capable of supporting essential biological functions. Multiple sophisticated evolutionary models have been proposed, but they often assume specific conditions.
%
In this work we explore a simple and parsimonious scenario describing the emergence of complex molecular structures at the early stages of life. We show that at specific GC-content regimes, an undirected replication model is sufficient to explain the apparition of multi-branched RNA secondary structures -- a structural signature of many essential ribozymes. We ran a large scale computational study to map energetically stable structures on complete mutational networks of 50-nucleotide-long RNA sequences. Our results reveal a distinct region of the sequence landscape enriched with multi-branched structures bearing strong similarities to those observed in databases. A random replication mechanism preserving a $50\%$ GC-content suffices to explain a natural drift of RNA populations toward this particular region.
In this work we explore a simple and parsimonious scenario describing the emergence of complex molecular structures at the early stages of life. We show that at specific GC-content regimes, an undirected replication model is sufficient to explain the appearance of multi-branched RNA secondary structures -- a structural signature of many essential ribozymes. We ran a large scale computational study to map energetically stable structures on complete mutational networks of 50-nucleotide-long RNA sequences. Our results reveal \st{a distinct region} \hlt{regions} of the sequence landscape enriched with multi-branched structures bearing strong similarities to those observed in databases. A random replication mechanism preserving a $50\%$ GC-content suffices to explain a natural drift of RNA populations toward \st{this particular region} \hlt{complex stable structures}.
\end{abstract}
\newpage
......@@ -70,7 +76,7 @@ In this work we explore a simple and parsimonious scenario describing the emerge
\input{contributions.tex}
\input{declaration.tex}
%\input{declaration.tex}
\bibliographystyle{unsrtnat}
\bibliography{biblio}
......
......@@ -45,7 +45,7 @@ When a sequence is selected for replication, the child sequence is formed by cop
\subsubsection{Controlling population GC content}
\label{sec:gc_control}
There are two obstacles to maintaining evolving populations within the desired GC content range of $\pm 0.1$. First, an initial population of random sequences sampled uniformly from the full alphabet naturally tends converge to a GC content of $0.5$. To avoid this, we sample from the alphabet with probability of sampling GC and AU equal to the desired GC content. This way our initial population has the desired nucleotide distribution. Second, when running the simulation, random mutations are able to move replicating sequences outside of the desired range. Given that we are selecting for stable structures, it is likely to drive the population to higher GC contents. To avoid this, at the selection stage, we do not select mutations that would take the sequence outside of this range. Instead, if a mutation takes a replicating sequence outside the GC range, we simply repeat the mutation process on the sequence until the child sequence has the appropriate GC content (See {\bf Alg. ~\ref{alg:gc}}).\\
There are two obstacles to maintaining evolving populations within the desired GC content range of $\pm 0.1$. First, an initial population of random sequences sampled uniformly from the full alphabet naturally tends to converge to a GC content of $0.5$. To avoid this, we sample from the alphabet with probability of sampling GC and AU equal to the desired GC content. This way our initial population has the desired nucleotide distribution. Second, when running the simulation, random mutations are able to move replicating sequences outside of the desired range, \hlt{especially at extremes of mutation rate and GC content.} To avoid this drift, at the selection stage, we do not select mutations that would take the sequence outside of this range. Instead, if a mutation takes a replicating sequence outside the GC range, we simply repeat the mutation process on the sequence until the child sequence has the appropriate GC content (See {\bf Alg.~\ref{alg:gc}}). Given that populations are initialized in the appropriate GC range, we are likely to find valid mutants relatively quickly and always avoid drifting away from the target GC.\\
\IncMargin{1em}
\begin{algorithm}[H]
......
This diff is collapsed.
% Title: A LaTeX Template For Responses To a Referees' Reports
% Title: A LaTeX Template For Responses To a Referees' Reports
% Author: Petr Zemek <s3rvac@gmail.com>
% Homepage: https://blog.petrzemek.net/2016/07/17/latex-template-for-responses-to-referees-reports/
% License: CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment