Skip to content

Commit

Permalink
version and compatibility updates
Browse files Browse the repository at this point in the history
  • Loading branch information
spond committed Nov 15, 2022
1 parent 53d7dec commit 128af4b
Show file tree
Hide file tree
Showing 539 changed files with 27,171 additions and 799,748 deletions.
Binary file modified HBL/.DS_Store
Binary file not shown.
21 changes: 14 additions & 7 deletions HBL/data_filter-2.bf
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
RequireVersion ("2.4.0");

filter.retain_all_sequences = "QVOA|OGV";

LoadFunctionLibrary ("libv3/tasks/alignments.bf");
LoadFunctionLibrary ("libv3/convenience/regexp.bf");
LoadFunctionLibrary ("libv3/IOFunctions.bf");



filter.analysis_description = {terms.io.info :
"
Map a protein MSA back onto nucleotide sequences
Expand Down Expand Up @@ -74,18 +77,22 @@ fprintf (filter.path, CLEAR_FILE);

utility.SetEnvVariable ("DATA_FILE_PRINT_FORMAT",9);


utility.ForEach (filter.splits, "_sequences_", 'filter.compress ( _sequences_)');
filter.splits["filter.compress"][""];
//utility.ForEachPair (filter.splits, "_key_", "_sequences_", 'filter.compress ( _sequences_, _key_)');

filter.count = 0;


function filter.compress (s) {
function filter.compress (k, s) {
if (utility.Array1D (s)) {
filter.selector = utility.SwapKeysAndValues (s);
DataSetFilter filter.subfilter = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "Retained `alignments.CompressDuplicateSequences ('filter.subfilter','filter.subfilter.compressed', TRUE)` unique sequences");
if (filter.count > 0) {
if (None == regexp.Find (k, filter.retain_all_sequences) ) {
DataSetFilter filter.subfilter = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "`k` : retained `alignments.CompressDuplicateSequences ('filter.subfilter','filter.subfilter.compressed', TRUE)` unique sequences out of " + filter.subfilter.species + " total sequences");
} else {
DataSetFilter filter.subfilter.compressed = CreateFilter (filter.dataset, 1, "", filter.selector / filter.names[speciesIndex]);
io.ReportProgressMessage ("UNIQUE SEQUENCES", "`k` ALL " + filter.subfilter.compressed.species + " sequences");
}
if (filter.count > 0) {
fprintf (filter.path, "\n", filter.subfilter.compressed);
} else {
fprintf (filter.path, filter.subfilter.compressed);
Expand Down
23 changes: 16 additions & 7 deletions HBL/data_filter.bf
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,21 @@ io.ReportProgressMessage ("Data QC", "Loaded `filter.nuc_data[terms.data.sequenc

/* split off QVOA reads */

filter.partitionining_expressions = {"QVOA" : "(QVOA)|(OGV)"};
filter.regex = "(QVOA)|(OGV)|(DNA)|(xxxx)|(BAR[0-9]+)";
filter.partitionining_expressions = {"QVOA" : filter.regex};
filter.partitoned_sequence_names = regexp.PartitionByRegularExpressions(alignments.GetSequenceNames ("filter.raw_data"), filter.partitionining_expressions);

//io.CheckAssertion ("Abs(filter.partitoned_sequence_names['(QVOA)|(OGV)'])>0", "There were no sequences marked as QVOA in the input file");

KeywordArgument ("qvoa", "Write QVOA sequences here ", filter.nuc_data[terms.data.file] + "_qvoa.fas");
filter.qvoa_path = io.PromptUserForFilePath ("Save QVOA sequences to");
io.ReportProgressMessage ("Data QC", "Found `Abs(filter.partitoned_sequence_names['(QVOA)|(OGV)'])` QVOA sequences in the file; writing them to **`filter.qvoa_path`**");
io.ReportProgressMessage ("Data QC", "Found `Abs(filter.partitoned_sequence_names[filter.regex])` QVOA sequences in the file; writing them to **`filter.qvoa_path`**");
alignments.GetSequenceByName ("filter.raw_data", None);
fprintf (filter.qvoa_path, CLEAR_FILE, KEEP_OPEN);

filter.qvoa = {};

utility.ForEach (filter.partitoned_sequence_names["(QVOA)|(OGV)"], "_seq_record_",
utility.ForEach (filter.partitoned_sequence_names[filter.regex], "_seq_record_",
'
sanitized_name = ((_seq_record_ ^ {{"_xxxx_000WPI"}{""}}) ^ {{"OGV_"}{""}}) + "_QVOA";
filter.qvoa [sanitized_name] = alignments.StripGaps (alignments.GetSequenceByName ("filter.raw_data", _seq_record_));
Expand Down Expand Up @@ -165,11 +166,13 @@ if (Abs(filter.frameshifted)) {
io.ReportProgressBar ("filter","Processing sequence " + filter.seq_count);
filter.cleaned = IgSCUEAL.align_sequence_to_reference_set (filter.RNA_reads[_sequence_], filter.ref_seq, filter.options);

//if (_sequence_ == "CAP206_159wpi_ENV_2_all_aligned_loops_trimmed_98_2") {
//if (_sequence_ == "PIDCAP380_PH380_F1_2_23_W5half_post_ART_DNA_BAR1_Seq1_Phase3_NumReads240_P17_QVOA") {
// console.log (filter.cleaned);
//}

//console.log (filter.cleaned);

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"])^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);

(filter.sequences_with_copies[filter.RNA_reads[_sequence_]])["_write_to_file"][""];
Expand All @@ -192,7 +195,7 @@ utility.ForEachPair (filter.clean_seqs, "_sequence_", "_value_",
// console.log (filter.cleaned);
//}

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]) ^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);
(filter.sequences_with_copies[filter.RNA_reads[_sequence_]])["_write_to_file"][""];
filter.seq_count += 1;
Expand All @@ -206,7 +209,13 @@ io.ReportProgressMessage ("Data QC", "Performing QA on QVOA reads");
utility.ForEachPair (filter.qvoa, "_sequence_", "_value_",
'
filter.cleaned = IgSCUEAL.align_sequence_to_reference_set (_value_, filter.ref_seq, filter.options);
filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]);

if (None == filter.cleaned) {
console.log (_sequence_ + " failed to map to RNA references");
return;
}

filtered.aa_seq = alignments.StripGaps(filter.cleaned["AA"]) ^ {{"\\?","X"}};
filtered.na_seq = IgSCUEAL.strip_in_frame_indels(filter.cleaned["QRY"]);

fprintf (filter.combined_protein_path, ">", _sequence_, "\n", filtered.aa_seq, "\n");
Expand Down
27 changes: 25 additions & 2 deletions HBL/lib/igscueal.bf
Original file line number Diff line number Diff line change
Expand Up @@ -627,7 +627,24 @@ namespace IgSCUEAL {
return {'aligned': sequence, 'stripped' : sequence ^ {{"\-",""}}};
}

lfunction strip_non_letters (sequence) {

/*
    Strip in-frame gap codons from a nucleotide sequence.

    Walks the sequence codon-by-codon (steps of 3) and rebuilds it keeping
    only codons composed entirely of unambiguous nucleotides [ACGT].
    A full-codon gap ('---') is silently dropped (this is the "in-frame gap"
    being stripped). Any other codon — a partial gap or an ambiguous/other
    character — causes the whole sequence to be rejected by returning null.

    Argument:  sequence — nucleotide string; assumed to be in frame and of
               length divisible by 3 (not checked here — a trailing partial
               codon would be compared against the full 3-char patterns).
    Returns:   {'aligned' : <the ORIGINAL input sequence, unmodified>,
                'stripped': <concatenation of the retained ACGT codons>}
               or null if any non-ACGT, non-'---' codon is encountered.
*/
lfunction strip_in_frame_gaps (sequence) {
// 'stripped * N' initializes a HyPhy string buffer with capacity N;
// subsequent 'stripped * codon' calls append to it.
stripped = ""; stripped * Abs (sequence);
for (i = 0; i < Abs (sequence); i+=3) {
// sequence[i][i+2] extracts the 3-character codon at offset i.
codon = sequence[i][i+2];
// '$' is the regex-match operator; element [0] of the result is the start
// index of the match, so '== 0' means the codon matched from its first
// character (with a 3-char codon vs. 3-char pattern, a full match).
if ((codon $ "[ACGT][ACGT][ACGT]" )[0] == 0) {
stripped * codon;
} else {
// Anything that is neither pure ACGT nor a clean codon-wide gap
// (e.g. 'A-A', 'NNN') invalidates the sequence entirely.
if (codon != '---') {
return null;
}
}
}
// 'stripped * 0' finalizes the string buffer.
stripped * 0;
return {'aligned': sequence, 'stripped' : stripped};
}

lfunction strip_non_letters (sequence) {
return {'aligned': sequence, 'stripped' : sequence ^ {{"[^A-Z,a-z]",""}}};
}

Expand Down Expand Up @@ -664,7 +681,13 @@ namespace IgSCUEAL {
result = {'SITES' : data["sites"]};

utility.ForEach (align_with_these, "_value_",
"`&result`[_value_] = IgSCUEAL.strip_gaps (alignments.GetSequenceByName ('`namespace`', _value_));"
"
`&result`[_value_] = IgSCUEAL.strip_in_frame_gaps (alignments.GetSequenceByName ('`namespace`', _value_));
if (`&result`[_value_] == None) {
`&result` - _value_;
}
"

);


Expand Down
94 changes: 72 additions & 22 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,24 +1,25 @@
# Timing of QVOA strains using RNA pre-treatment data

<a href="https://zenodo.org/badge/latestdoi/181484600"><img src="https://zenodo.org/badge/181484600.svg" alt="DOI"></a>
> Updated Nov 2022 to provide compatibility with HyPhy 2.5.x and support additional options, like not compressing identical sequences
## Overview

This pipeline **times** outgrowth virus (OGV) strains from a single host using serially sampled RNA data. Four different approaches are used to assign dates to unobserved strains. First, each tree is rooted to maximize the root-to-tip to sampling time correlation coefficient.
This pipeline **times** outgrowth virus (OGV) strains from a single host using longitudinally sampled RNA data to provide temporal information. Four different approaches are used to time strains with unobserved dates (OGV or DNA strains). The phylogenetic tree for each genomic segment is rooted to maximize the root-to-tip to sampling time correlation coefficient first.

### Patristic distance
Given a tree with OGV and RNA strains, this approach simply finds the nearest NGS neighbor (in partistic, i.e. path length sense) for each OGV sequence. To assess reliability of this assignment, we also identify all **K≥1** sequences within a factor of 2x of this smallest distance, and extract the consensus vote to assign the date. The **support** value if the proportion of **K** neighboring sequences that come from the same timepoint. In the example below, there are **9** RNA neighbors to the blue QVOA sequence. The closest one is `CAP288_4260_184WPI_ENV_C1C2_NGS_020_0_006` and the other two are with 2x distance. The blue sequence will be assigned to the `184WPI` timepoint, because most of the neighbors are from that timepoint, and the support value will be `8/9`, because one of teh neighbors has a timestamp of `174WPI`
Given a tree with OGV and RNA strains, this approach simply finds the nearest RNA neighbor (in [patristic](https://link.springer.com/referenceworkentry/10.1007/978-1-4020-6754-9_12406), i.e., path length sense) for each OGV sequence. To assess reliability of this assignment, we also identify all **K≥1** sequences within a factor of `2x` of this smallest distance, and extract the consensus vote to assign the date. The **support** value is the proportion of **K** neighboring sequences that come from the same timepoint. In the example below, there are **9** RNA neighbors to the blue QVOA sequence. The closest one is `CAP288_4260_184WPI_ENV_C1C2_NGS_020_0_006` and the others are within 2x of that distance. The blue sequence will be assigned to the `184WPI` timepoint (WPI = weeks post infection), because most of the neighbors are from that timepoint, and the support value will be `8/9`, because one of the neighbors has a timestamp of `174WPI`

![img1](img/1.png)

### Clade support
Here, we start with a QVOA sequence, and go up the tree until we encounter the first internal node that has both QVOA and NGS descendants and that has a minimum of 0.90 bootstrap support. We then assign a timestamp and support to the QVOA sequence using the same majority rule as for the patristic distance for all the nodes in the clade. In the example below, to date `CAP288_NEF_1_W11_QVOA`, we walk up the tree to the first supported node (blue) and look at its RNA descendants, of which there are **6** AND 5/6 come from `184WPI`, yielding the assignment of `184WPI` for the QVOA sequence and support of 5/6.
Here, we start with a QVOA sequence, and go up the tree until we encounter the first internal node that has **both** QVOA and RNA descendants and that has a minimum of `0.90` bootstrap support. We then assign a timestamp and support to the QVOA sequence using the same majority rule as for the patristic distance for all the nodes in the clade. In the example below, to date `CAP288_NEF_1_W11_QVOA`, we walk up the tree to the first supported node (blue) and look at its RNA descendants, of which there are **6** AND 5/6 come from `184WPI`, yielding the assignment of `184WPI` for the QVOA sequence and support of 5/6.

![img1](img/2.png)

### Placement

Phylogenetic placement using modified IgSCUEAL. The support value is the model weight ([0-1]) assigned to the branches with teh corresponding label
Phylogenetic placement using a modified [IgSCUEAL](https://royalsocietypublishing.org/doi/full/10.1098/rstb.2014.0240) tool. The support value is the model weight (in [0-1]) assigned to the branches with the corresponding label

### Regression

Expand All @@ -30,18 +31,53 @@ The linear regression molecular clock method, essentially the same as in [the Jo

### Dependencies

1. `HyPhy v2.4` -- the `develop` branch from [github.com/veg/hyphy](github.com/veg/hyphy)
2. `muscle` for MSA generation
1. `HyPhy v2.5.42` -- or the `develop` branch from [github.com/veg/hyphy](github.com/veg/hyphy)
2. `MAFFT` for MSA generation
3. `FastTree` for phylogeny inference
4. `phylotree.js` for tree processing and visualization [note; there is a custom JS script for performing dating steps that is included in `scripts`]
4. `phylotree.js` (an `npm` package) for tree processing and visualization [note; there is a custom JS script for performing dating steps that is included in `scripts`]
5. `snakemake` for workflow management
6. `node` for executing JavaScript
7. `BioPython`

See notes on configuration below and please run
```
npm install
```
from the base directory


#### Workflow

The workflow for sample classification is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow, encoded in the `Snakemake` file, and can be graphically summarized as follows (for a specific sample):
The workflow for sample classification is implemented as a [Snakemake](https://snakemake.readthedocs.io/en/stable/) workflow, encoded in the `Snakemake` file, and can be graphically summarized as follows (for a specific sample)...

![img1](img/3.png)

The pipeline requires an input `samples.json` file, with pathnames of files to process specified relative to the `data` directory, e.g.

```
{
'samples': [
"CAP188/CAP188_ENV_2_all_hap.fasta",
"CAP188/CAP188_ENV_3_all_hap.fasta",
"CAP188/CAP188_ENV_4_all_hap.fasta",
"CAP188/CAP188_NEF_1_all_hap.fasta",
"CAP206/CAP206_ENV_2_all_hap.fasta",
"CAP206/CAP206_ENV_4_all_hap.fasta",
"CAP206/CAP206_GAG_1_all_hap.fasta",
"CAP217/CAP217_ENV_2_all_hap.fasta",
"CAP217/CAP217_ENV_3_all_hap.fasta",
"CAP217/CAP217_ENV_4_all_hap.fasta",
"CAP257/CAP257_ENV_2_all_hap.fasta",
"CAP257/CAP257_ENV_3_all_hap.fasta",
"CAP257/CAP257_ENV_4_all_hap.fasta",
```

The pipeline is executed by calling `snakemake`, e.g.

```
$snakemake --cores 4
```

After the pipeline has completed, you need to run

```
Expand All @@ -55,38 +91,43 @@ to generate the overall summary CSV file.
The high level directory structure contains the following components

```
├── HBL [HyPhy scripts]
│   ├── data
├── HBL [HyPhy scripts]
│   ├── data_filter-2.bf
│   ├── data_filter.bf
│   ├── lib
│   └── scripts
├── README.md [This file]
├── Snakefile [Snakemake configuration file]
├── data [unprocessed FASTA files]
├── LICENSE
├── package.json [npm package manifest]
├── README.md [This file]
├── Snakefile [Snakemake configuration file]
├── data [unprocessed FASTA files]
│   ├── CAP188
│   ├── CAP206
│   ├── CAP217
│   ├── CAP257
│   ├── CAP268
│   ├── CAP280
│   ├── CAP287
│   ├── CAP288
│   ├── CAP302
│   ├── CAP316
│   ├── CAP336
│   └── conversion.json. [Color codes and WPI to pre-ART conversions]
├── img [Images for this file]
│   ├── CAP372
│   └── conversion.json [Color codes and WPI to pre-ART conversions]
├── img [Images for this file]
│   ├── 1.png
│   ├── 2.png
│   └── 3.pdf
│   └── 3.png
├── results [Results generated by the pipeline]
│   ├── alignments [Alignments]
│   ├── dating [QVOA/OGV dating results]
│   └── trees [Trees inferred by the pipeline]
├── samples.json [Manifest of files to be processed by the pipeline]
├── scripts
│   └── result-summary.py[Penultimate summary script]
└── scripts
├── compute-distance.js [Classifier/timing script]
└── result-summary.py [Penultimate summary script]
```

```

Within specific results directories, the following files are created **for each input FASTA**

Expand Down Expand Up @@ -159,13 +200,22 @@ CAP188_4260_185WPI_ENV_C2C3_NGS_046_0.002

3. The resulting in-frame sequences were translated to amino-acids, mapping all ambiguous codons to `?` (e.g. `A-A`).

4. A multiple sequence alignment (MSA) was generated from translated protein sequences using `muscle`
4. A multiple sequence alignment (MSA) was generated from translated protein sequences using `MAFFT`

5. Original codon sequences are mapped to aligned protein sequences using a script in `HyPhy`. Identical sequences (which do not contribute any information to phylogenetic analyses) are removed at this stage.

6. ML phylogenetic trees are reconstructed using `FastTree` (slow option) for RNA sequences **only** (these are later used for phylogenetic placement) and for RNA+QVOA

7. The pipeline will **compress** identical sequences, i.e. only one copy will be retained. The copy number will (including single copies, i.e. `1`) will be appended to sequence names.
7. The pipeline will **compress** identical RNA sequences, i.e. only one copy will be retained. The copy number (including single copies, i.e. `1`) will be appended to sequence names. It will NOT compress QVOA sequences, defined as those with `QVOA` or `OGV` in their name.

> **Important:** adjust the paths to various executables, like HYPHY and FastTree, in the `Snakefile` before running the pipeline
> **Important:** adjust the paths to various executables, like `hyphy` and `FastTree`, in the `Snakefile` before running the pipeline
```
callables = {
'hyphy' : '/usr/local/bin/hyphy',
'mafft' : '/usr/local/bin/mafft --auto ',
'raxml' : '/usr/local/bin/raxml-ng',
'fasttree' : '/usr/local/bin/FastTree',
'classifier' : 'scripts/compute-distance.js'
}
```

0 comments on commit 128af4b

Please sign in to comment.