Merge branch 'CW-3533' into 'dev'

Add run IDs to output summary CSVs [CW-3533]. Closes CW-3533. See merge request epi2melabs/workflows/wf-cas9!53.
epi2me-labs · Feb 15, 2024 · baf8246 · baf8246
2 parents 0927951 + 77a0230
commit baf8246
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 5 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.1.0]
+### Added
+- A column with sequencing run IDs to the output summary CSV files.
+
+### Changed
+- Minor formatting changes to the GitHub issue template.
+
 ## [v1.0.1]
 ### Fixed
 - Overestimation of kbases mapped values.
@@ -14,7 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Memory and CPU requirements for each process.
 
 ### Changed
-- Documentation updated to new format. 
+- Documentation updated to new format.
 - Bumped minimum required Nextflow version to '>=23.04.2'
 - Publish target coverage output to output directory.
 
@@ -25,7 +32,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 - Input file names and directories can now contain spaces.
-- Replaced --threads option in fastqingress with hardcoded values to remove warning about undefined `param.threads`. 
+- Replaced --threads option in fastqingress with hardcoded values to remove warning about undefined `param.threads`.
 - Handling for file and directory names that contain spaces.
 
 ## [v0.1.11]

diff --git a/bin/workflow_glue/build_tables.py b/bin/workflow_glue/build_tables.py
@@ -47,7 +47,7 @@ def main(args):
     """Entry point."""
     header = [
         'chr', 'start', 'end', 'target', 'nreads', 'nbases',
-        'tsize', 'coverage_frac', 'median_cov', 'p', 'n', 'sample_id']
+        'tsize', 'coverage_frac', 'median_cov', 'p', 'n', 'sample_id', 'run_ids']
 
     frames = []
 
@@ -125,7 +125,7 @@ def main(args):
             'mean_read_length': 1})
 
         df_all = df_all[[
-            'sample', 'chr', 'start', 'end', 'target', 'tsize',
+            'sample', 'run_ids', 'chr', 'start', 'end', 'target', 'tsize',
             'kbases', 'coverage_frac', 'median_cov', 'nreads',
             'mean_read_length', 'strand_bias']]
         df_all.sort_values(
@@ -149,11 +149,16 @@ def main(args):
         sample_df['nreads'] = df['nreads'].sum()
         sample_df = sample_df.round(2)
         sample_df['sample_id'] = sid
+        # the `run_ids` column should contain the same value for all rows for this
+        # sample
+        sample_df["run_ids"], = df["run_ids"].unique()
         dfs.append(sample_df)
 
     if dfs:
         sample_summary = pd.concat(dfs, axis=1).T
         sample_summary.set_index('sample_id', drop=True, inplace=True)
+        # move the `run_ids` column to the beginning of the dataframe
+        sample_summary.insert(0, "run_ids", sample_summary.pop("run_ids"))
     else:
         sample_summary = pd.DataFrame()
 

diff --git a/main.nf b/main.nf
@@ -210,6 +210,9 @@ process target_summary {
     # Add sample_id column
     sed -i "s/\$/\t${meta.alias}/" ${meta.alias}_target_summary.bed
 
+    # Add run_ids column
+    sed -i "s/\$/\t${meta.run_ids.join(',')}/" ${meta.alias}_target_summary.bed
+
     rm median_coverage.bed pos.bed neg.bed
     """
 }

diff --git a/nextflow.config b/nextflow.config
@@ -53,7 +53,7 @@ manifest {
     description     = 'Summarise the results of Cas9 enrichment sequencing.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = 'v1.0.1'
+    version         = 'v1.1.0'
 
 }