Merge branch 'CW-2063-samplesheet' into 'dev'

Add sample_sheet test. Closes CW-2063. See merge request epi2melabs/workflows/wf-bacterial-genomes!61
epi2me-labs · May 10, 2023 · b96bacb · b96bacb
2 parents bf688ee + 6ccdf41
commit b96bacb
Show file tree

Hide file tree

Showing 6 changed files with 104 additions and 38 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -31,6 +31,11 @@ docker-run:
               NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \
               --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2"
               NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
+        - if: $MATRIX_NAME == "sample-sheet"
+          variables:
+              NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \
+              --basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2 --sample_sheet test_data/sample_sheet.csv"
+              NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
 
 
 # reminder: update AUX_IMAGE_TAG if the aux container package versions are changed

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [unreleased]
+## [v0.2.13]
 ### Fixed
 - Rows with too few / too many columns in `medaka_models.tsv`.
+- Check sample sheet script.
 ### Changed
 - Now uses new `fastq_ingress` implementation.
 

diff --git a/bin/workflow_glue/check_sample_sheet.py b/bin/workflow_glue/check_sample_sheet.py
@@ -1,43 +1,93 @@
-"""Script to check that sample sheet is well-formatted."""
+"""Check if a sample sheet is valid."""
+import csv
 import sys
 
-import pandas as pd
-
 from .util import get_named_logger, wf_parser  # noqa: ABS101
 
 
 def main(args):
-    """Run entry point."""
-    logger = get_named_logger("check-sheet")
+    """Run the entry point."""
+    logger = get_named_logger("checkSheet")
+
+    barcodes = []
+    aliases = []
+    sample_types = []
+    allowed_sample_types = [
+        "test_sample", "positive_control", "negative_control", "no_template_control"
+        ]
 
     try:
-        logger.info(f"Reading {args.sample_sheet}.")
-        samples = pd.read_csv(args.sample_sheet, sep=None)
-        if 'alias' in samples.columns:
-            if 'sample_id' in samples.columns:
-                sys.stderr.write(
-                    "Warning: sample sheet contains both 'alias' and "
-                    'sample_id, using the former.')
-            samples['sample_id'] = samples['alias']
-        if not set(['sample_id', 'barcode']).intersection(samples.columns):
-            raise IOError()
-    except Exception:
-        raise IOError(
-            "Could not parse sample sheet, it must contain two columns "
-            "named 'barcode' and 'sample_id' or 'alias'.")
-    # check duplicates
-    dup_bc = samples['barcode'].duplicated()
-    dup_sample = samples['sample_id'].duplicated()
-    if any(dup_bc) or any(dup_sample):
-        raise IOError(
-            "Sample sheet contains duplicate values.")
-    samples.to_csv(args.output, sep=",", index=False)
-    logger.info(f"Written cleaned-up sheet to {args.output}.")
+        with open(args.sample_sheet, "r") as f:
+            csv_reader = csv.DictReader(f)
+            n_row = 0
+            for row in csv_reader:
+                n_row += 1
+                if n_row == 1:
+                    n_cols = len(row)
+                else:
+                    # check we got the same number of fields
+                    if len(row) != n_cols:
+                        raise ValueError(
+                            f"Unexpected number of cells in row number {n_row}."
+                        )
+                try:
+                    barcodes.append(row["barcode"])
+                except KeyError:
+                    sys.stdout.write("'barcode' column missing")
+                    sys.exit()
+                try:
+                    aliases.append(row["alias"])
+                except KeyError:
+                    sys.stdout.write("'alias' column missing")
+                    sys.exit()
+                try:
+                    sample_types.append(row["type"])
+                except KeyError:
+                    pass
+    except Exception as e:
+        sys.stdout.write(f"Parsing error: {e}")
+        sys.exit()
+
+    # check barcode and alias values are unique
+    if len(barcodes) > len(set(barcodes)):
+        sys.stdout.write("values in 'barcode' column not unique")
+        sys.exit()
+    if len(aliases) > len(set(aliases)):
+        sys.stdout.write("values in 'alias' column not unique")
+        sys.exit()
+
+    if sample_types:
+        # check if "type" column has unexpected values
+        unexp_type_vals = set(sample_types) - set(allowed_sample_types)
+
+        if unexp_type_vals:
+            sys.stdout.write(
+                f"found unexpected values in 'type' column: {unexp_type_vals}. "
+                f"Allowed values are: {allowed_sample_types}"
+            )
+            sys.exit()
+
+        if args.required_sample_types:
+            for required_type in args.required_sample_types:
+                if required_type not in allowed_sample_types:
+                    sys.stdout.write(f"Not an allowed sample type: {required_type}")
+                    sys.exit()
+                if sample_types.count(required_type) < 1:
+                    sys.stdout.write(
+                        f"Sample sheet requires at least 1 of {required_type}")
+                    sys.exit()
+
+    logger.info(f"Checked sample sheet {args.sample_sheet}.")
 
 
 def argparser():
     """Argument parser for entrypoint."""
-    parser = wf_parser("check-sample-sheet")
-    parser.add_argument('sample_sheet')
-    parser.add_argument('output')
+    parser = wf_parser("check_sample_sheet")
+    parser.add_argument("sample_sheet", help="Sample sheet to check")
+    parser.add_argument(
+        "--required_sample_types",
+        help="List of required sample types. Each sample type provided must "
+             "appear at least once in the sample sheet",
+        nargs="*"
+    )
     return parser
diff --git a/lib/fastqingress.nf b/lib/fastqingress.nf
@@ -149,7 +149,7 @@ def watch_path(Map margs) {
         // add metadata from sample sheet (we can't use join here since it does not work
         // with repeated keys; we therefore need to transform the sample sheet data into
         // a map with the barcodes as keys)
-        def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet))
+        def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types)
         | collect
         | map { it.collectEntries { [(it["barcode"]): it] } }
         // now we can use this channel to annotate all files with the corresponding info
@@ -247,6 +247,7 @@ Map parse_arguments(Map arguments) {
                 "analyse_unclassified": false,
                 "fastcat_stats": false,
                 "fastcat_extra_args": "",
+                "required_sample_types": [],
                 "watch_path": false],
         name: "fastq_ingress")
     return parser.parse_args(arguments)
@@ -319,7 +320,7 @@ def get_valid_inputs(Map margs){
             // filter based on sample sheet in case one was provided
             if (margs.sample_sheet) {
                 // get channel of entries in the sample sheet
-                def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet))
+                def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types)
                 // get the union of both channels (missing values will be replaced with
                 // `null`)
                 def ch_union = Channel.fromPath(sub_dirs_with_fastq_files).map {
@@ -396,7 +397,7 @@ ArrayList get_fq_files_in_dir(Path dir) {
  * @param sample_sheet: path to the sample sheet CSV
  * @return: channel of maps (with values in sample sheet header as keys)
  */
-def get_sample_sheet(Path sample_sheet) {
+def get_sample_sheet(Path sample_sheet, ArrayList required_sample_types) {
     // If `validate_sample_sheet` does not return an error message, we can assume that
     // the sample sheet is valid and parse it. However, because of Nextflow's
     // asynchronous magic, we might emit values from `.splitCSV()` before the
@@ -405,7 +406,7 @@ def get_sample_sheet(Path sample_sheet) {
     // in STDOUT. Thus, we use the somewhat clunky construct with `concat` and `last`
     // below. This lets the CSV channel only start to emit once the error checking is
     // done.
-    ch_err = validate_sample_sheet(sample_sheet).map {
+    ch_err = validate_sample_sheet(sample_sheet, required_sample_types).map {
         // check if there was an error message
         if (it) error "Invalid sample sheet: ${it}."
         it
@@ -425,13 +426,19 @@ def get_sample_sheet(Path sample_sheet) {
  * message is emitted.
  *
  * @param: path to sample sheet CSV
+ * @param: list of required sample types (optional)
  * @return: string (optional)
  */
 process validate_sample_sheet {
     label params.process_label
-    input: path csv
+    input: 
+        path csv
+        val required_sample_types
     output: stdout
+    script:
+    String req_types_arg = required_sample_types ? "--required_sample_types "+required_sample_types.join(" ") : ""
     """
-    workflow-glue check_sample_sheet $csv
+    workflow-glue check_sample_sheet $csv $req_types_arg
     """
 }
+
diff --git a/nextflow.config b/nextflow.config
@@ -49,7 +49,7 @@ manifest {
     description     = 'Assembly, variant calling, and annotation of bacterial genomes.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=20.10.0'
-    version         = 'v0.2.12'
+    version         = 'v0.2.13'
 }
 
 epi2melabs {

diff --git a/test_data/sample_sheet.csv b/test_data/sample_sheet.csv
@@ -0,0 +1,3 @@
+barcode,alias,type
+barcode01,sample1,test_sample
+barcode02,sample2,test_sample