Skip to content

Commit

Permalink
Merge branch 'CW-2063-samplesheet' into 'dev'
Browse files Browse the repository at this point in the history
add sample_sheet test

Closes CW-2063

See merge request epi2melabs/workflows/wf-bacterial-genomes!61
  • Loading branch information
sarahjeeeze committed May 10, 2023
2 parents bf688ee + 6ccdf41 commit b96bacb
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 38 deletions.
5 changes: 5 additions & 0 deletions .gitlab-ci.yml
Expand Up @@ -31,6 +31,11 @@ docker-run:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \
--basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2"
NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"
- if: $MATRIX_NAME == "sample-sheet"
variables:
NF_WORKFLOW_OPTS: "--fastq test_data/fastq --reference_based_assembly --reference test_data/ref/reference.subseq.fa.gz --threads 4 --chunk_size 100000 \
--basecaller_cfg dna_r10.4.1_e8.2_400bps_hac@v3.5.2 --sample_sheet test_data/sample_sheet.csv"
NF_IGNORE_PROCESSES: "deNovo,assemblyStats,runProkka"


# reminder: update AUX_IMAGE_TAG if the aux container package versions are changed
Expand Down
3 changes: 2 additions & 1 deletion CHANGELOG.md
Expand Up @@ -4,9 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased]
## [v0.2.13]
### Fixed
- Rows with too few / too many columns in `medaka_models.tsv`.
- Check sample sheet script.
### Changed
- Now uses new `fastq_ingress` implementation.

Expand Down
110 changes: 80 additions & 30 deletions bin/workflow_glue/check_sample_sheet.py
@@ -1,43 +1,93 @@
"""Script to check that sample sheet is well-formatted."""
"""Check if a sample sheet is valid."""
import csv
import sys

import pandas as pd

from .util import get_named_logger, wf_parser # noqa: ABS101


def _report_and_exit(message):
    """Write `message` to STDOUT and terminate the script.

    The message is written to STDOUT (not STDERR) and the script exits with the
    default status on purpose: the calling workflow process captures STDOUT and
    treats any text found there as the validation error message.

    :param message: description of the problem found in the sample sheet.
    :raises SystemExit: always.
    """
    sys.stdout.write(message)
    sys.exit()


def main(args):
    """Run the entry point.

    Check the sample sheet at `args.sample_sheet` and report the first problem
    found on STDOUT. Nothing is written to STDOUT for a valid sheet. The checks
    are:

    * every row has as many cells as the header,
    * the `barcode` and `alias` columns exist and hold unique values,
    * values in the optional `type` column are one of the allowed sample types,
    * every entry of `args.required_sample_types` (if given) is an allowed type
      and occurs at least once in the `type` column.

    :param args: parsed arguments with the attributes `sample_sheet` and
        `required_sample_types`.
    :raises SystemExit: after reporting a problem with the sample sheet.
    """
    logger = get_named_logger("checkSheet")

    barcodes = []
    aliases = []
    sample_types = []
    allowed_sample_types = [
        "test_sample", "positive_control", "negative_control", "no_template_control"
    ]

    try:
        # `newline=""` is what the `csv` module expects so that it can handle
        # line endings (including ones embedded in quoted cells) itself
        with open(args.sample_sheet, "r", newline="") as f:
            csv_reader = csv.DictReader(f)
            # rows are numbered from 1, not counting the header line
            for n_row, row in enumerate(csv_reader, start=1):
                # `DictReader` hides ragged rows: surplus cells are collected
                # under the key `None` and missing cells are filled in with the
                # value `None`. Comparing `len(row)` between rows can therefore
                # not find short rows; look for the `None` markers instead.
                if None in row or None in row.values():
                    raise ValueError(
                        f"Unexpected number of cells in row number {n_row}."
                    )
                try:
                    barcodes.append(row["barcode"])
                except KeyError:
                    _report_and_exit("'barcode' column missing")
                try:
                    aliases.append(row["alias"])
                except KeyError:
                    _report_and_exit("'alias' column missing")
                # the `type` column is optional
                if "type" in row:
                    sample_types.append(row["type"])
    except Exception as e:
        # `SystemExit` is not an `Exception`, so the exits above pass through
        _report_and_exit(f"Parsing error: {e}")

    # check barcode and alias values are unique
    if len(barcodes) > len(set(barcodes)):
        _report_and_exit("values in 'barcode' column not unique")
    if len(aliases) > len(set(aliases)):
        _report_and_exit("values in 'alias' column not unique")

    if sample_types:
        # check if "type" column has unexpected values
        unexp_type_vals = set(sample_types) - set(allowed_sample_types)

        if unexp_type_vals:
            _report_and_exit(
                f"found unexpected values in 'type' column: {unexp_type_vals}. "
                f"Allowed values are: {allowed_sample_types}"
            )

    if args.required_sample_types:
        for required_type in args.required_sample_types:
            if required_type not in allowed_sample_types:
                _report_and_exit(f"Not an allowed sample type: {required_type}")
            if required_type not in sample_types:
                _report_and_exit(
                    f"Sample sheet requires at least 1 of {required_type}")

    logger.info(f"Checked sample sheet {args.sample_sheet}.")


def argparser():
    """Argument parser for entrypoint.

    The parser takes the sample sheet path as its only positional argument and
    an optional `--required_sample_types` list (zero or more values).

    :return: the parser created by `wf_parser`.
    """
    parser = wf_parser("check_sample_sheet")
    parser.add_argument("sample_sheet", help="Sample sheet to check")
    parser.add_argument(
        "--required_sample_types",
        help="List of required sample types. Each sample type provided must "
             "appear at least once in the sample sheet",
        nargs="*"
    )
    return parser
19 changes: 13 additions & 6 deletions lib/fastqingress.nf
Expand Up @@ -149,7 +149,7 @@ def watch_path(Map margs) {
// add metadata from sample sheet (we can't use join here since it does not work
// with repeated keys; we therefore need to transform the sample sheet data into
// a map with the barcodes as keys)
def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet))
def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types)
| collect
| map { it.collectEntries { [(it["barcode"]): it] } }
// now we can use this channel to annotate all files with the corresponding info
Expand Down Expand Up @@ -247,6 +247,7 @@ Map parse_arguments(Map arguments) {
"analyse_unclassified": false,
"fastcat_stats": false,
"fastcat_extra_args": "",
"required_sample_types": [],
"watch_path": false],
name: "fastq_ingress")
return parser.parse_args(arguments)
Expand Down Expand Up @@ -319,7 +320,7 @@ def get_valid_inputs(Map margs){
// filter based on sample sheet in case one was provided
if (margs.sample_sheet) {
// get channel of entries in the sample sheet
def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet))
def ch_sample_sheet = get_sample_sheet(file(margs.sample_sheet), margs.required_sample_types)
// get the union of both channels (missing values will be replaced with
// `null`)
def ch_union = Channel.fromPath(sub_dirs_with_fastq_files).map {
Expand Down Expand Up @@ -396,7 +397,7 @@ ArrayList get_fq_files_in_dir(Path dir) {
* @param sample_sheet: path to the sample sheet CSV
* @return: channel of maps (with values in sample sheet header as keys)
*/
def get_sample_sheet(Path sample_sheet) {
def get_sample_sheet(Path sample_sheet, ArrayList required_sample_types) {
// If `validate_sample_sheet` does not return an error message, we can assume that
// the sample sheet is valid and parse it. However, because of Nextflow's
// asynchronous magic, we might emit values from `.splitCSV()` before the
Expand All @@ -405,7 +406,7 @@ def get_sample_sheet(Path sample_sheet) {
// in STDOUT. Thus, we use the somewhat clunky construct with `concat` and `last`
// below. This lets the CSV channel only start to emit once the error checking is
// done.
ch_err = validate_sample_sheet(sample_sheet).map {
ch_err = validate_sample_sheet(sample_sheet, required_sample_types).map {
// check if there was an error message
if (it) error "Invalid sample sheet: ${it}."
it
Expand All @@ -425,13 +426,19 @@ def get_sample_sheet(Path sample_sheet) {
* message is emitted.
*
* @param csv: path to sample sheet CSV
* @param required_sample_types: list of required sample types (optional)
* @return: string (optional)
*/
process validate_sample_sheet {
    label params.process_label
    input:
        path csv
        val required_sample_types
    output: stdout
    script:
    // only add the `--required_sample_types` option when at least one type was
    // requested; an empty / null list results in an empty string
    String req_types_arg = required_sample_types ? "--required_sample_types "+required_sample_types.join(" ") : ""
    // the check script reports problems on STDOUT, which is this process's output
    """
    workflow-glue check_sample_sheet $csv $req_types_arg
    """
}

2 changes: 1 addition & 1 deletion nextflow.config
Expand Up @@ -49,7 +49,7 @@ manifest {
description = 'Assembly, variant calling, and annotation of bacterial genomes.'
mainScript = 'main.nf'
nextflowVersion = '>=20.10.0'
version = 'v0.2.12'
version = 'v0.2.13'
}

epi2melabs {
Expand Down
3 changes: 3 additions & 0 deletions test_data/sample_sheet.csv
@@ -0,0 +1,3 @@
barcode,alias,type
barcode01,sample1,test_sample
barcode02,sample2,test_sample

0 comments on commit b96bacb

Please sign in to comment.