From 860cdc43b23dc2c3a62983705f6c5764db14bbd7 Mon Sep 17 00:00:00 2001 From: Christopher Alder Date: Wed, 6 Mar 2024 12:28:22 +0000 Subject: [PATCH] Resfinder multiple mutations fix [CW-3284] --- CHANGELOG.md | 5 ++- README.md | 2 - bin/workflow_glue/process_resfinder.py | 61 +++++++++++++++----------- docs/04_install_and_run.md | 2 - modules/local/checkpoints.nf | 16 +++---- modules/local/isolates.nf | 9 ++-- nextflow.config | 4 +- 7 files changed, 53 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1240def..08f9326 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [v1.2.0] ### Added - `client_fields` parameter to allow input of a JSON file of key value pairs to display on output reports. - `min_read_length` parameter to remove reads below specified length (default 1000bp) from downstream analysis, to improve de novo assembly process - Salmonella serotyping with `SeqSero2` +### Fixed +- Duplicate entries in `Pointfinder` processing + ## [v1.1.1] ### Fixed - Report generation when `Resfinder` fails diff --git a/README.md b/README.md index 37b4083..986d7b5 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,6 @@ The workflow can be run with the demo data using: nextflow run epi2me-labs/wf-bacterial-genomes \ --fastq wf-bacterial-genomes-demo/isolates_fastq \ --isolates \ - --reference_based_assembly \ - --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \ --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \ -profile standard ``` diff --git a/bin/workflow_glue/process_resfinder.py b/bin/workflow_glue/process_resfinder.py index 899cce7..a4cacab 100755 --- a/bin/workflow_glue/process_resfinder.py +++ b/bin/workflow_glue/process_resfinder.py @@ -50,11 +50,11 @@ def read_pointfinder_xml(xml_file): results['query_end'].append(hsp.query_end) results['subject_start'].append(hsp.sbjct_start) results['subject_end'].append(hsp.sbjct_end) - results['query'].append(record.query[:100]) + results['query'].append(record.query) results_df = pd.DataFrame(results) # Switch start and end if start is greater than end - if (results_df.loc[0, 'query_start'] < results_df.loc[0, 'query_end']): + if (results_df.loc[0, 'query_start'] > results_df.loc[0, 'query_end']): results_df['query_start'], results_df['query_end'] = results_df[ 'query_end'], results_df['query_start'] results_df['subject_start'], results_df['subject_end'] = results_df[ @@ -80,6 +80,17 @@ def extract_point_bp(subject, mutation): return (mutation_bp) +def get_global_mutation_position(q_start, q_end, s_start, s_end, loc_pos): + """Get global mutation position on query depending on query strand orientation.""" + if s_start < s_end: + glob_loc_start = q_start + loc_pos + glob_loc_end = q_start + loc_pos + 2 + else: + glob_loc_start = q_end - loc_pos + glob_loc_end = q_end - loc_pos - 2 + return (glob_loc_start, glob_loc_end) + + # Maybe add a test for this? def convert_pointfinder_row(database_location, row): """Convert point finder row.""" @@ -91,28 +102,26 @@ def convert_pointfinder_row(database_location, row): pointfinder_blast_data['Mutation'] = row['Mutation'] # TODO: This works for mutations in CDS, but not sure how it will work # with mutations in promoter regions - bp_position = extract_point_bp( - pointfinder_blast_data.loc[0, 'Subject'], - pointfinder_blast_data.loc[0, 'Mutation']) - pointfinder_blast_data['Local_loc_bp'] = bp_position - start_loc = pointfinder_blast_data.loc[0, 'query_start'] - end_loc = pointfinder_blast_data.loc[0, 'query_end'] - # Getting round the fact that the res-gene may match in the reverse - # orientation, and therefore the start and end positions are reversed - if (start_loc > end_loc): - pointfinder_blast_data['Global_loc_bp'] = start_loc - bp_position - # End of codon - pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \ - bp_position + 2 - else: - pointfinder_blast_data['Global_loc_bp'] = end_loc + bp_position - # TODO: should this be a + or - - pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \ - bp_position + 2 + + pointfinder_blast_data['Local_loc_bp'] = pointfinder_blast_data.apply( + lambda row: extract_point_bp(row["Subject"], row["Mutation"]), axis=1 + ) + # Find start and end of mutation on query depending on orientation of gene + pointfinder_blast_data[ + ['Global_loc_bp', 'Global_loc_bp_end'] + ] = pointfinder_blast_data.apply( + lambda row: get_global_mutation_position( + row["query_start"], + row["query_end"], + row["subject_start"], + row["subject_end"], + row["Local_loc_bp"] + ), axis=1, result_type="expand" + ) return pointfinder_blast_data -def process_pointfinder(pointfinder_data, output_file, database_location): +def process_pointfinder(pointfinder_data, database_location): """Select right columns and split loc column.""" # TODO: This is all messy and needs refactoring if len(pointfinder_data) == 0: @@ -125,24 +134,22 @@ def process_pointfinder(pointfinder_data, output_file, database_location): except IndexError: gene_name = pointfinder_data['Mutation'] pointfinder_data["Resistance gene"] = gene_name - mutation_names = pointfinder_data['Mutation'] loc_split = pointfinder_data['Mutation'].str.split( " ", n=1, expand=True) pointfinder_data = pointfinder_data.assign( Sequence=loc_split[0], Mutation=loc_split[1]) pointfinder_data['Query_loc'] = 0 df_list = [] - for key, row in pointfinder_data.iterrows(): + # multiple entries can be found in xml so iterate over rows + for _, row in pointfinder_data.iterrows(): pointfinder_blast_data = convert_pointfinder_row( database_location, row) df_list.append(pointfinder_blast_data) concat_df = pd.concat(df_list) - # split the mutation and get the base position concat_df_merge = concat_df.merge(pointfinder_data, on='Mutation') concat_df_merge_tidy = concat_df_merge.iloc[ :, [4, 8, 9, 12, 10, 11, 13, 14]] - concat_df_merge_tidy.index = mutation_names concat_df_merge_tidy = concat_df_merge_tidy.assign(Type='pointfinder') out_columns = [ 'Contig', 'Start', 'End', 'Phenotype', 'Identity/Nucleotide', @@ -159,8 +166,10 @@ def main(args): resfinder_results = process_resfinder(resfinder_data) if (args.pointfinder_file != "NOT_RUN"): pointfinder_data = pd.read_csv(args.pointfinder_file, sep="\t") + # Drop duplicates - entries will be recovered in process_pointfinder + pointfinder_data = pointfinder_data.drop_duplicates() pointfinder_results = process_pointfinder( - pointfinder_data, args.output, args.database_location) + pointfinder_data, args.database_location) else: pointfinder_results = pd.DataFrame() if (args.disinf_file != "NOT_RUN"): diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md index e08db56..2fd44da 100644 --- a/docs/04_install_and_run.md +++ b/docs/04_install_and_run.md @@ -27,8 +27,6 @@ The workflow can be run with the demo data using: nextflow run epi2me-labs/wf-bacterial-genomes \ --fastq wf-bacterial-genomes-demo/isolates_fastq \ --isolates \ - --reference_based_assembly \ - --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \ --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \ -profile standard ``` diff --git a/modules/local/checkpoints.nf b/modules/local/checkpoints.nf index 5a42d47..ff78e45 100644 --- a/modules/local/checkpoints.nf +++ b/modules/local/checkpoints.nf @@ -48,7 +48,7 @@ process accumulateCheckpoints { process ingressCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), val(status) output: @@ -75,7 +75,7 @@ process ingressCheckpoint { process alignmentCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), path(bam, stageAs: "bam/*"), path(bai, stageAs: "bai/*") output: @@ -101,7 +101,7 @@ process alignmentCheckpoint { process assemblyCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), path(fasta) output: @@ -127,7 +127,7 @@ process assemblyCheckpoint { process variantCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), val(status) output: @@ -153,7 +153,7 @@ process variantCheckpoint { process amrCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), val(status) output: @@ -178,7 +178,7 @@ process amrCheckpoint { process annotationCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), val(status) output: @@ -203,7 +203,7 @@ process annotationCheckpoint { process perSampleReportingCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: tuple val(meta), val(status) output: @@ -227,7 +227,7 @@ process perSampleReportingCheckpoint { process reportingCheckpoint { label "wf_common" cpus 1 - memory "250 MB" + memory "1 GB" input: path report output: diff --git a/modules/local/isolates.nf b/modules/local/isolates.nf index 44b54a3..44388ee 100644 --- a/modules/local/isolates.nf +++ b/modules/local/isolates.nf @@ -16,7 +16,7 @@ process mlstSearch { process getPointfinderSpecies { label "wfbacterialgenomes" cpus 1 - memory "500 MB" + memory "2 GB" input: tuple val(meta), path("${meta.alias}.mlst.json") output: @@ -41,7 +41,8 @@ process resfinder { tuple val(meta), path("${meta.alias}_resfinder_results"), val(species) script: """ - gunzip -c input_genome.fasta.gz > input_genome.fasta + # sed added to remove basecaller config from fasta + resfinder table + gunzip -c input_genome.fasta.gz | sed '/^>/ s/ .*//' > input_genome.fasta python -m resfinder \ -o ${meta.alias}_resfinder_results \ @@ -61,7 +62,7 @@ process processResfinder { // Disinfection not processed yet (CW-2106) label "wfbacterialgenomes" cpus 2 - memory "500 MB" + memory "2 GB" input: tuple val(meta), path("${meta.alias}_resfinder_results"), val(species) output: @@ -86,7 +87,7 @@ process processResfinder { process serotyping { label "seqsero2" cpus 1 - memory "7 GB" + memory "3 GB" errorStrategy 'ignore' input: tuple val(meta), path("input_genome.fasta.gz"), val(species) diff --git a/nextflow.config b/nextflow.config index 2bfcefa..8d5e3a8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -43,8 +43,6 @@ params { example_cmd = [ "--fastq 'wf-bacterial-genomes-demo/isolates_fastq'", "--isolates", - "--reference_based_assembly", - "--reference 'wf-bacterial-genomes-demo/ref/ref.fasta.gz'", "--sample_sheet 'wf-bacterial-genomes-demo/isolates_sample_sheet.csv'" ] common_sha = "sha1c5febff9f75143710826498b093d9769a5edbb9" @@ -63,7 +61,7 @@ manifest { description = 'Assembly, variant calling, and annotation of bacterial genomes.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v1.1.1' + version = 'v1.2.0' } epi2melabs {