Merge branch 'CW-3284/resfinder_error' into 'dev'

Resfinder multiple mutations fix [CW-3284] See merge request epi2melabs/workflows/wf-bacterial-genomes!116
epi2me-labs · Mar 6, 2024 · 6af5457 · 6af5457
2 parents f9c4020 + 860cdc4
commit 6af5457
Show file tree

Hide file tree

Showing 7 changed files with 53 additions and 46 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unreleased]
+## [v1.2.0]
 ### Added
 - `client_fields` parameter to allow input of a JSON file of key value pairs to display on output reports.
 - `min_read_length` parameter to remove reads below specified length (default 1000bp) from downstream analysis, to improve de novo assembly process
 - Salmonella serotyping with `SeqSero2`
 
+### Fixed
+- Duplicate entries in `Pointfinder` processing
+
 ## [v1.1.1]
 ### Fixed
 - Report generation when `Resfinder` fails

diff --git a/README.md b/README.md
@@ -71,8 +71,6 @@ The workflow can be run with the demo data using:
 nextflow run epi2me-labs/wf-bacterial-genomes \
     --fastq wf-bacterial-genomes-demo/isolates_fastq \
     --isolates \
-    --reference_based_assembly \
-    --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \
     --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \
     -profile standard
 ```

diff --git a/bin/workflow_glue/process_resfinder.py b/bin/workflow_glue/process_resfinder.py
@@ -50,11 +50,11 @@ def read_pointfinder_xml(xml_file):
                     results['query_end'].append(hsp.query_end)
                     results['subject_start'].append(hsp.sbjct_start)
                     results['subject_end'].append(hsp.sbjct_end)
-                    results['query'].append(record.query[:100])
+                    results['query'].append(record.query)
 
     results_df = pd.DataFrame(results)
     # Switch start and end if start is greater than end
-    if (results_df.loc[0, 'query_start'] < results_df.loc[0, 'query_end']):
+    if (results_df.loc[0, 'query_start'] > results_df.loc[0, 'query_end']):
         results_df['query_start'], results_df['query_end'] = results_df[
             'query_end'], results_df['query_start']
         results_df['subject_start'], results_df['subject_end'] = results_df[
@@ -80,6 +80,17 @@ def extract_point_bp(subject, mutation):
         return (mutation_bp)
 
 
+def get_global_mutation_position(q_start, q_end, s_start, s_end, loc_pos):
+    """Get global mutation position on query depending on query strand orientation."""
+    if s_start < s_end:
+        glob_loc_start = q_start + loc_pos
+        glob_loc_end = q_start + loc_pos + 2
+    else:
+        glob_loc_start = q_end - loc_pos
+        glob_loc_end = q_end - loc_pos - 2
+    return (glob_loc_start, glob_loc_end)
+
+
 # Maybe add a test for this?
 def convert_pointfinder_row(database_location, row):
     """Convert point finder row."""
@@ -91,28 +102,26 @@ def convert_pointfinder_row(database_location, row):
     pointfinder_blast_data['Mutation'] = row['Mutation']
     # TODO: This works for mutations in CDS, but not sure how it will work
     # with mutations in promoter regions
-    bp_position = extract_point_bp(
-        pointfinder_blast_data.loc[0, 'Subject'],
-        pointfinder_blast_data.loc[0, 'Mutation'])
-    pointfinder_blast_data['Local_loc_bp'] = bp_position
-    start_loc = pointfinder_blast_data.loc[0, 'query_start']
-    end_loc = pointfinder_blast_data.loc[0, 'query_end']
-    # Getting round the fact that the res-gene may match in the reverse
-    # orientation, and therefore the start and end positions are reversed
-    if (start_loc > end_loc):
-        pointfinder_blast_data['Global_loc_bp'] = start_loc - bp_position
-        # End of codon
-        pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \
-            bp_position + 2
-    else:
-        pointfinder_blast_data['Global_loc_bp'] = end_loc + bp_position
-        # TODO: should this be a + or -
-        pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \
-            bp_position + 2
+
+    pointfinder_blast_data['Local_loc_bp'] = pointfinder_blast_data.apply(
+        lambda row: extract_point_bp(row["Subject"], row["Mutation"]), axis=1
+        )
+    # Find start and end of mutation on query depending on orientation of gene
+    pointfinder_blast_data[
+        ['Global_loc_bp', 'Global_loc_bp_end']
+        ] = pointfinder_blast_data.apply(
+            lambda row: get_global_mutation_position(
+                row["query_start"],
+                row["query_end"],
+                row["subject_start"],
+                row["subject_end"],
+                row["Local_loc_bp"]
+            ), axis=1, result_type="expand"
+        )
     return pointfinder_blast_data
 
 
-def process_pointfinder(pointfinder_data, output_file, database_location):
+def process_pointfinder(pointfinder_data, database_location):
     """Select right columns and split loc column."""
     # TODO: This is all messy and needs refactoring
     if len(pointfinder_data) == 0:
@@ -125,24 +134,22 @@ def process_pointfinder(pointfinder_data, output_file, database_location):
     except IndexError:
         gene_name = pointfinder_data['Mutation']
     pointfinder_data["Resistance gene"] = gene_name
-    mutation_names = pointfinder_data['Mutation']
     loc_split = pointfinder_data['Mutation'].str.split(
         " ", n=1, expand=True)
     pointfinder_data = pointfinder_data.assign(
         Sequence=loc_split[0], Mutation=loc_split[1])
     pointfinder_data['Query_loc'] = 0
     df_list = []
-    for key, row in pointfinder_data.iterrows():
+    # multiple entries can be found in xml so iterate over rows
+    for _, row in pointfinder_data.iterrows():
         pointfinder_blast_data = convert_pointfinder_row(
             database_location, row)
         df_list.append(pointfinder_blast_data)
 
     concat_df = pd.concat(df_list)
-    # split the mutation and get the base position
     concat_df_merge = concat_df.merge(pointfinder_data, on='Mutation')
     concat_df_merge_tidy = concat_df_merge.iloc[
         :, [4, 8, 9, 12, 10, 11, 13, 14]]
-    concat_df_merge_tidy.index = mutation_names
     concat_df_merge_tidy = concat_df_merge_tidy.assign(Type='pointfinder')
     out_columns = [
         'Contig', 'Start', 'End', 'Phenotype', 'Identity/Nucleotide',
@@ -159,8 +166,10 @@ def main(args):
     resfinder_results = process_resfinder(resfinder_data)
     if (args.pointfinder_file != "NOT_RUN"):
         pointfinder_data = pd.read_csv(args.pointfinder_file, sep="\t")
+        # Drop duplicates - entries will be recovered in process_pointfinder
+        pointfinder_data = pointfinder_data.drop_duplicates()
         pointfinder_results = process_pointfinder(
-            pointfinder_data, args.output, args.database_location)
+            pointfinder_data, args.database_location)
     else:
         pointfinder_results = pd.DataFrame()
     if (args.disinf_file != "NOT_RUN"):

diff --git a/docs/04_install_and_run.md b/docs/04_install_and_run.md
@@ -27,8 +27,6 @@ The workflow can be run with the demo data using:
 nextflow run epi2me-labs/wf-bacterial-genomes \
     --fastq wf-bacterial-genomes-demo/isolates_fastq \
     --isolates \
-    --reference_based_assembly \
-    --reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \
     --sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \
     -profile standard
 ```

diff --git a/modules/local/checkpoints.nf b/modules/local/checkpoints.nf
@@ -48,7 +48,7 @@ process accumulateCheckpoints {
 process ingressCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), val(status)
   output:
@@ -75,7 +75,7 @@ process ingressCheckpoint {
 process alignmentCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), path(bam, stageAs: "bam/*"), path(bai, stageAs: "bai/*")
   output:
@@ -101,7 +101,7 @@ process alignmentCheckpoint {
 process assemblyCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), path(fasta)
   output:
@@ -127,7 +127,7 @@ process assemblyCheckpoint {
 process variantCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), val(status)
   output:
@@ -153,7 +153,7 @@ process variantCheckpoint {
 process amrCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), val(status)
   output:
@@ -178,7 +178,7 @@ process amrCheckpoint {
 process annotationCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), val(status)
   output:
@@ -203,7 +203,7 @@ process annotationCheckpoint {
 process perSampleReportingCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     tuple val(meta), val(status)
   output:
@@ -227,7 +227,7 @@ process perSampleReportingCheckpoint {
 process reportingCheckpoint {
   label "wf_common"
   cpus 1
-  memory "250 MB"
+  memory "1 GB"
   input:
     path report
   output:

diff --git a/modules/local/isolates.nf b/modules/local/isolates.nf
@@ -16,7 +16,7 @@ process mlstSearch {
 process getPointfinderSpecies {
     label "wfbacterialgenomes"
     cpus 1
-    memory "500 MB"
+    memory "2 GB"
     input:
         tuple val(meta), path("${meta.alias}.mlst.json")
     output:
@@ -41,7 +41,8 @@ process resfinder {
         tuple val(meta), path("${meta.alias}_resfinder_results"), val(species)
     script:
     """
-    gunzip -c input_genome.fasta.gz > input_genome.fasta
+    # sed added to remove basecaller config from fasta +  resfinder table 
+    gunzip -c input_genome.fasta.gz | sed '/^>/ s/ .*//' > input_genome.fasta
 
     python -m resfinder \
         -o ${meta.alias}_resfinder_results \
@@ -61,7 +62,7 @@ process processResfinder {
     // Disinfection not processed yet (CW-2106)
     label "wfbacterialgenomes"
     cpus 2
-    memory "500 MB"
+    memory "2 GB"
     input:
         tuple val(meta), path("${meta.alias}_resfinder_results"), val(species)
     output:
@@ -86,7 +87,7 @@ process processResfinder {
 process serotyping {
     label "seqsero2"
     cpus 1
-    memory "7 GB"
+    memory "3 GB"
     errorStrategy 'ignore'
     input: 
         tuple val(meta), path("input_genome.fasta.gz"), val(species)

diff --git a/nextflow.config b/nextflow.config
@@ -43,8 +43,6 @@ params {
       example_cmd = [
             "--fastq 'wf-bacterial-genomes-demo/isolates_fastq'",
             "--isolates",
-            "--reference_based_assembly",
-            "--reference 'wf-bacterial-genomes-demo/ref/ref.fasta.gz'",
             "--sample_sheet 'wf-bacterial-genomes-demo/isolates_sample_sheet.csv'"
       ]
       common_sha = "sha1c5febff9f75143710826498b093d9769a5edbb9"
@@ -63,7 +61,7 @@ manifest {
     description     = 'Assembly, variant calling, and annotation of bacterial genomes.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = 'v1.1.1'
+    version         = 'v1.2.0'
 }
 
 epi2melabs {