Skip to content

Commit

Permalink
Merge branch 'CW-3284/resfinder_error' into 'dev'
Browse files Browse the repository at this point in the history
Resfinder multiple mutations fix [CW-3284]

See merge request epi2melabs/workflows/wf-bacterial-genomes!116
  • Loading branch information
Christopher Alder committed Mar 6, 2024
2 parents f9c4020 + 860cdc4 commit 6af5457
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 46 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Expand Up @@ -4,12 +4,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
## [v1.2.0]
### Added
- `client_fields` parameter to allow input of a JSON file of key value pairs to display on output reports.
- `min_read_length` parameter to remove reads below specified length (default 1000bp) from downstream analysis, to improve de novo assembly process
- Salmonella serotyping with `SeqSero2`

### Fixed
- Duplicate entries in `Pointfinder` processing

## [v1.1.1]
### Fixed
- Report generation when `Resfinder` fails
Expand Down
2 changes: 0 additions & 2 deletions README.md
Expand Up @@ -71,8 +71,6 @@ The workflow can be run with the demo data using:
nextflow run epi2me-labs/wf-bacterial-genomes \
--fastq wf-bacterial-genomes-demo/isolates_fastq \
--isolates \
--reference_based_assembly \
--reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \
--sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \
-profile standard
```
Expand Down
61 changes: 35 additions & 26 deletions bin/workflow_glue/process_resfinder.py
Expand Up @@ -50,11 +50,11 @@ def read_pointfinder_xml(xml_file):
results['query_end'].append(hsp.query_end)
results['subject_start'].append(hsp.sbjct_start)
results['subject_end'].append(hsp.sbjct_end)
results['query'].append(record.query[:100])
results['query'].append(record.query)

results_df = pd.DataFrame(results)
# Switch start and end if start is greater than end
if (results_df.loc[0, 'query_start'] < results_df.loc[0, 'query_end']):
if (results_df.loc[0, 'query_start'] > results_df.loc[0, 'query_end']):
results_df['query_start'], results_df['query_end'] = results_df[
'query_end'], results_df['query_start']
results_df['subject_start'], results_df['subject_end'] = results_df[
Expand All @@ -80,6 +80,17 @@ def extract_point_bp(subject, mutation):
return (mutation_bp)


def get_global_mutation_position(q_start, q_end, s_start, s_end, loc_pos):
"""Get global mutation position on query depending on query strand orientation."""
if s_start < s_end:
glob_loc_start = q_start + loc_pos
glob_loc_end = q_start + loc_pos + 2
else:
glob_loc_start = q_end - loc_pos
glob_loc_end = q_end - loc_pos - 2
return (glob_loc_start, glob_loc_end)


# Maybe add a test for this?
def convert_pointfinder_row(database_location, row):
"""Convert point finder row."""
Expand All @@ -91,28 +102,26 @@ def convert_pointfinder_row(database_location, row):
pointfinder_blast_data['Mutation'] = row['Mutation']
# TODO: This works for mutations in CDS, but not sure how it will work
# with mutations in promoter regions
bp_position = extract_point_bp(
pointfinder_blast_data.loc[0, 'Subject'],
pointfinder_blast_data.loc[0, 'Mutation'])
pointfinder_blast_data['Local_loc_bp'] = bp_position
start_loc = pointfinder_blast_data.loc[0, 'query_start']
end_loc = pointfinder_blast_data.loc[0, 'query_end']
# Getting round the fact that the res-gene may match in the reverse
# orientation, and therefore the start and end positions are reversed
if (start_loc > end_loc):
pointfinder_blast_data['Global_loc_bp'] = start_loc - bp_position
# End of codon
pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \
bp_position + 2
else:
pointfinder_blast_data['Global_loc_bp'] = end_loc + bp_position
# TODO: should this be a + or -
pointfinder_blast_data['Global_loc_bp_end'] = start_loc - \
bp_position + 2

pointfinder_blast_data['Local_loc_bp'] = pointfinder_blast_data.apply(
lambda row: extract_point_bp(row["Subject"], row["Mutation"]), axis=1
)
# Find start and end of mutation on query depending on orientation of gene
pointfinder_blast_data[
['Global_loc_bp', 'Global_loc_bp_end']
] = pointfinder_blast_data.apply(
lambda row: get_global_mutation_position(
row["query_start"],
row["query_end"],
row["subject_start"],
row["subject_end"],
row["Local_loc_bp"]
), axis=1, result_type="expand"
)
return pointfinder_blast_data


def process_pointfinder(pointfinder_data, output_file, database_location):
def process_pointfinder(pointfinder_data, database_location):
"""Select right columns and split loc column."""
# TODO: This is all messy and needs refactoring
if len(pointfinder_data) == 0:
Expand All @@ -125,24 +134,22 @@ def process_pointfinder(pointfinder_data, output_file, database_location):
except IndexError:
gene_name = pointfinder_data['Mutation']
pointfinder_data["Resistance gene"] = gene_name
mutation_names = pointfinder_data['Mutation']
loc_split = pointfinder_data['Mutation'].str.split(
" ", n=1, expand=True)
pointfinder_data = pointfinder_data.assign(
Sequence=loc_split[0], Mutation=loc_split[1])
pointfinder_data['Query_loc'] = 0
df_list = []
for key, row in pointfinder_data.iterrows():
# multiple entries can be found in xml so iterate over rows
for _, row in pointfinder_data.iterrows():
pointfinder_blast_data = convert_pointfinder_row(
database_location, row)
df_list.append(pointfinder_blast_data)

concat_df = pd.concat(df_list)
# split the mutation and get the base position
concat_df_merge = concat_df.merge(pointfinder_data, on='Mutation')
concat_df_merge_tidy = concat_df_merge.iloc[
:, [4, 8, 9, 12, 10, 11, 13, 14]]
concat_df_merge_tidy.index = mutation_names
concat_df_merge_tidy = concat_df_merge_tidy.assign(Type='pointfinder')
out_columns = [
'Contig', 'Start', 'End', 'Phenotype', 'Identity/Nucleotide',
Expand All @@ -159,8 +166,10 @@ def main(args):
resfinder_results = process_resfinder(resfinder_data)
if (args.pointfinder_file != "NOT_RUN"):
pointfinder_data = pd.read_csv(args.pointfinder_file, sep="\t")
# Drop duplicates - entries will be recovered in process_pointfinder
pointfinder_data = pointfinder_data.drop_duplicates()
pointfinder_results = process_pointfinder(
pointfinder_data, args.output, args.database_location)
pointfinder_data, args.database_location)
else:
pointfinder_results = pd.DataFrame()
if (args.disinf_file != "NOT_RUN"):
Expand Down
2 changes: 0 additions & 2 deletions docs/04_install_and_run.md
Expand Up @@ -27,8 +27,6 @@ The workflow can be run with the demo data using:
nextflow run epi2me-labs/wf-bacterial-genomes \
--fastq wf-bacterial-genomes-demo/isolates_fastq \
--isolates \
--reference_based_assembly \
--reference wf-bacterial-genomes-demo/ref/ref.fasta.gz \
--sample_sheet wf-bacterial-genomes-demo/isolates_sample_sheet.csv \
-profile standard
```
Expand Down
16 changes: 8 additions & 8 deletions modules/local/checkpoints.nf
Expand Up @@ -48,7 +48,7 @@ process accumulateCheckpoints {
process ingressCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), val(status)
output:
Expand All @@ -75,7 +75,7 @@ process ingressCheckpoint {
process alignmentCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), path(bam, stageAs: "bam/*"), path(bai, stageAs: "bai/*")
output:
Expand All @@ -101,7 +101,7 @@ process alignmentCheckpoint {
process assemblyCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), path(fasta)
output:
Expand All @@ -127,7 +127,7 @@ process assemblyCheckpoint {
process variantCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), val(status)
output:
Expand All @@ -153,7 +153,7 @@ process variantCheckpoint {
process amrCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), val(status)
output:
Expand All @@ -178,7 +178,7 @@ process amrCheckpoint {
process annotationCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), val(status)
output:
Expand All @@ -203,7 +203,7 @@ process annotationCheckpoint {
process perSampleReportingCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
tuple val(meta), val(status)
output:
Expand All @@ -227,7 +227,7 @@ process perSampleReportingCheckpoint {
process reportingCheckpoint {
label "wf_common"
cpus 1
memory "250 MB"
memory "1 GB"
input:
path report
output:
Expand Down
9 changes: 5 additions & 4 deletions modules/local/isolates.nf
Expand Up @@ -16,7 +16,7 @@ process mlstSearch {
process getPointfinderSpecies {
label "wfbacterialgenomes"
cpus 1
memory "500 MB"
memory "2 GB"
input:
tuple val(meta), path("${meta.alias}.mlst.json")
output:
Expand All @@ -41,7 +41,8 @@ process resfinder {
tuple val(meta), path("${meta.alias}_resfinder_results"), val(species)
script:
"""
gunzip -c input_genome.fasta.gz > input_genome.fasta
# sed added to remove basecaller config from fasta + resfinder table
gunzip -c input_genome.fasta.gz | sed '/^>/ s/ .*//' > input_genome.fasta
python -m resfinder \
-o ${meta.alias}_resfinder_results \
Expand All @@ -61,7 +62,7 @@ process processResfinder {
// Disinfection not processed yet (CW-2106)
label "wfbacterialgenomes"
cpus 2
memory "500 MB"
memory "2 GB"
input:
tuple val(meta), path("${meta.alias}_resfinder_results"), val(species)
output:
Expand All @@ -86,7 +87,7 @@ process processResfinder {
process serotyping {
label "seqsero2"
cpus 1
memory "7 GB"
memory "3 GB"
errorStrategy 'ignore'
input:
tuple val(meta), path("input_genome.fasta.gz"), val(species)
Expand Down
4 changes: 1 addition & 3 deletions nextflow.config
Expand Up @@ -43,8 +43,6 @@ params {
example_cmd = [
"--fastq 'wf-bacterial-genomes-demo/isolates_fastq'",
"--isolates",
"--reference_based_assembly",
"--reference 'wf-bacterial-genomes-demo/ref/ref.fasta.gz'",
"--sample_sheet 'wf-bacterial-genomes-demo/isolates_sample_sheet.csv'"
]
common_sha = "sha1c5febff9f75143710826498b093d9769a5edbb9"
Expand All @@ -63,7 +61,7 @@ manifest {
description = 'Assembly, variant calling, and annotation of bacterial genomes.'
mainScript = 'main.nf'
nextflowVersion = '>=23.04.2'
version = 'v1.1.1'
version = 'v1.2.0'
}

epi2melabs {
Expand Down

0 comments on commit 6af5457

Please sign in to comment.