Skip to content

Commit

Permalink
Batch amplicon plots (#251)
Browse files Browse the repository at this point in the history
* Error out if HDR amplicon matches existing amplicon

* Add check for amplicon sequence uniqueness

* Fix bug with bam_input not having bam_output

* Test for no returned lines in auto mode, version bump to 2.2.11

* Fix pandas deprecation of df.append
  • Loading branch information
kclem committed Oct 9, 2022
1 parent 726b2b9 commit 6a2b342
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 99 deletions.
191 changes: 97 additions & 94 deletions CRISPResso2/CRISPRessoBatchCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,17 @@ def main():
crispresso2_info['results']['general_plots']['summary_plot_titles'] = {}
crispresso2_info['results']['general_plots']['summary_plot_labels'] = {}
crispresso2_info['results']['general_plots']['summary_plot_datas'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'] = []
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'] = {}

crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'] = []
crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'] = {}

# report for amplicons
for amplicon_index, amplicon_seq in enumerate(all_amplicons):
Expand Down Expand Up @@ -684,105 +695,97 @@ def main():
crispresso2_info['results']['general_plots']['summary_plot_labels'][plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to +' conversion rates for the amplicon ' + amplicon_name
crispresso2_info['results']['general_plots']['summary_plot_datas'][plot_name] = [('Nucleotide frequencies', os.path.basename(nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename(modification_frequency_summary_filename))]

# allele modification frequency heatmap and line plots
if not args.suppress_plots and not args.suppress_batch_summary_plots:
if guides_all_same:
sgRNA_intervals = [consensus_sgRNA_intervals] * modification_frequency_summary_df.shape[0]
else:
sgRNA_intervals = [consensus_sgRNA_intervals]
for modification_type in ['Insertions', 'Deletions', 'Substitutions']:
modification_df = modification_frequency_summary_df[
modification_frequency_summary_df['Modification'] == modification_type
]
modification_df.index = [
'{0} ({1})'.format(batch, batch_index)
for batch_index, batch in enumerate(
modification_df['Batch'], 1,
)
]
modification_df = modification_df.drop(
['Modification', 'Batch'], axis=1,
)
modification_df.columns = [
'{0} ({1})'.format(column, position)
for position, column in
enumerate(modification_df.columns, 1)
]
plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_heatmap'.format(modification_type.lower(), amplicon_name)
plot_path = '{0}.html'.format(_jp(plot_name))

allele_modification_heatmap_input = {
'sample_values': modification_df,
'sample_sgRNA_intervals': sgRNA_intervals,
'plot_path': plot_path,
'title': modification_type,
}
plot(
CRISPRessoPlot.plot_allele_modification_heatmap,
allele_modification_heatmap_input,
)

crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'].append(plot_name)
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'][plot_name] = plot_path
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
modification_type,
amplicon_name,
)
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'][plot_name] = 'Each row is a sample and each column is a position in the amplicon sequence. Each cell shows the percentage of {0} for the sample at that position relative to the amplicon. Guides for each sample are identified by a black rectangle.'.format(modification_type.lower())
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'][plot_name] = [
(
'CRISPRessoBatch Modification Frequency Summary',
os.path.basename(
modification_frequency_summary_filename,
),
),
]

plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_line'.format(modification_type.lower(), amplicon_name)
plot_path = '{0}.html'.format(_jp(plot_name))

allele_modification_line_input = {
'sample_values': modification_df,
'sample_sgRNA_intervals': sgRNA_intervals,
'plot_path': plot_path,
'title': modification_type,
}
plot(
CRISPRessoPlot.plot_allele_modification_line,
allele_modification_line_input,
)

crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'].append(plot_name)
crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'][plot_name] = plot_path
crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
modification_type,
amplicon_name,
)
crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'][plot_name] = 'Each line is a sample that indicates the percentage of {0} for the sample at that position relative to the amplicon. Guides are shown by a grey rectangle.'.format(modification_type.lower())
crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'][plot_name] = [
(
'CRISPRessoBatch Modification Frequency Summary',
os.path.basename(
modification_frequency_summary_filename,
),
),
]
#end if amp_found_count > 0 (how many folders had information for this amplicon)
#end per-amplicon analysis

crispresso2_info['results']['general_plots']['window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
crispresso2_info['results']['general_plots']['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
crispresso2_info['results']['general_plots']['window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
crispresso2_info['results']['general_plots']['nuc_conv_plot_names'] = nuc_conv_plot_names

# allele modification frequency heatmap and line plots
if not args.suppress_plots and not args.suppress_batch_summary_plots:
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'] = []
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'] = {}
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'] = {}

crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'] = []
crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'] = {}
crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'] = {}
if guides_all_same:
sgRNA_intervals = [consensus_sgRNA_intervals] * modification_frequency_summary_df.shape[0]
else:
sgRNA_intervals = [consensus_sgRNA_intervals]
for modification_type in ['Insertions', 'Deletions', 'Substitutions']:
modification_df = modification_frequency_summary_df[
modification_frequency_summary_df['Modification'] == modification_type
]
modification_df.index = [
'{0} ({1})'.format(batch, batch_index)
for batch_index, batch in enumerate(
modification_df['Batch'], 1,
)
]
modification_df = modification_df.drop(
['Modification', 'Batch'], axis=1,
)
modification_df.columns = [
'{0} ({1})'.format(column, position)
for position, column in
enumerate(modification_df.columns, 1)
]
plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_heatmap'.format(modification_type.lower(), amplicon_name)
plot_path = '{0}.html'.format(_jp(plot_name))

allele_modification_heatmap_input = {
'sample_values': modification_df,
'sample_sgRNA_intervals': sgRNA_intervals,
'plot_path': plot_path,
'title': modification_type,
}
plot(
CRISPRessoPlot.plot_allele_modification_heatmap,
allele_modification_heatmap_input,
)

crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'].append(plot_name)
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'][plot_name] = plot_path
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
modification_type,
amplicon_name,
)
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'][plot_name] = 'Each row is a sample and each column is a position in the amplicon sequence. Each cell shows the percentage of {0} for the sample at that position relative to the amplicon. Guides for each sample are identified by a black rectangle.'.format(modification_type.lower())
crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'][plot_name] = [
(
'CRISPRessoBatch Modification Frequency Summary',
os.path.basename(
modification_frequency_summary_filename,
),
),
]

plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_line'.format(modification_type.lower(), amplicon_name)
plot_path = '{0}.html'.format(_jp(plot_name))

allele_modification_line_input = {
'sample_values': modification_df,
'sample_sgRNA_intervals': sgRNA_intervals,
'plot_path': plot_path,
'title': modification_type,
}
plot(
CRISPRessoPlot.plot_allele_modification_line,
allele_modification_line_input,
)

crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'].append(plot_name)
crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'][plot_name] = plot_path
crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
modification_type,
amplicon_name,
)
crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'][plot_name] = 'Each line is a sample that indicates the percentage of {0} for the sample at that position relative to the amplicon. Guides are shown by a grey rectangle.'.format(modification_type.lower())
crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'][plot_name] = [
(
'CRISPRessoBatch Modification Frequency Summary',
os.path.basename(
modification_frequency_summary_filename,
),
),
]

# summarize amplicon modifications
with open(_jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'), 'w') as outfile:
Expand Down
9 changes: 8 additions & 1 deletion CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -1130,7 +1130,7 @@ def rreplace(s, old, new):
fastq_output = _jp('CRISPResso_output.fastq.gz')
crispresso2_info['fastq_output'] = fastq_output
info('Writing fastq output file: ' + fastq_output)
if args.bam_output:
if args.bam_output or args.bam_input:
if args.fastq_output:
raise CRISPRessoShared.BadParameterException('bam_output is not compatable with fastq_output! Please either use bam_output or fastq_output.')
bam_output = _jp('CRISPResso_output.bam')
Expand Down Expand Up @@ -1299,6 +1299,10 @@ def rreplace(s, old, new):
amplicon_min_alignment_score_arr.append(this_min_aln_score)

if args.expected_hdr_amplicon_seq != "":
if args.expected_hdr_amplicon_seq in amplicon_seq_arr:
raise CRISPRessoShared.BadParameterException(
"HDR expected amplicon sequence is the same as a reference amplicon. Check the --expected_hdr_amplicon_seq parameter")

amplicon_seq_arr.append(args.expected_hdr_amplicon_seq)
amplicon_name_arr.append('HDR')
amplicon_min_alignment_score_arr.append(args.default_min_aln_score)
Expand Down Expand Up @@ -1374,6 +1378,9 @@ def rreplace(s, old, new):
if wrong_nt:
this_name = amplicon_name_arr[idx]
raise CRISPRessoShared.NTException('Reference amplicon sequence %d (%s) contains invalid characters: %s'%(idx, this_name, ' '.join(wrong_nt)))
amplicon_seq_set = set(amplicon_seq_arr)
if len(amplicon_seq_set) != len(amplicon_seq_arr):
raise CRISPRessoShared.BadParameterException('Provided amplicon sequences must be unique!')



Expand Down
4 changes: 2 additions & 2 deletions CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from CRISPResso2 import CRISPResso2Align
from CRISPResso2 import CRISPRessoCOREResources

__version__ = "2.2.10"
__version__ = "2.2.11"


###EXCEPTIONS############################
Expand Down Expand Up @@ -879,7 +879,7 @@ def default_sigpipe():
if pipes[-1].poll() != 0:
raise AutoException('Cannot retrieve most frequent amplicon sequences. Got nonzero return code.')
seq_lines = top_unaligned.decode('utf-8').strip().split("\n")
if len(seq_lines) == 0:
if len(seq_lines) == 0 or seq_lines == ['']:
raise AutoException('Cannot parse any frequent amplicons sequences.')

if split_interleaved_input:
Expand Down
5 changes: 3 additions & 2 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ def extract_reads(row):
def extract_reads_chunk(df):
new_df = pd.DataFrame(columns=df.columns)
for i in range(len(df)):
new_df = new_df.append(extract_reads(df.iloc[i].copy()))
new_df.loc[i] = extract_reads(df.iloc[i].copy())
new_df.set_index(df.index,inplace=True)
return(new_df)


Expand Down Expand Up @@ -620,7 +621,7 @@ def set_filenames(row):
empty_line_els = [np.nan]*(header_el_count-1)
n_reads_index = header_els.index('Reads_total') - 1
for idx, row in df_regions.iterrows():
run_name = CRISPRessoShared.slugify(idx)
run_name = CRISPRessoShared.slugify(str(idx))
folder_name = 'CRISPResso_on_%s' % run_name

all_region_names.append(run_name)
Expand Down

0 comments on commit 6a2b342

Please sign in to comment.