Batch amplicon plots (#251)

* Error out if HDR amplicon matches existing amplicon * Add check for amplicon sequence uniqueness * Fix bug with bam_input not having bam_output * Test for no returned lines in auto mode, version bump to 2.2.11 * Fix pandas deprecation of df.append
pinellolab · Oct 9, 2022 · 6a2b342 · 6a2b342
1 parent 726b2b9
commit 6a2b342
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 99 deletions.
diff --git a/CRISPResso2/CRISPRessoBatchCORE.py b/CRISPResso2/CRISPRessoBatchCORE.py
@@ -352,6 +352,17 @@ def main():
         crispresso2_info['results']['general_plots']['summary_plot_titles'] = {}
         crispresso2_info['results']['general_plots']['summary_plot_labels'] = {}
         crispresso2_info['results']['general_plots']['summary_plot_datas'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'] = []
+        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'] = {}
+
+        crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'] = []
+        crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'] = {}
+        crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'] = {}
 
         # report for amplicons
         for amplicon_index, amplicon_seq in enumerate(all_amplicons):
@@ -684,105 +695,97 @@ def main():
                             crispresso2_info['results']['general_plots']['summary_plot_labels'][plot_name] = args.conversion_nuc_from + '->' + args.conversion_nuc_to +' conversion rates for the amplicon ' + amplicon_name
                             crispresso2_info['results']['general_plots']['summary_plot_datas'][plot_name] = [('Nucleotide frequencies', os.path.basename(nucleotide_frequency_summary_filename)), ('Modification frequencies', os.path.basename(modification_frequency_summary_filename))]
 
+                # allele modification frequency heatmap and line plots
+                if not args.suppress_plots and not args.suppress_batch_summary_plots:
+                    if guides_all_same:
+                        sgRNA_intervals = [consensus_sgRNA_intervals] * modification_frequency_summary_df.shape[0]
+                    else:
+                        sgRNA_intervals = [consensus_sgRNA_intervals]
+                    for modification_type in ['Insertions', 'Deletions', 'Substitutions']:
+                        modification_df = modification_frequency_summary_df[
+                            modification_frequency_summary_df['Modification'] == modification_type
+                        ]
+                        modification_df.index = [
+                            '{0} ({1})'.format(batch, batch_index)
+                            for batch_index, batch in enumerate(
+                                modification_df['Batch'], 1,
+                            )
+                        ]
+                        modification_df = modification_df.drop(
+                            ['Modification', 'Batch'], axis=1,
+                        )
+                        modification_df.columns = [
+                            '{0} ({1})'.format(column, position)
+                            for position, column in
+                            enumerate(modification_df.columns, 1)
+                        ]
+                        plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_heatmap'.format(modification_type.lower(), amplicon_name)
+                        plot_path = '{0}.html'.format(_jp(plot_name))
+
+                        allele_modification_heatmap_input = {
+                            'sample_values': modification_df,
+                            'sample_sgRNA_intervals': sgRNA_intervals,
+                            'plot_path': plot_path,
+                            'title': modification_type,
+                        }
+                        plot(
+                            CRISPRessoPlot.plot_allele_modification_heatmap,
+                            allele_modification_heatmap_input,
+                        )
+
+                        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'].append(plot_name)
+                        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'][plot_name] = plot_path
+                        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
+                            modification_type,
+                            amplicon_name,
+                        )
+                        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'][plot_name] = 'Each row is a sample and each column is a position in the amplicon sequence. Each cell shows the percentage of {0} for the sample at that position relative to the amplicon. Guides for each sample are identified by a black rectangle.'.format(modification_type.lower())
+                        crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'][plot_name] = [
+                            (
+                                'CRISPRessoBatch Modification Frequency Summary',
+                                os.path.basename(
+                                    modification_frequency_summary_filename,
+                                ),
+                            ),
+                        ]
+
+                        plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_line'.format(modification_type.lower(), amplicon_name)
+                        plot_path = '{0}.html'.format(_jp(plot_name))
+
+                        allele_modification_line_input = {
+                            'sample_values': modification_df,
+                            'sample_sgRNA_intervals': sgRNA_intervals,
+                            'plot_path': plot_path,
+                            'title': modification_type,
+                        }
+                        plot(
+                            CRISPRessoPlot.plot_allele_modification_line,
+                            allele_modification_line_input,
+                        )
+
+                        crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'].append(plot_name)
+                        crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'][plot_name] = plot_path
+                        crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
+                            modification_type,
+                            amplicon_name,
+                        )
+                        crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'][plot_name] = 'Each line is a sample that indicates the percentage of {0} for the sample at that position relative to the amplicon. Guides are shown by a grey rectangle.'.format(modification_type.lower())
+                        crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'][plot_name] = [
+                            (
+                                'CRISPRessoBatch Modification Frequency Summary',
+                                os.path.basename(
+                                    modification_frequency_summary_filename,
+                                ),
+                            ),
+                        ]
+            #end if amp_found_count > 0 (how many folders had information for this amplicon)
+        #end per-amplicon analysis
+
         crispresso2_info['results']['general_plots']['window_nuc_pct_quilt_plot_names'] = window_nuc_pct_quilt_plot_names
         crispresso2_info['results']['general_plots']['nuc_pct_quilt_plot_names'] = nuc_pct_quilt_plot_names
         crispresso2_info['results']['general_plots']['window_nuc_conv_plot_names'] = window_nuc_conv_plot_names
         crispresso2_info['results']['general_plots']['nuc_conv_plot_names'] = nuc_conv_plot_names
 
-        # allele modification frequency heatmap and line plots
-        if not args.suppress_plots and not args.suppress_batch_summary_plots:
-            crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'] = []
-            crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'] = {}
-
-            crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'] = []
-            crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'] = {}
-            crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'] = {}
-            if guides_all_same:
-                sgRNA_intervals = [consensus_sgRNA_intervals] * modification_frequency_summary_df.shape[0]
-            else:
-                sgRNA_intervals = [consensus_sgRNA_intervals]
-            for modification_type in ['Insertions', 'Deletions', 'Substitutions']:
-                modification_df = modification_frequency_summary_df[
-                    modification_frequency_summary_df['Modification'] == modification_type
-                ]
-                modification_df.index = [
-                    '{0} ({1})'.format(batch, batch_index)
-                    for batch_index, batch in enumerate(
-                        modification_df['Batch'], 1,
-                    )
-                ]
-                modification_df = modification_df.drop(
-                    ['Modification', 'Batch'], axis=1,
-                )
-                modification_df.columns = [
-                    '{0} ({1})'.format(column, position)
-                    for position, column in
-                    enumerate(modification_df.columns, 1)
-                ]
-                plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_heatmap'.format(modification_type.lower(), amplicon_name)
-                plot_path = '{0}.html'.format(_jp(plot_name))
-
-                allele_modification_heatmap_input = {
-                    'sample_values': modification_df,
-                    'sample_sgRNA_intervals': sgRNA_intervals,
-                    'plot_path': plot_path,
-                    'title': modification_type,
-                }
-                plot(
-                    CRISPRessoPlot.plot_allele_modification_heatmap,
-                    allele_modification_heatmap_input,
-                )
-
-                crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_names'].append(plot_name)
-                crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_paths'][plot_name] = plot_path
-                crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
-                    modification_type,
-                    amplicon_name,
-                )
-                crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_labels'][plot_name] = 'Each row is a sample and each column is a position in the amplicon sequence. Each cell shows the percentage of {0} for the sample at that position relative to the amplicon. Guides for each sample are identified by a black rectangle.'.format(modification_type.lower())
-                crispresso2_info['results']['general_plots']['allele_modification_heatmap_plot_datas'][plot_name] = [
-                    (
-                        'CRISPRessoBatch Modification Frequency Summary',
-                        os.path.basename(
-                            modification_frequency_summary_filename,
-                        ),
-                    ),
-                ]
-
-                plot_name = 'CRISPRessoBatch_percentage_of_{0}_across_alleles_{1}_line'.format(modification_type.lower(), amplicon_name)
-                plot_path = '{0}.html'.format(_jp(plot_name))
-
-                allele_modification_line_input = {
-                    'sample_values': modification_df,
-                    'sample_sgRNA_intervals': sgRNA_intervals,
-                    'plot_path': plot_path,
-                    'title': modification_type,
-                }
-                plot(
-                    CRISPRessoPlot.plot_allele_modification_line,
-                    allele_modification_line_input,
-                )
-
-                crispresso2_info['results']['general_plots']['allele_modification_line_plot_names'].append(plot_name)
-                crispresso2_info['results']['general_plots']['allele_modification_line_plot_paths'][plot_name] = plot_path
-                crispresso2_info['results']['general_plots']['allele_modification_line_plot_titles'][plot_name] = 'CRISPRessoBatch {0} Across Samples for {1}'.format(
-                    modification_type,
-                    amplicon_name,
-                )
-                crispresso2_info['results']['general_plots']['allele_modification_line_plot_labels'][plot_name] = 'Each line is a sample that indicates the percentage of {0} for the sample at that position relative to the amplicon. Guides are shown by a grey rectangle.'.format(modification_type.lower())
-                crispresso2_info['results']['general_plots']['allele_modification_line_plot_datas'][plot_name] = [
-                    (
-                        'CRISPRessoBatch Modification Frequency Summary',
-                        os.path.basename(
-                            modification_frequency_summary_filename,
-                        ),
-                    ),
-                ]
 
         # summarize amplicon modifications
         with open(_jp('CRISPRessoBatch_quantification_of_editing_frequency.txt'), 'w') as outfile:

diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -1130,7 +1130,7 @@ def rreplace(s, old, new):
             fastq_output = _jp('CRISPResso_output.fastq.gz')
             crispresso2_info['fastq_output'] = fastq_output
             info('Writing fastq output file: ' + fastq_output)
-        if args.bam_output:
+        if args.bam_output or args.bam_input:
             if args.fastq_output:
                 raise CRISPRessoShared.BadParameterException('bam_output is not compatable with fastq_output! Please either use bam_output or fastq_output.')
             bam_output = _jp('CRISPResso_output.bam')
@@ -1299,6 +1299,10 @@ def rreplace(s, old, new):
                     amplicon_min_alignment_score_arr.append(this_min_aln_score)
 
         if args.expected_hdr_amplicon_seq != "":
+            if args.expected_hdr_amplicon_seq in amplicon_seq_arr:
+                raise CRISPRessoShared.BadParameterException(
+                    "HDR expected amplicon sequence is the same as a reference amplicon. Check the --expected_hdr_amplicon_seq parameter")
+
             amplicon_seq_arr.append(args.expected_hdr_amplicon_seq)
             amplicon_name_arr.append('HDR')
             amplicon_min_alignment_score_arr.append(args.default_min_aln_score)
@@ -1374,6 +1378,9 @@ def rreplace(s, old, new):
             if wrong_nt:
                 this_name = amplicon_name_arr[idx]
                 raise CRISPRessoShared.NTException('Reference amplicon sequence %d (%s) contains invalid characters: %s'%(idx, this_name, ' '.join(wrong_nt)))
+        amplicon_seq_set = set(amplicon_seq_arr)
+        if len(amplicon_seq_set) != len(amplicon_seq_arr):
+            raise CRISPRessoShared.BadParameterException('Provided amplicon sequences must be unique!')
 
 
 

diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
@@ -22,7 +22,7 @@
 from CRISPResso2 import CRISPResso2Align
 from CRISPResso2 import CRISPRessoCOREResources
 
-__version__ = "2.2.10"
+__version__ = "2.2.11"
 
 
 ###EXCEPTIONS############################
@@ -879,7 +879,7 @@ def default_sigpipe():
     if pipes[-1].poll() != 0:
         raise AutoException('Cannot retrieve most frequent amplicon sequences. Got nonzero return code.')
     seq_lines = top_unaligned.decode('utf-8').strip().split("\n")
-    if len(seq_lines) == 0:
+    if len(seq_lines) == 0 or seq_lines == ['']:
         raise AutoException('Cannot parse any frequent amplicons sequences.')
 
     if split_interleaved_input:

diff --git a/CRISPResso2/CRISPRessoWGSCORE.py b/CRISPResso2/CRISPRessoWGSCORE.py
@@ -245,7 +245,8 @@ def extract_reads(row):
 def extract_reads_chunk(df):
     new_df = pd.DataFrame(columns=df.columns)
     for i in range(len(df)):
-        new_df = new_df.append(extract_reads(df.iloc[i].copy()))
+        new_df.loc[i] = extract_reads(df.iloc[i].copy())
+    new_df.set_index(df.index,inplace=True)
     return(new_df)
 
 
@@ -620,7 +621,7 @@ def set_filenames(row):
         empty_line_els = [np.nan]*(header_el_count-1)
         n_reads_index = header_els.index('Reads_total') - 1
         for idx, row in df_regions.iterrows():
-            run_name = CRISPRessoShared.slugify(idx)
+            run_name = CRISPRessoShared.slugify(str(idx))
             folder_name = 'CRISPResso_on_%s' % run_name
 
             all_region_names.append(run_name)