Skip to content

Commit

Permalink
v2.0.31 CRISPRessoPooled chr names fix, allele plot colors
Browse files Browse the repository at this point in the history
CRISPRessoPooled compatability with chr names with underscores (alternate scaffolds)
Additional function to plot allele table with custom set of colors for a completed run
  • Loading branch information
kclem committed Sep 25, 2019
1 parent c35d015 commit 039cc8e
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 21 deletions.
102 changes: 92 additions & 10 deletions CRISPResso2/CRISPRessoPlot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
(c) 2018 The General Hospital Corporation. All Rights Reserved.
'''

import os
import numpy as np
import pandas as pd
import matplotlib
Expand All @@ -20,8 +21,7 @@
sns.set_context('poster')
sns.set(font_scale=2.2)



from CRISPResso2 import CRISPRessoShared

def setMatplotlibDefaults():
font = {'size' : 22}
Expand Down Expand Up @@ -1018,7 +1018,7 @@ def prep_alleles_table_compare(df_alleles,sample_name_1,sample_name_2,MAX_N_ROWS

return X,annot,y_labels,insertion_dict,per_element_annot_kws

def plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None):
def plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
Plots alleles in a heatmap (nucleotides color-coded for easy visualization)
input:
Expand All @@ -1031,6 +1031,7 @@ def plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insert
-per_element_annot_kws: annotations for each cell (e.g. bold for substitutions, etc.)
-SAVE_ALSO_PNG: whether to write png file as well
-base_editor_output: if true, won't draw 'predicted cleavage' line
-custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
"""
plot_nuc_len=len(reference_seq)

Expand All @@ -1042,6 +1043,18 @@ def plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insert
G_color=get_nuc_color('G',alpha)
INDEL_color = get_nuc_color('N',alpha)

if custom_colors is not None:
if 'A' in custom_colors:
A_color = custom_colors['A']
if 'T' in custom_colors:
T_color = custom_colors['T']
if 'C' in custom_colors:
C_color = custom_colors['C']
if 'G' in custom_colors:
G_color = custom_colors['G']
if 'N' in custom_colors:
INDEL_color = custom_colors['N']

dna_to_numbers={'-':0,'A':1,'T':2,'C':3,'G':4,'N':5}
seq_to_numbers= lambda seq: [dna_to_numbers[x] for x in seq]

Expand Down Expand Up @@ -1148,7 +1161,7 @@ def plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insert
plt.savefig(fig_filename_root+'.png',bbox_inches='tight',bbox_extra_artists=(lgd,))
plt.close()

def plot_alleles_heatmap_hist(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,count_values,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None):
def plot_alleles_heatmap_hist(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,count_values,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
Plots alleles in a heatmap (nucleotides color-coded for easy visualization)
input:
Expand All @@ -1161,6 +1174,7 @@ def plot_alleles_heatmap_hist(reference_seq,fig_filename_root,X,annot,y_labels,i
-per_element_annot_kws: annotations for each cell (e.g. bold for substitutions, etc.)
-SAVE_ALSO_PNG: whether to write png file as well
-base_editor_output: if true, won't draw 'predicted cleavage' line
-custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
"""
plot_nuc_len=len(reference_seq)

Expand All @@ -1172,6 +1186,18 @@ def plot_alleles_heatmap_hist(reference_seq,fig_filename_root,X,annot,y_labels,i
G_color=get_nuc_color('G',alpha)
INDEL_color = get_nuc_color('N',alpha)

if custom_colors is not None:
if 'A' in custom_colors:
A_color = custom_colors['A']
if 'T' in custom_colors:
T_color = custom_colors['T']
if 'C' in custom_colors:
C_color = custom_colors['C']
if 'G' in custom_colors:
G_color = custom_colors['G']
if 'N' in custom_colors:
INDEL_color = custom_colors['N']

dna_to_numbers={'-':0,'A':1,'T':2,'C':3,'G':4,'N':5}
seq_to_numbers= lambda seq: [dna_to_numbers[x] for x in seq]

Expand Down Expand Up @@ -1274,7 +1300,7 @@ def plot_alleles_heatmap_hist(reference_seq,fig_filename_root,X,annot,y_labels,i
plt.savefig(fig_filename_root+'.png',bbox_inches='tight',bbox_extra_artists=(lgd,),pad=1)
plt.close()

def plot_alleles_table(reference_seq,df_alleles,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None):
def plot_alleles_table(reference_seq,df_alleles,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
plots an allele table for a dataframe with allele frequencies
input:
Expand All @@ -1285,11 +1311,12 @@ def plot_alleles_table(reference_seq,df_alleles,fig_filename_root,MIN_FREQUENCY=
MAX_N_ROWS: max rows to plot
SAVE_ALSO_PNG: whether to write png file as well
sgRNA_intervals: locations where sgRNA is located
custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
"""
X,annot,y_labels,insertion_dict,per_element_annot_kws = prep_alleles_table(df_alleles,MAX_N_ROWS,MIN_FREQUENCY)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals,custom_colors)

def plot_alleles_table_from_file(alleles_file_name,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None):
def plot_alleles_table_from_file(alleles_file_name,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
plots an allele table for a dataframe with allele frequencies
infers the reference sequence by finding reference sequences without gaps (-)
Expand All @@ -1302,6 +1329,7 @@ def plot_alleles_table_from_file(alleles_file_name,fig_filename_root,MIN_FREQUEN
MAX_N_ROWS: max rows to plot
SAVE_ALSO_PNG: whether to write png file as well
sgRNA_intervals: locations where sgRNA is located
custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
"""

df_alleles = pd.read_table(alleles_file_name)
Expand All @@ -1314,9 +1342,62 @@ def plot_alleles_table_from_file(alleles_file_name,fig_filename_root,MIN_FREQUEN
raise Exception('Could not infer reference sequence from allele table')

X,annot,y_labels,insertion_dict,per_element_annot_kws = prep_alleles_table(df_alleles,MAX_N_ROWS,MIN_FREQUENCY)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals,custom_colors)

def plot_alleles_table_compare(reference_seq,df_alleles,sample_name_1,sample_name_2,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None):
def plot_alleles_tables_from_folder(crispresso_output_folder,fig_filename_root,MIN_FREQUENCY=None,MAX_N_ROWS=None,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
plots an allele table for each sgRNA/amplicon in a CRISPresso run (useful for plotting after running using the plot harness)
This function is only used for one-off plotting purposes and not for the general CRISPResso analysis
input:
crispresso2 output folder
fig_filename_root: figure filename to plot (not including '.pdf' or '.png')
MIN_FREQUENCY: sum of alleles % must add to this to be plotted
MAX_N_ROWS: max rows to plot
SAVE_ALSO_PNG: whether to write png file as well
sgRNA_intervals: locations where sgRNA is located
custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
example:
from CRISPResso2 import CRISPRessoPlot
CRISPRessoPlot.plot_alleles_tables_from_folder('CRISPResso_on_allele_specific','test_plots')
custom_colors = {'A':(1,0,0,0.8), 'T':(0,1,0,0.8), 'C':(0,0,1,0.8), 'G':(1,1,0,0.8), 'N':(0,1,1,0.8)}
CRISPRessoPlot.plot_alleles_tables_from_folder('CRISPResso_on_allele_specific','test_plots_colors',custom_colors=custom_colors)
"""
crispresso2_info = CRISPRessoShared.load_crispresso_info(crispresso_output_folder)

if MIN_FREQUENCY is None:
MIN_FREQUENCY = crispresso2_info['args'].min_frequency_alleles_around_cut_to_plot
if MAX_N_ROWS is None:
MAX_N_ROWS = crispresso2_info['args'].max_rows_alleles_around_cut_to_plot

plot_count = 0

ref_names = crispresso2_info['ref_names']
refs = crispresso2_info['refs']
for ref_name in ref_names:
sgRNA_sequences = refs[ref_name]['sgRNA_sequences']
sgRNA_cut_points = refs[ref_name]['sgRNA_cut_points']
sgRNA_intervals = refs[ref_name]['sgRNA_intervals']
sgRNA_plot_idxs = refs[ref_name]['sgRNA_plot_idxs']

reference_seq = refs[ref_name]['sequence']

for ind,sgRNA in enumerate(sgRNA_sequences):
alleles_filename = os.path.join(crispresso_output_folder,crispresso2_info['refs'][ref_name]['allele_frequency_files'][ind])
df_alleles = pd.read_table(alleles_filename)
df_alleles = df_alleles.reset_index().set_index('Aligned_Sequence')

cut_point = sgRNA_cut_points[ind]
plot_idxs = sgRNA_plot_idxs[ind]
plot_half_window = max(1,crispresso2_info['args'].plot_window_size)
ref_seq_around_cut=refs[ref_name]['sequence'][cut_point-plot_half_window+1:cut_point+plot_half_window+1]
X,annot,y_labels,insertion_dict,per_element_annot_kws = prep_alleles_table(df_alleles,MAX_N_ROWS,MIN_FREQUENCY)
plot_alleles_heatmap(ref_seq_around_cut,fig_filename_root+"_"+ref_name+"_"+sgRNA,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals,custom_colors)
plot_count += 1
print('Plotted ' + str(plot_count) + ' plots')

def plot_alleles_table_compare(reference_seq,df_alleles,sample_name_1,sample_name_2,fig_filename_root,MIN_FREQUENCY=0.5,MAX_N_ROWS=100,SAVE_ALSO_PNG=False,base_editor_output=False,sgRNA_intervals=None,custom_colors=None):
"""
plots an allele table for a dataframe with allele frequencies from two CRISPResso runs
input:
Expand All @@ -1327,9 +1408,10 @@ def plot_alleles_table_compare(reference_seq,df_alleles,sample_name_1,sample_nam
MIN_FREQUENCY: sum of alleles % must add to this to be plotted
MAX_N_ROWS: max rows to plot
SAVE_ALSO_PNG: whether to write png file as well
custom_colors: dict of colors to plot (e.g. colors['A'] = (1,0,0,0.4) # red,blue,green,alpha )
"""
X,annot,y_labels,insertion_dict,per_element_annot_kws = prep_alleles_table_compare(df_alleles,sample_name_1,sample_name_2,MAX_N_ROWS,MIN_FREQUENCY)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals)
plot_alleles_heatmap(reference_seq,fig_filename_root,X,annot,y_labels,insertion_dict,per_element_annot_kws,SAVE_ALSO_PNG,base_editor_output,sgRNA_intervals,custom_colors)

def plot_unmod_mod_pcts(fig_filename_root,df_summary_quantification,save_png,cutoff=None,max_samples_to_include_unprocessed=20):
"""
Expand Down
11 changes: 8 additions & 3 deletions CRISPResso2/CRISPRessoPooledCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def check_samtools():
return True
else:
sys.stdout.write('\nCRISPRessoPooled requires samtools')
sys.stdout.write('\n\nPlease install it and add to your path following the instructions at: http://www.htslib.org/download/')
sys.stdout.write('\n\nPlease install samtools and add it to your path following the instructions at: http://www.htslib.org/download/')
return False

def check_bowtie2():
Expand All @@ -97,7 +97,7 @@ def check_bowtie2():
return True
else:
sys.stdout.write('\nCRISPRessoPooled requires Bowtie2!')
sys.stdout.write('\n\nPlease install it and add to your path following the instructions at: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#obtaining-bowtie-2')
sys.stdout.write('\n\nPlease install Bowtie2 and add it to your path following the instructions at: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#obtaining-bowtie-2')
return False

#this is overkilling to run for many sequences,
Expand Down Expand Up @@ -727,7 +727,12 @@ def main():
info('Reporting problematic regions...')
coordinates=[]
for region in files_to_match:
coordinates.append(os.path.basename(region).replace('.fastq.gz','').replace('.fastq','').split('_')[1:4]+[region,get_n_reads_fastq(region)])
#region format: REGION_chr8_1077_1198.fastq.gz
#But if the chr has underscores, it could look like this:
# REGION_chr8_KI270812v1_alt_1077_1198.fastq.gz
region_info = os.path.basename(region).replace('.fastq.gz','').replace('.fastq','').split('_')
chr_string = "_".join(region_info[1:len(region_info)-2]) #in case there are underscores
coordinates.append([chr_string] + region_info[-2:]+[region,get_n_reads_fastq(region)])

df_regions=pd.DataFrame(coordinates,columns=['chr_id','bpstart','bpend','fastq_file','n_reads'])

Expand Down
16 changes: 15 additions & 1 deletion CRISPResso2/CRISPRessoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
else:
import cPickle as cp #python 2.7

__version__ = "2.0.30"
__version__ = "2.0.31"

###EXCEPTIONS############################
class FlashException(Exception):
Expand Down Expand Up @@ -355,6 +355,20 @@ def check_output_folder(output_folder):
else:
raise OutputFolderIncompleteException("The folder %s is not a valid CRISPResso2 output folder. Cannot find quantification file '%s'." %(output_folder,quantification_file))

def load_crispresso_info(crispresso_output_folder=""):
"""
loads the pickle associated with a crispresso run
input:
crispresso_output_folder: finished CRISPResso2 output folder
"""
crispresso_info_file = os.path.join(crispresso_output_folder,'CRISPResso2_info.pickle')
if os.path.isfile(crispresso_info_file) is False:
raise Exception('Cannot open CRISPResso info file at ' + crispresso_info_file)
crispresso2_info = cp.load(open(crispresso_info_file,'rb'))
return crispresso2_info


def get_most_frequent_reads(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap,debug=False):
"""
Gets the most frequent amplicon from a fastq file (or after merging a r1 and r2 fastq file)
Expand Down
10 changes: 3 additions & 7 deletions CRISPResso2/CRISPRessoWGSCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def check_samtools():
return True
else:
sys.stdout.write('\nCRISPRessoWGS requires samtools')
sys.stdout.write('\n\nPlease install it and add to your path following the instruction at: http://www.htslib.org/download/')
sys.stdout.write('\n\nPlease install samtools and add it to your path following the instructions at: http://www.htslib.org/download/')
return False


Expand All @@ -109,7 +109,7 @@ def check_bowtie2():
return True
else:
sys.stdout.write('\nCRISPRessoWGS requires Bowtie2!')
sys.stdout.write('\n\nPlease install it and add to your path following the instruction at: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#obtaining-bowtie-2')
sys.stdout.write('\n\nPlease install Bowtie2 and add it to your path following the instructions at: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml#obtaining-bowtie-2')
return False

#if a reference index is provided aligne the reads to it
Expand Down Expand Up @@ -199,10 +199,6 @@ def write_trimmed_fastq(in_bam_filename,bpstart,bpend,out_fastq_filename):
outfile.write('@%s_%d\n%s\n+\n%s\n' %(name,n_reads,seq[st:en],qual[st:en]))
return n_reads





pd=check_library('pandas')
np=check_library('numpy')

Expand Down Expand Up @@ -250,7 +246,7 @@ def print_stacktrace_if_debug():
parser.add_argument('--min_reads_to_use_region', type=float, help='Minimum number of reads that align to a region to perform the CRISPResso analysis', default=10)
parser.add_argument('--skip_failed', help='Continue with pooled analysis even if one sample fails',action='store_true')
parser.add_argument('--gene_annotations', type=str, help='Gene Annotation Table from UCSC Genome Browser Tables (http://genome.ucsc.edu/cgi-bin/hgTables?command=start), \
please select as table "knowGene", as output format "all fields from selected table" and as file returned "gzip compressed"', default='')
please select as table "knownGene", as output format "all fields from selected table" and as file returned "gzip compressed"', default='')
parser.add_argument('-p','--n_processes',type=int, help='Specify the number of processes to use for the quantification.\
Please use with caution since increasing this parameter will increase the memory required to run CRISPResso.',default=1)
parser.add_argument('--crispresso_command', help='CRISPResso command to call',default='CRISPResso')
Expand Down

0 comments on commit 039cc8e

Please sign in to comment.