v2.0.25 Add inferring of guides

pinellolab · Feb 20, 2019 · eca34ae · eca34ae
1 parent 2ccec08
commit eca34ae
Show file tree

Hide file tree

Showing 2 changed files with 128 additions and 3 deletions.
diff --git a/CRISPResso2/CRISPRessoCORE.py b/CRISPResso2/CRISPRessoCORE.py
@@ -517,12 +517,30 @@ def print_stacktrace_if_debug():
 
             amplicon_seq_arr = CRISPRessoShared.guess_amplicons(args.fastq_r1,args.fastq_r2,number_of_reads_to_consider,args.flash_command,args.max_paired_end_reads_overlap,args.min_paired_end_reads_overlap,aln_matrix,args.needleman_wunsch_gap_open,args.needleman_wunsch_gap_extend)
             amplicon_name_arr = ['Amplicon'+str(x) for x in ['',range(1,len(amplicon_seq_arr))]]
+            if len(guides) == 0:
+                for amplicon_seq in amplicon_seq_arr:
+                    (potential_guide,is_base_editor) = CRISPRessoShared.guess_guides(amplicon_seq,args.fastq_r1,args.fastq_r2,number_of_reads_to_consider,args.flash_command,
+                        args.max_paired_end_reads_overlap,args.min_paired_end_reads_overlap,aln_matrix,args.needleman_wunsch_gap_open,args.needleman_wunsch_gap_extend)
+                    if potential_guide is not None and potential_guide not in guides:
+                        guides.append(potential_guide)
 
             amplicon_min_alignment_score_arr = []
             plural_string = ""
             if len(amplicon_seq_arr) > 1:
                 plural_string = "s"
             info("Auto-detected %d reference amplicon%s"%(len(amplicon_seq_arr),plural_string))
+
+            if args.debug:
+                for idx,seq in enumerate(amplicon_seq_arr):
+                    print('Detected amplicon ' + str(idx) + ":" + str(seq))
+
+            if len(guides) > 1:
+                plural_string = "s"
+            info("Auto-detected %d guide%s"%(len(guides),plural_string))
+            if args.debug:
+                for idx,seq in enumerate(guides):
+                    print('Detected guide ' + str(idx) + ":" + str(seq))
+
         else: #not auto
             amplicon_seq_arr = args.amplicon_seq.split(",")
             amplicon_name_arr = args.amplicon_name.split(",")
@@ -964,7 +982,7 @@ def print_stacktrace_if_debug():
                 max_overlap = args.max_paired_end_reads_overlap
             if args.min_paired_end_reads_overlap:
                 min_overlap = args.min_paired_end_reads_overlap
-            cmd='%s %s %s --min-overlap %d --max-overlap %d --allow_outies -z -d %s >>%s 2>&1' %\
+            cmd='%s %s %s --min-overlap %d --max-overlap %d --allow-outies -z -d %s >>%s 2>&1' %\
             (args.flash_command,
                  output_forward_paired_filename,
                  output_reverse_paired_filename,

diff --git a/CRISPResso2/CRISPRessoShared.py b/CRISPResso2/CRISPRessoShared.py
@@ -18,6 +18,7 @@
 import unicodedata
 
 from CRISPResso2 import CRISPResso2Align
+from CRISPResso2 import CRISPRessoCOREResources
 
 running_python3 = False
 if sys.version_info > (3, 0):
@@ -28,7 +29,7 @@
 else:
     import cPickle as cp #python 2.7
 
-__version__ = "2.0.24"
+__version__ = "2.0.25"
 
 ###EXCEPTIONS############################
 class FlashException(Exception):
@@ -98,7 +99,7 @@ def getCRISPRessoArgParser(parserTitle = "CRISPResso Parameters",requiredParams=
     parser.add_argument('--split_paired_end',help='Splits a single fastq file containing paired end reads in two files before running CRISPResso',action='store_true')
     parser.add_argument('--trim_sequences',help='Enable the trimming of Illumina adapters with Trimmomatic',action='store_true')
     parser.add_argument('--trimmomatic_command', type=str, help='Command to run trimmomatic',default='trimmomatic')
-    parser.add_argument('--trimmomatic_options_string', type=str, help='Override options for Trimmomatic',default='')
+    parser.add_argument('--trimmomatic_options_string', type=str, help='Override options for Trimmomatic, e.g. "ILLUMINACLIP:/data/NexteraPE-PE.fa:0:90:10:0:true"',default='')
     parser.add_argument('--flash_command', type=str, help='Command to run flash',default='flash')
     parser.add_argument('--min_paired_end_reads_overlap',  type=int, help='Parameter for the FLASH read merging step. Minimum required overlap length between two reads to provide a confident overlap. ', default=None)
     parser.add_argument('--max_paired_end_reads_overlap',  type=int, help='Parameter for the FLASH merging step.  Maximum overlap length expected in approximately 90%% of read pairs. Please see the FLASH manual for more information.', default=None)
@@ -353,6 +354,7 @@ def check_output_folder(output_folder):
 def get_most_frequent_reads(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap):
     """
     Gets the most frequent amplicon from a fastq file (or after merging a r1 and r2 fastq file)
+    Note: only works on paired end or single end reads (not interleaved)
     input:
     fastq_r1: path to fastq r1 (can be gzipped)
     fastq_r2: path to fastq r2 (can be gzipped)
@@ -451,6 +453,111 @@ def guess_amplicons(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,
 
     return amplicon_seq_arr
 
+def guess_guides(amplicon_sequence,fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,
+            min_paired_end_reads_overlap,aln_matrix,needleman_wunsch_gap_open,needleman_wunsch_gap_extend,min_edit_freq_to_consider=0.1,pam_seq="NGG",min_pct_subs_in_base_editor_win=0.8):
+    """
+    guesses the guides used in an experiment by identifying the most-frequently edited positions, editing types, and PAM sites
+    input:
+    ampilcon_sequence - amplicon to analyze
+    fastq_r1: path to fastq r1 (can be gzipped)
+    fastq_r2: path to fastq r2 (can be gzipped)
+    number_of_reads_to_consider: number of reads from the top of the file to examine
+    flash_command: command to call flash
+    min_paired_end_reads_overlap: min overlap in bp for flashing (merging) r1 and r2
+    max_paired_end_reads_overlap: max overlap in bp for flashing (merging) r1 and r2
+    needleman_wunsch_gap_open: alignment penalty assignment used to determine similarity of two sequences
+    needleman_wunsch_gap_extend: alignment penalty assignment used to determine similarity of two sequences
+    min_edit_freq_to_consider: edits must be at least this frequency for consideration
+    pam_seq: pam sequence to look for (can be regex or contain degenerate bases)
+    min_pct_subs_in_base_editor_win: if at least this percent of substitutions happen in the predicted base editor window, return base editor flag
+
+    returns:
+    tuple of (putative guide, boolean is_base_editor)
+    or (None, None)
+    """
+    seq_lines = get_most_frequent_reads(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap)
+
+    amp_len = len(amplicon_sequence)
+    gap_incentive = np.zeros(amp_len+1,dtype=np.int)
+    include_idxs = set(range(0,amp_len))
+
+    all_indel_count_vector = np.zeros(amp_len)
+    all_sub_count_vector = np.zeros(amp_len)
+    tot_count = 0;
+    for i in range(len(seq_lines)):
+        count,seq = seq_lines[i].strip().split()
+        count = int(count)
+        tot_count += count
+        fws1,fws2,fwscore=CRISPResso2Align.global_align(seq, amplicon_sequence,matrix=aln_matrix,gap_incentive=gap_incentive,
+            gap_open=needleman_wunsch_gap_open,gap_extend=needleman_wunsch_gap_extend,)
+        payload=CRISPRessoCOREResources.find_indels_substitutions(fws1,fws2,include_idxs)
+        all_indel_count_vector[payload['all_insertion_positions']]+=count
+        all_indel_count_vector[payload['all_deletion_positions']]+=count
+        all_sub_count_vector[payload['all_substitution_positions']]+=count
+
+    max_loc = np.argmax(all_indel_count_vector)
+    max_val = all_indel_count_vector[max_loc]
+
+    #return nothing if the max edit doesn't break threshold
+    if max_val/float(tot_count) < min_edit_freq_to_consider:
+        return (None,None)
+
+
+    pam_regex_string = pam_seq.upper()
+    pam_regex_string = pam_regex_string.replace('I','[ATCG]')
+    pam_regex_string = pam_regex_string.replace('N','[ATCG]')
+    pam_regex_string = pam_regex_string.replace('R','[AG]')
+    pam_regex_string = pam_regex_string.replace('Y','[CT]')
+    pam_regex_string = pam_regex_string.replace('S','[GC]')
+    pam_regex_string = pam_regex_string.replace('W','[AT]')
+    pam_regex_string = pam_regex_string.replace('K','[GT]')
+    pam_regex_string = pam_regex_string.replace('M','[AC]')
+    pam_regex_string = pam_regex_string.replace('B','[CGT]')
+    pam_regex_string = pam_regex_string.replace('D','[AGT]')
+    pam_regex_string = pam_regex_string.replace('H','[ACT]')
+    pam_regex_string = pam_regex_string.replace('V','[ACG]')
+
+    is_base_editor = False
+    #offset from expected position
+    for offset in (0,+1,-1,+2,+3,+4,-2):
+        #forward direction
+        #find pam near max edit loc
+        pam_start = max_loc+4 + offset
+        pam_end = max_loc+7 + offset
+        guide_start = max_loc-16 + offset
+        guide_end = max_loc+4 + offset
+        base_edit_start = max_loc-16 + offset
+        base_edit_end = max_loc-6 + offset
+        if pam_start > 0 and guide_end < amp_len:
+            if re.match(pam_regex_string, amplicon_sequence[pam_start:pam_end]):
+                guide_seq = amplicon_sequence[guide_start:guide_end]
+                sum_base_edits = sum(all_sub_count_vector[base_edit_start:base_edit_end])
+                #if a lot of edits are in the predicted base editor window, set base editor true
+                #specifically, if at least min_pct_subs_in_base_editor_win % of substitutions happen in the predicted base editor window
+                if sum_base_edits > min_pct_subs_in_base_editor_win * sum(all_sub_count_vector):
+                    is_base_editor = True
+                return(guide_seq,is_base_editor)
+
+        #reverse direction
+        pam_start = max_loc-5 - offset
+        pam_end = max_loc-2 - offset
+        guide_start = max_loc-2 - offset
+        guide_end = max_loc+18 - offset
+        base_edit_start = max_loc+8 - offset
+        base_edit_end = max_loc+18 - offset
+        if pam_start > 0 and guide_end < amp_len:
+            if re.match(pam_regex_string, amplicon_sequence[pam_start:pam_end]):
+                guide_seq = amplicon_sequence[guide_start:guide_end]
+                sum_base_edits = sum(all_sub_count_vector[base_edit_start:base_edit_end])
+                #if a lot of edits are in the predicted base editor window, set base editor true
+                #specifically, if at least min_pct_subs_in_base_editor_win % of substitutions happen in the predicted base editor window
+                if sum_base_edits > min_pct_subs_in_base_editor_win * sum(all_sub_count_vector):
+                    is_base_editor = True
+                return(guide_seq,is_base_editor)
+
+    return (None,None)
+
+
 ######
 # allele modification functions
 ######