From dfa6e408658a08350352ad371454d6f7b1f8882e Mon Sep 17 00:00:00 2001
From: Eric Bryant <EricEdwardBryant@gmail.com>
Date: Wed, 5 Jul 2017 17:33:58 -0400
Subject: [PATCH] Add ability to identify cut sites gained after edit

---
 R/add-RFLP.R                            | 94 +++++++++++++++++++------
 README.Rmd                              |  7 +-
 README.md                               |  7 +-
 man/add_RFLP.Rd                         |  7 +-
 tests/testthat/test-add_RFLP.R          | 40 ++++++++++-
 tests/testthat/test-search-off-target.R | 11 ++-
 6 files changed, 134 insertions(+), 32 deletions(-)

diff --git a/R/add-RFLP.R b/R/add-RFLP.R
index 4608e48..bcb635f 100644
--- a/R/add-RFLP.R
+++ b/R/add-RFLP.R
@@ -2,6 +2,9 @@
 #' Add RFLP enzymes to iSTOP data
 #'
 #' @param iSTOP A dataframe resulting from [locate_PAM].
+#' @param recognizes The base recognized by the enzyme. One of "c" or "t".
+#' In terms of base editing, "t" will return enzymes that only cut after editing,
+#' whereas "c" will return enzymes that only cut before editing.
 #' @param width An integer specifiying range of unique cutting.
 #' @param enzymes A dataframe of enzymes considered. Defaults to NULL which will
 #' use an enzyme dataset included in the iSTOP package.
@@ -17,11 +20,14 @@
 #' @importFrom purrr pmap map_int map_chr map2 map
 #' @md
 
-add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) {
+add_RFLP <- function(iSTOP, recognizes = 'c', width = 150, enzymes = NULL, cores = 1) {
+
+  assertthat::assert_that(assertthat::is.number(width), assertthat::is.number(cores), assertthat::is.string(recognizes))
+  assertthat::assert_that(recognizes %in% c('c', 't'), msg = '`recognizes` argument for add_RFLP() must be one of "c", or "t".')
 
   message('Adding RFLP...')
 
-  enzyme_combinations <- process_enzymes(enzymes)
+  enzyme_combinations <- process_enzymes(enzymes, recognizes)
 
   if (cores <= 1L) cores <- NULL
 
@@ -30,19 +36,28 @@ add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) {
   on.exit(pbapply::pboptions(pbo), add = TRUE)
 
   iSTOP %>%
-    split(.$chr) %>%
+    split(.$chr) %>%   # split on chromosome for parallelization
     pbapply::pblapply(function(df) {
-      center <- ceiling(str_length(df$searched) / 2)
-      search <- str_sub(df$searched, start = center - width, end = center + width)
-      df[[paste0('RFLP_', width)]] <- identify_RFLP_enzymes(search, enzyme_combinations)
+
+      center <- ceiling(stringr::str_length(df$searched) / 2)
+      search <- stringr::str_sub(df$searched, start = center - width, end = center + width)
+
+      if (recognizes == 't') {
+        search <- stringr::str_replace(search, 'c', 't')
+        column_name <- paste0('RFLP_T_', width)
+      } else {
+        column_name <- paste0('RFLP_C_', width)
+      }
+
+      df[[column_name]] <- identify_RFLP_enzymes(search, enzyme_combinations)
       return(df)
     }, cl = cores) %>%
-    bind_rows
+    bind_rows()
 }
 
 # Converts all Cs to lowercase one at a time to generate all possible match patterns
 # Supports ambiguities by converting compatible patterns to 'c' and excluding incombatible ambiguous matches
-process_enzymes <- function(enzymes = NULL) {
+process_enzymes <- function(enzymes = NULL, recognizes) {
 
   if (is.null(enzymes)) {
     enzymes <-
@@ -51,6 +66,17 @@ process_enzymes <- function(enzymes = NULL) {
       filter(!exclude)
   }
 
+  switch(
+    recognizes,
+    c = process_enzymes_c(enzymes),
+    t = process_enzymes_t(enzymes)
+  )
+
+}
+
+# ---- Process enzymes for recognizing a particular base ----
+
+process_enzymes_c <- function(enzymes) {
   enzymes %>%
     select(enzyme, forward, reverse) %>%
     # Be agnostic to strand orientation (i.e. keep both)
@@ -60,46 +86,72 @@ process_enzymes <- function(enzymes = NULL) {
     tidyr::unnest() %>%
     # If there is an ambiguous pattern for 'c' that does not include 'T' then change
     # pattern to only match 'c' (since 'T' is what 'C' will be changed to)
-    mutate(pattern = str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>%
+    mutate(pattern = stringr::str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>%
     # If 'c' is contained within an ambiguity that includes 'T' then remove pattern
     # (since we do not want to match after 'C' is chaned to 'T')
-    filter(!str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>%
+    filter(!stringr::str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>%
     select(enzyme, pattern) %>%
     distinct %>%
     # Replace standard forward and reverse patterns
     left_join(enzymes, by = 'enzyme') %>%
     # Ensure that forward and reverse patterns will match even when overlapping
     mutate(
-      forward = str_c('(?=', str_to_upper(forward), ')'),
-      reverse = str_c('(?=', str_to_upper(reverse), ')'),
-      fwd_rev = if_else(forward == reverse, forward, str_c(forward, '|', reverse))
+      forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'),
+      reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'),
+      fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse))
     )
 }
 
+process_enzymes_t <- function(enzymes) {
+  enzymes %>%
+    select(enzyme, forward, reverse) %>%
+    # Be agnostic to strand orientation (i.e. keep both)
+    tidyr::gather(orientation, pattern, forward, reverse) %>%
+    # Change each occurence of 'T' to 't' one at a time (since this is the intended base change)
+    mutate(pattern = str_to_lower_each(pattern, 'T')) %>%
+    tidyr::unnest() %>%
+    # If there is an ambiguous pattern for 't' that does not include 'C' then change pattern to only match 't'
+    mutate(pattern = stringr::str_replace(pattern, '\\[At\\]|\\[Gt\\]|\\[AGt\\]', 't')) %>%
+    # If 't' is contained within an ambiguity that includes 'C' then remove pattern
+    filter(!stringr::str_detect(pattern, '\\[Ct\\]|\\[ACt\\]|\\[CGt\\]')) %>%
+    select(enzyme, pattern) %>%
+    distinct %>%
+    # Replace standard forward and reverse patterns
+    left_join(enzymes, by = 'enzyme') %>%
+    # Ensure that forward and reverse patterns will match even when overlapping
+    mutate(
+      forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'),
+      reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'),
+      fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse))
+    )
+}
+
+# ---- Utilities: add_RFLP ----
+
 identify_RFLP_enzymes <- function(seqs, enzymes) {
   # Get all unique patterns
-  basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct
+  basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct()
   purrr::map_chr(seqs, match_one_seq, basic, enzymes)
 }
 
 
 match_one_seq <- function(seq, basic, enzymes) {
   # The enzymes that match the base edit (a small number and faster pattern matching)
-  match_base_edit <- enzymes$enzyme[which(str_detect(seq, enzymes$pattern))]
+  match_base_edit <- enzymes$enzyme[which(stringr::str_detect(seq, enzymes$pattern))]
 
   # Get outta here if nothing matched the intended base edit
   if (length(match_base_edit) == 0) return(NA_character_)
 
   # Otherwise check how many times the enzyme cuts the sequence
   search <- basic[basic$enzyme %in% match_base_edit, ]
-  nmatch <- str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow)
+  nmatch <- stringr::str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow)
   uniquely_match_base_edit <- search$enzyme[which(nmatch == 1L)]
 
   # Get outta here if nothing uniquely matched the intended base edit
   if (length(uniquely_match_base_edit) == 0) return(NA_character_)
 
   # Otherwise collapse enzymes into a single string
-  str_c(uniquely_match_base_edit, collapse = ' | ')
+  stringr::str_c(uniquely_match_base_edit, collapse = ' | ')
 }
 
 # ---- Misc utility functions ----
@@ -127,10 +179,10 @@ match_one_seq <- function(seq, basic, enzymes) {
 
 # Make each occurence of a pattern in a sting lower case individually
 str_to_lower_each <- function(string, pattern, locale = 'en') {
-  loc <- str_locate_all(string, pattern)
-  lhs <- purrr::map2(string, loc, ~str_sub(.x, start =  1L, end   = .y[, 'start'] - 1L))
-  rhs <- purrr::map2(string, loc, ~str_sub(.x, end   = -1L, start = .y[, 'end']   + 1L))
-  mid <- purrr::map2(string, loc, ~str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% str_to_lower(locale))
+  loc <- stringr::str_locate_all(string, pattern)
+  lhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, start =  1L, end   = .y[, 'start'] - 1L))
+  rhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, end   = -1L, start = .y[, 'end']   + 1L))
+  mid <- purrr::map2(string, loc, ~stringr::str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% stringr::str_to_lower(locale))
 
   purrr::pmap(list(lhs, mid, rhs), str_c)
 }
diff --git a/README.Rmd b/README.Rmd
index 1fbaea8..58865e5 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -75,7 +75,8 @@ BRCA1 <-
   filter(gene == 'BRCA1') %>%
   locate_codons(Genome) %>%
   locate_PAM(Genome) %>%
-  add_RFLP(width = 150)  # 150 is the maximum allowed
+  add_RFLP(width = 150) %>%  # 150 is the maximum allowed
+  add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t
 
 # (2) Detect iSTOP targets for genes that contain the string "BRCA"
 BRCA <-
@@ -84,7 +85,7 @@ BRCA <-
   locate_codons(Genome) %>%
   locate_PAM(Genome) %>%
   add_RFLP(width = 150)
-  
+
 # (3) Detect iSTOP targets for a set of transcript IDs
 ATR <-
   CDS_Human %>%
@@ -109,7 +110,7 @@ plot_spliced_isoforms(
   colors  = c('red', 'black'),
   # Name of track = table with columns `gene` and `genome_coord` 
   NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)),   # `|` = or
-  RFLP    = filter(BRCA, match_any  & has(RFLP_150)) # `&` = and
+  RFLP    = filter(BRCA, match_any  & has(RFLP_C_150)) # `&` = and
 )
 ```
 
diff --git a/README.md b/README.md
index 57fb9e9..eb75e85 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,8 @@ BRCA1 <-
   filter(gene == 'BRCA1') %>%
   locate_codons(Genome) %>%
   locate_PAM(Genome) %>%
-  add_RFLP(width = 150)  # 150 is the maximum allowed
+  add_RFLP(width = 150) %>%  # 150 is the maximum allowed
+  add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t
 
 # (2) Detect iSTOP targets for genes that contain the string "BRCA"
 BRCA <-
@@ -73,7 +74,7 @@ BRCA <-
   locate_codons(Genome) %>%
   locate_PAM(Genome) %>%
   add_RFLP(width = 150)
-  
+
 # (3) Detect iSTOP targets for a set of transcript IDs
 ATR <-
   CDS_Human %>%
@@ -99,7 +100,7 @@ plot_spliced_isoforms(
   colors  = c('red', 'black'),
   # Name of track = table with columns `gene` and `genome_coord` 
   NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)),   # `|` = or
-  RFLP    = filter(BRCA, match_any  & has(RFLP_150)) # `&` = and
+  RFLP    = filter(BRCA, match_any  & has(RFLP_C_150)) # `&` = and
 )
 ```
 
diff --git a/man/add_RFLP.Rd b/man/add_RFLP.Rd
index 2e4847c..e162e10 100644
--- a/man/add_RFLP.Rd
+++ b/man/add_RFLP.Rd
@@ -4,11 +4,16 @@
 \alias{add_RFLP}
 \title{Add RFLP enzymes to iSTOP data}
 \usage{
-add_RFLP(iSTOP, width = 150, enzymes = NULL, cores = 1)
+add_RFLP(iSTOP, recognizes = "c", width = 150, enzymes = NULL,
+  cores = 1)
 }
 \arguments{
 \item{iSTOP}{A dataframe resulting from \link{locate_PAM}.}
 
+\item{recognizes}{The base recognized by the enzyme. One of "c" or "t".
+In terms of base editing, "t" will return enzymes that only cut after editing,
+whereas "c" will return enzymes that only cut before editing.}
+
 \item{width}{An integer specifiying range of unique cutting.}
 
 \item{enzymes}{A dataframe of enzymes considered. Defaults to NULL which will
diff --git a/tests/testthat/test-add_RFLP.R b/tests/testthat/test-add_RFLP.R
index b820404..43d7009 100644
--- a/tests/testthat/test-add_RFLP.R
+++ b/tests/testthat/test-add_RFLP.R
@@ -2,7 +2,7 @@ context('RFLP')
 
 test_that('RFLP matching considers both strands', {
 
-  enzymes <- iSTOP:::process_enzymes()
+  enzymes <- iSTOP:::process_enzymes(recognizes = 'c')
   enzymes <- enzymes[enzymes$enzyme == 'MnlI', ]
 
   # The basic pattern should match if edited base is present
@@ -36,3 +36,41 @@ test_that('RFLP matching considers both strands', {
     2
   )
 })
+
+
+test_that('RFLP matching works for gained cutsite with t recognition', {
+
+  enzymes <- iSTOP:::process_enzymes(recognizes = 't')
+  enzymes <- enzymes[enzymes$enzyme == 'MnlI', ]
+
+  # The basic pattern should match if edited base is present
+  expect_equal(
+    iSTOP:::identify_RFLP_enzymes('CCtC', enzymes),
+    'MnlI'
+  )
+  # But should not match if no edited base is present
+  expect_equal(
+    iSTOP:::identify_RFLP_enzymes('CCTC', enzymes),
+    NA_character_
+  )
+  # If match on opposite strand then should match nothing and return an empty string
+  expect_equal(
+    iSTOP:::identify_RFLP_enzymes('GAGGCCtC', enzymes),
+    NA_character_
+  )
+  # If match two overlapping sites then should return nothing
+  expect_equal(
+    iSTOP:::identify_RFLP_enzymes('CCTCCtC', enzymes),
+    NA_character_
+  )
+  # If match two overlapping sites then should return nothing
+  expect_equal(
+    iSTOP:::identify_RFLP_enzymes('GGTtCAG', enzymes),
+    NA_character_
+  )
+  # Overlapping on opposite forward and reverse patterns should match either
+  expect_equal(
+    stringr::str_locate_all('GGTtCAG', regex('(?=GGTt)|(?=tCAG)', ignore_case = T)) %>% purrr::map_int(nrow),
+    2
+  )
+})
diff --git a/tests/testthat/test-search-off-target.R b/tests/testthat/test-search-off-target.R
index e9adb75..b380c3c 100644
--- a/tests/testthat/test-search-off-target.R
+++ b/tests/testthat/test-search-off-target.R
@@ -17,6 +17,11 @@ test_that('Off target search gets same results regardless of strand', {
 
   expect_matching <- c(1, 2, 3, 6, 7, 8)
 
+  #genome <- BSgenome.Hsapiens.UCSC.hg38::Hsapiens
+  #chromosomes <- BSgenome::seqnames(genome)[1:25]
+
+  #full_search <- iSTOP::search_off_target(seqs, genome, chromosomes, cores = 6)
+
   plus <-
     iSTOP:::search_off_target_chr(
       seqs,
@@ -43,12 +48,12 @@ test_that('Off target search gets same results regardless of strand', {
   )
 
   expect_equal(
-    plus$sequence,
-    minus$sequence
+    plus$guide,
+    minus$guide
   )
 
   expect_equal(
-    plus$sequence,
+    plus$guide,
     seqs[expect_matching]
   )