From dfa6e408658a08350352ad371454d6f7b1f8882e Mon Sep 17 00:00:00 2001 From: Eric Bryant Date: Wed, 5 Jul 2017 17:33:58 -0400 Subject: [PATCH] Add ability to identify cut sites gained after edit --- R/add-RFLP.R | 94 +++++++++++++++++++------ README.Rmd | 7 +- README.md | 7 +- man/add_RFLP.Rd | 7 +- tests/testthat/test-add_RFLP.R | 40 ++++++++++- tests/testthat/test-search-off-target.R | 11 ++- 6 files changed, 134 insertions(+), 32 deletions(-) diff --git a/R/add-RFLP.R b/R/add-RFLP.R index 4608e48..bcb635f 100644 --- a/R/add-RFLP.R +++ b/R/add-RFLP.R @@ -2,6 +2,9 @@ #' Add RFLP enzymes to iSTOP data #' #' @param iSTOP A dataframe resulting from [locate_PAM]. +#' @param recognizes The base recognized by the enzyme. One of "c" or "t". +#' In terms of base editing, "t" will return enzymes that only cut after editing, +#' whereas "c" will return enzymes that only cut before editing. #' @param width An integer specifiying range of unique cutting. #' @param enzymes A dataframe of enzymes considered. Defaults to NULL which will #' use an enzyme dataset included in the iSTOP package. @@ -17,11 +20,14 @@ #' @importFrom purrr pmap map_int map_chr map2 map #' @md -add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) { +add_RFLP <- function(iSTOP, recognizes = 'c', width = 150, enzymes = NULL, cores = 1) { + + assertthat::assert_that(assertthat::is.number(width), assertthat::is.number(cores), assertthat::is.string(recognizes)) + assertthat::assert_that(recognizes %in% c('c', 't'), msg = '`recognizes` argument for add_RFLP() must be one of "c", or "t".') message('Adding RFLP...') - enzyme_combinations <- process_enzymes(enzymes) + enzyme_combinations <- process_enzymes(enzymes, recognizes) if (cores <= 1L) cores <- NULL @@ -30,19 +36,28 @@ add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) { on.exit(pbapply::pboptions(pbo), add = TRUE) iSTOP %>% - split(.$chr) %>% + split(.$chr) %>% # split on chromosome for parallelization pbapply::pblapply(function(df) { - center <- ceiling(str_length(df$searched) / 2) - search <- str_sub(df$searched, start = center - width, end = center + width) - df[[paste0('RFLP_', width)]] <- identify_RFLP_enzymes(search, enzyme_combinations) + + center <- ceiling(stringr::str_length(df$searched) / 2) + search <- stringr::str_sub(df$searched, start = center - width, end = center + width) + + if (recognizes == 't') { + search <- stringr::str_replace(search, 'c', 't') + column_name <- paste0('RFLP_T_', width) + } else { + column_name <- paste0('RFLP_C_', width) + } + + df[[column_name]] <- identify_RFLP_enzymes(search, enzyme_combinations) return(df) }, cl = cores) %>% - bind_rows + bind_rows() } # Converts all Cs to lowercase one at a time to generate all possible match patterns # Supports ambiguities by converting compatible patterns to 'c' and excluding incombatible ambiguous matches -process_enzymes <- function(enzymes = NULL) { +process_enzymes <- function(enzymes = NULL, recognizes) { if (is.null(enzymes)) { enzymes <- @@ -51,6 +66,17 @@ process_enzymes <- function(enzymes = NULL) { filter(!exclude) } + switch( + recognizes, + c = process_enzymes_c(enzymes), + t = process_enzymes_t(enzymes) + ) + +} + +# ---- Process enzymes for recognizing a particular base ---- + +process_enzymes_c <- function(enzymes) { enzymes %>% select(enzyme, forward, reverse) %>% # Be agnostic to strand orientation (i.e. keep both) @@ -60,46 +86,72 @@ process_enzymes <- function(enzymes = NULL) { tidyr::unnest() %>% # If there is an ambiguous pattern for 'c' that does not include 'T' then change # pattern to only match 'c' (since 'T' is what 'C' will be changed to) - mutate(pattern = str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>% + mutate(pattern = stringr::str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>% # If 'c' is contained within an ambiguity that includes 'T' then remove pattern # (since we do not want to match after 'C' is chaned to 'T') - filter(!str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>% + filter(!stringr::str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>% select(enzyme, pattern) %>% distinct %>% # Replace standard forward and reverse patterns left_join(enzymes, by = 'enzyme') %>% # Ensure that forward and reverse patterns will match even when overlapping mutate( - forward = str_c('(?=', str_to_upper(forward), ')'), - reverse = str_c('(?=', str_to_upper(reverse), ')'), - fwd_rev = if_else(forward == reverse, forward, str_c(forward, '|', reverse)) + forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'), + reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'), + fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse)) ) } +process_enzymes_t <- function(enzymes) { + enzymes %>% + select(enzyme, forward, reverse) %>% + # Be agnostic to strand orientation (i.e. keep both) + tidyr::gather(orientation, pattern, forward, reverse) %>% + # Change each occurence of 'T' to 't' one at a time (since this is the intended base change) + mutate(pattern = str_to_lower_each(pattern, 'T')) %>% + tidyr::unnest() %>% + # If there is an ambiguous pattern for 't' that does not include 'C' then change pattern to only match 't' + mutate(pattern = stringr::str_replace(pattern, '\\[At\\]|\\[Gt\\]|\\[AGt\\]', 't')) %>% + # If 't' is contained within an ambiguity that includes 'C' then remove pattern + filter(!stringr::str_detect(pattern, '\\[Ct\\]|\\[ACt\\]|\\[CGt\\]')) %>% + select(enzyme, pattern) %>% + distinct %>% + # Replace standard forward and reverse patterns + left_join(enzymes, by = 'enzyme') %>% + # Ensure that forward and reverse patterns will match even when overlapping + mutate( + forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'), + reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'), + fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse)) + ) +} + +# ---- Utilities: add_RFLP ---- + identify_RFLP_enzymes <- function(seqs, enzymes) { # Get all unique patterns - basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct + basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct() purrr::map_chr(seqs, match_one_seq, basic, enzymes) } match_one_seq <- function(seq, basic, enzymes) { # The enzymes that match the base edit (a small number and faster pattern matching) - match_base_edit <- enzymes$enzyme[which(str_detect(seq, enzymes$pattern))] + match_base_edit <- enzymes$enzyme[which(stringr::str_detect(seq, enzymes$pattern))] # Get outta here if nothing matched the intended base edit if (length(match_base_edit) == 0) return(NA_character_) # Otherwise check how many times the enzyme cuts the sequence search <- basic[basic$enzyme %in% match_base_edit, ] - nmatch <- str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow) + nmatch <- stringr::str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow) uniquely_match_base_edit <- search$enzyme[which(nmatch == 1L)] # Get outta here if nothing uniquely matched the intended base edit if (length(uniquely_match_base_edit) == 0) return(NA_character_) # Otherwise collapse enzymes into a single string - str_c(uniquely_match_base_edit, collapse = ' | ') + stringr::str_c(uniquely_match_base_edit, collapse = ' | ') } # ---- Misc utility functions ---- @@ -127,10 +179,10 @@ match_one_seq <- function(seq, basic, enzymes) { # Make each occurence of a pattern in a sting lower case individually str_to_lower_each <- function(string, pattern, locale = 'en') { - loc <- str_locate_all(string, pattern) - lhs <- purrr::map2(string, loc, ~str_sub(.x, start = 1L, end = .y[, 'start'] - 1L)) - rhs <- purrr::map2(string, loc, ~str_sub(.x, end = -1L, start = .y[, 'end'] + 1L)) - mid <- purrr::map2(string, loc, ~str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% str_to_lower(locale)) + loc <- stringr::str_locate_all(string, pattern) + lhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, start = 1L, end = .y[, 'start'] - 1L)) + rhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, end = -1L, start = .y[, 'end'] + 1L)) + mid <- purrr::map2(string, loc, ~stringr::str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% stringr::str_to_lower(locale)) purrr::pmap(list(lhs, mid, rhs), str_c) } diff --git a/README.Rmd b/README.Rmd index 1fbaea8..58865e5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -75,7 +75,8 @@ BRCA1 <- filter(gene == 'BRCA1') %>% locate_codons(Genome) %>% locate_PAM(Genome) %>% - add_RFLP(width = 150) # 150 is the maximum allowed + add_RFLP(width = 150) %>% # 150 is the maximum allowed + add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t # (2) Detect iSTOP targets for genes that contain the string "BRCA" BRCA <- @@ -84,7 +85,7 @@ BRCA <- locate_codons(Genome) %>% locate_PAM(Genome) %>% add_RFLP(width = 150) - + # (3) Detect iSTOP targets for a set of transcript IDs ATR <- CDS_Human %>% @@ -109,7 +110,7 @@ plot_spliced_isoforms( colors = c('red', 'black'), # Name of track = table with columns `gene` and `genome_coord` NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)), # `|` = or - RFLP = filter(BRCA, match_any & has(RFLP_150)) # `&` = and + RFLP = filter(BRCA, match_any & has(RFLP_C_150)) # `&` = and ) ``` diff --git a/README.md b/README.md index 57fb9e9..eb75e85 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,8 @@ BRCA1 <- filter(gene == 'BRCA1') %>% locate_codons(Genome) %>% locate_PAM(Genome) %>% - add_RFLP(width = 150) # 150 is the maximum allowed + add_RFLP(width = 150) %>% # 150 is the maximum allowed + add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t # (2) Detect iSTOP targets for genes that contain the string "BRCA" BRCA <- @@ -73,7 +74,7 @@ BRCA <- locate_codons(Genome) %>% locate_PAM(Genome) %>% add_RFLP(width = 150) - + # (3) Detect iSTOP targets for a set of transcript IDs ATR <- CDS_Human %>% @@ -99,7 +100,7 @@ plot_spliced_isoforms( colors = c('red', 'black'), # Name of track = table with columns `gene` and `genome_coord` NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)), # `|` = or - RFLP = filter(BRCA, match_any & has(RFLP_150)) # `&` = and + RFLP = filter(BRCA, match_any & has(RFLP_C_150)) # `&` = and ) ``` diff --git a/man/add_RFLP.Rd b/man/add_RFLP.Rd index 2e4847c..e162e10 100644 --- a/man/add_RFLP.Rd +++ b/man/add_RFLP.Rd @@ -4,11 +4,16 @@ \alias{add_RFLP} \title{Add RFLP enzymes to iSTOP data} \usage{ -add_RFLP(iSTOP, width = 150, enzymes = NULL, cores = 1) +add_RFLP(iSTOP, recognizes = "c", width = 150, enzymes = NULL, + cores = 1) } \arguments{ \item{iSTOP}{A dataframe resulting from \link{locate_PAM}.} +\item{recognizes}{The base recognized by the enzyme. One of "c" or "t". +In terms of base editing, "t" will return enzymes that only cut after editing, +whereas "c" will return enzymes that only cut before editing.} + \item{width}{An integer specifiying range of unique cutting.} \item{enzymes}{A dataframe of enzymes considered. Defaults to NULL which will diff --git a/tests/testthat/test-add_RFLP.R b/tests/testthat/test-add_RFLP.R index b820404..43d7009 100644 --- a/tests/testthat/test-add_RFLP.R +++ b/tests/testthat/test-add_RFLP.R @@ -2,7 +2,7 @@ context('RFLP') test_that('RFLP matching considers both strands', { - enzymes <- iSTOP:::process_enzymes() + enzymes <- iSTOP:::process_enzymes(recognizes = 'c') enzymes <- enzymes[enzymes$enzyme == 'MnlI', ] # The basic pattern should match if edited base is present @@ -36,3 +36,41 @@ test_that('RFLP matching considers both strands', { 2 ) }) + + +test_that('RFLP matching works for gained cutsite with t recognition', { + + enzymes <- iSTOP:::process_enzymes(recognizes = 't') + enzymes <- enzymes[enzymes$enzyme == 'MnlI', ] + + # The basic pattern should match if edited base is present + expect_equal( + iSTOP:::identify_RFLP_enzymes('CCtC', enzymes), + 'MnlI' + ) + # But should not match if no edited base is present + expect_equal( + iSTOP:::identify_RFLP_enzymes('CCTC', enzymes), + NA_character_ + ) + # If match on opposite strand then should match nothing and return an empty string + expect_equal( + iSTOP:::identify_RFLP_enzymes('GAGGCCtC', enzymes), + NA_character_ + ) + # If match two overlapping sites then should return nothing + expect_equal( + iSTOP:::identify_RFLP_enzymes('CCTCCtC', enzymes), + NA_character_ + ) + # If match two overlapping sites then should return nothing + expect_equal( + iSTOP:::identify_RFLP_enzymes('GGTtCAG', enzymes), + NA_character_ + ) + # Overlapping on opposite forward and reverse patterns should match either + expect_equal( + stringr::str_locate_all('GGTtCAG', regex('(?=GGTt)|(?=tCAG)', ignore_case = T)) %>% purrr::map_int(nrow), + 2 + ) +}) diff --git a/tests/testthat/test-search-off-target.R b/tests/testthat/test-search-off-target.R index e9adb75..b380c3c 100644 --- a/tests/testthat/test-search-off-target.R +++ b/tests/testthat/test-search-off-target.R @@ -17,6 +17,11 @@ test_that('Off target search gets same results regardless of strand', { expect_matching <- c(1, 2, 3, 6, 7, 8) + #genome <- BSgenome.Hsapiens.UCSC.hg38::Hsapiens + #chromosomes <- BSgenome::seqnames(genome)[1:25] + + #full_search <- iSTOP::search_off_target(seqs, genome, chromosomes, cores = 6) + plus <- iSTOP:::search_off_target_chr( seqs, @@ -43,12 +48,12 @@ test_that('Off target search gets same results regardless of strand', { ) expect_equal( - plus$sequence, - minus$sequence + plus$guide, + minus$guide ) expect_equal( - plus$sequence, + plus$guide, seqs[expect_matching] )