Skip to content

Commit

Permalink
Add ability to identify cut sites gained after edit
Browse files Browse the repository at this point in the history
  • Loading branch information
EricEdwardBryant committed Jul 5, 2017
1 parent 9350709 commit dfa6e40
Show file tree
Hide file tree
Showing 6 changed files with 134 additions and 32 deletions.
94 changes: 73 additions & 21 deletions R/add-RFLP.R
Expand Up @@ -2,6 +2,9 @@
#' Add RFLP enzymes to iSTOP data
#'
#' @param iSTOP A dataframe resulting from [locate_PAM].
#' @param recognizes The base recognized by the enzyme. One of "c" or "t".
#' In terms of base editing, "t" will return enzymes that only cut after editing,
#' whereas "c" will return enzymes that only cut before editing.
#' @param width An integer specifiying range of unique cutting.
#' @param enzymes A dataframe of enzymes considered. Defaults to NULL which will
#' use an enzyme dataset included in the iSTOP package.
Expand All @@ -17,11 +20,14 @@
#' @importFrom purrr pmap map_int map_chr map2 map
#' @md

add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) {
add_RFLP <- function(iSTOP, recognizes = 'c', width = 150, enzymes = NULL, cores = 1) {

assertthat::assert_that(assertthat::is.number(width), assertthat::is.number(cores), assertthat::is.string(recognizes))
assertthat::assert_that(recognizes %in% c('c', 't'), msg = '`recognizes` argument for add_RFLP() must be one of "c", or "t".')

message('Adding RFLP...')

enzyme_combinations <- process_enzymes(enzymes)
enzyme_combinations <- process_enzymes(enzymes, recognizes)

if (cores <= 1L) cores <- NULL

Expand All @@ -30,19 +36,28 @@ add_RFLP <- function(iSTOP, width = 150, enzymes = NULL, cores = 1) {
on.exit(pbapply::pboptions(pbo), add = TRUE)

iSTOP %>%
split(.$chr) %>%
split(.$chr) %>% # split on chromosome for parallelization
pbapply::pblapply(function(df) {
center <- ceiling(str_length(df$searched) / 2)
search <- str_sub(df$searched, start = center - width, end = center + width)
df[[paste0('RFLP_', width)]] <- identify_RFLP_enzymes(search, enzyme_combinations)

center <- ceiling(stringr::str_length(df$searched) / 2)
search <- stringr::str_sub(df$searched, start = center - width, end = center + width)

if (recognizes == 't') {
search <- stringr::str_replace(search, 'c', 't')
column_name <- paste0('RFLP_T_', width)
} else {
column_name <- paste0('RFLP_C_', width)
}

df[[column_name]] <- identify_RFLP_enzymes(search, enzyme_combinations)
return(df)
}, cl = cores) %>%
bind_rows
bind_rows()
}

# Converts all Cs to lowercase one at a time to generate all possible match patterns
# Supports ambiguities by converting compatible patterns to 'c' and excluding incombatible ambiguous matches
process_enzymes <- function(enzymes = NULL) {
process_enzymes <- function(enzymes = NULL, recognizes) {

if (is.null(enzymes)) {
enzymes <-
Expand All @@ -51,6 +66,17 @@ process_enzymes <- function(enzymes = NULL) {
filter(!exclude)
}

switch(
recognizes,
c = process_enzymes_c(enzymes),
t = process_enzymes_t(enzymes)
)

}

# ---- Process enzymes for recognizing a particular base ----

process_enzymes_c <- function(enzymes) {
enzymes %>%
select(enzyme, forward, reverse) %>%
# Be agnostic to strand orientation (i.e. keep both)
Expand All @@ -60,46 +86,72 @@ process_enzymes <- function(enzymes = NULL) {
tidyr::unnest() %>%
# If there is an ambiguous pattern for 'c' that does not include 'T' then change
# pattern to only match 'c' (since 'T' is what 'C' will be changed to)
mutate(pattern = str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>%
mutate(pattern = stringr::str_replace(pattern, '\\[Ac\\]|\\[cG\\]|\\[AcG\\]', 'c')) %>%
# If 'c' is contained within an ambiguity that includes 'T' then remove pattern
# (since we do not want to match after 'C' is chaned to 'T')
filter(!str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>%
filter(!stringr::str_detect(pattern, '\\[cT\\]|\\[AcT\\]|\\[cGT\\]')) %>%
select(enzyme, pattern) %>%
distinct %>%
# Replace standard forward and reverse patterns
left_join(enzymes, by = 'enzyme') %>%
# Ensure that forward and reverse patterns will match even when overlapping
mutate(
forward = str_c('(?=', str_to_upper(forward), ')'),
reverse = str_c('(?=', str_to_upper(reverse), ')'),
fwd_rev = if_else(forward == reverse, forward, str_c(forward, '|', reverse))
forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'),
reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'),
fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse))
)
}

process_enzymes_t <- function(enzymes) {
enzymes %>%
select(enzyme, forward, reverse) %>%
# Be agnostic to strand orientation (i.e. keep both)
tidyr::gather(orientation, pattern, forward, reverse) %>%
# Change each occurence of 'T' to 't' one at a time (since this is the intended base change)
mutate(pattern = str_to_lower_each(pattern, 'T')) %>%
tidyr::unnest() %>%
# If there is an ambiguous pattern for 't' that does not include 'C' then change pattern to only match 't'
mutate(pattern = stringr::str_replace(pattern, '\\[At\\]|\\[Gt\\]|\\[AGt\\]', 't')) %>%
# If 't' is contained within an ambiguity that includes 'C' then remove pattern
filter(!stringr::str_detect(pattern, '\\[Ct\\]|\\[ACt\\]|\\[CGt\\]')) %>%
select(enzyme, pattern) %>%
distinct %>%
# Replace standard forward and reverse patterns
left_join(enzymes, by = 'enzyme') %>%
# Ensure that forward and reverse patterns will match even when overlapping
mutate(
forward = stringr::str_c('(?=', stringr::str_to_upper(forward), ')'),
reverse = stringr::str_c('(?=', stringr::str_to_upper(reverse), ')'),
fwd_rev = if_else(forward == reverse, forward, stringr::str_c(forward, '|', reverse))
)
}

# ---- Utilities: add_RFLP ----

identify_RFLP_enzymes <- function(seqs, enzymes) {
# Get all unique patterns
basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct
basic <- enzymes %>% select(enzyme, fwd_rev) %>% distinct()
purrr::map_chr(seqs, match_one_seq, basic, enzymes)
}


match_one_seq <- function(seq, basic, enzymes) {
# The enzymes that match the base edit (a small number and faster pattern matching)
match_base_edit <- enzymes$enzyme[which(str_detect(seq, enzymes$pattern))]
match_base_edit <- enzymes$enzyme[which(stringr::str_detect(seq, enzymes$pattern))]

# Get outta here if nothing matched the intended base edit
if (length(match_base_edit) == 0) return(NA_character_)

# Otherwise check how many times the enzyme cuts the sequence
search <- basic[basic$enzyme %in% match_base_edit, ]
nmatch <- str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow)
nmatch <- stringr::str_locate_all(seq, regex(search$fwd_rev, ignore_case = T)) %>% purrr::map_int(nrow)
uniquely_match_base_edit <- search$enzyme[which(nmatch == 1L)]

# Get outta here if nothing uniquely matched the intended base edit
if (length(uniquely_match_base_edit) == 0) return(NA_character_)

# Otherwise collapse enzymes into a single string
str_c(uniquely_match_base_edit, collapse = ' | ')
stringr::str_c(uniquely_match_base_edit, collapse = ' | ')
}

# ---- Misc utility functions ----
Expand Down Expand Up @@ -127,10 +179,10 @@ match_one_seq <- function(seq, basic, enzymes) {

# Make each occurence of a pattern in a sting lower case individually
str_to_lower_each <- function(string, pattern, locale = 'en') {
loc <- str_locate_all(string, pattern)
lhs <- purrr::map2(string, loc, ~str_sub(.x, start = 1L, end = .y[, 'start'] - 1L))
rhs <- purrr::map2(string, loc, ~str_sub(.x, end = -1L, start = .y[, 'end'] + 1L))
mid <- purrr::map2(string, loc, ~str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% str_to_lower(locale))
loc <- stringr::str_locate_all(string, pattern)
lhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, start = 1L, end = .y[, 'start'] - 1L))
rhs <- purrr::map2(string, loc, ~stringr::str_sub(.x, end = -1L, start = .y[, 'end'] + 1L))
mid <- purrr::map2(string, loc, ~stringr::str_sub(.x, start = .y[, 'start'], end = .y[, 'end']) %>% stringr::str_to_lower(locale))

purrr::pmap(list(lhs, mid, rhs), str_c)
}
7 changes: 4 additions & 3 deletions README.Rmd
Expand Up @@ -75,7 +75,8 @@ BRCA1 <-
filter(gene == 'BRCA1') %>%
locate_codons(Genome) %>%
locate_PAM(Genome) %>%
add_RFLP(width = 150) # 150 is the maximum allowed
add_RFLP(width = 150) %>% # 150 is the maximum allowed
add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t
# (2) Detect iSTOP targets for genes that contain the string "BRCA"
BRCA <-
Expand All @@ -84,7 +85,7 @@ BRCA <-
locate_codons(Genome) %>%
locate_PAM(Genome) %>%
add_RFLP(width = 150)
# (3) Detect iSTOP targets for a set of transcript IDs
ATR <-
CDS_Human %>%
Expand All @@ -109,7 +110,7 @@ plot_spliced_isoforms(
colors = c('red', 'black'),
# Name of track = table with columns `gene` and `genome_coord`
NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)), # `|` = or
RFLP = filter(BRCA, match_any & has(RFLP_150)) # `&` = and
RFLP = filter(BRCA, match_any & has(RFLP_C_150)) # `&` = and
)
```

Expand Down
7 changes: 4 additions & 3 deletions README.md
Expand Up @@ -64,7 +64,8 @@ BRCA1 <-
filter(gene == 'BRCA1') %>%
locate_codons(Genome) %>%
locate_PAM(Genome) %>%
add_RFLP(width = 150) # 150 is the maximum allowed
add_RFLP(width = 150) %>% # 150 is the maximum allowed
add_RFLP(recognizes = 't') # Enzymes that cut if c edits to t

# (2) Detect iSTOP targets for genes that contain the string "BRCA"
BRCA <-
Expand All @@ -73,7 +74,7 @@ BRCA <-
locate_codons(Genome) %>%
locate_PAM(Genome) %>%
add_RFLP(width = 150)

# (3) Detect iSTOP targets for a set of transcript IDs
ATR <-
CDS_Human %>%
Expand All @@ -99,7 +100,7 @@ plot_spliced_isoforms(
colors = c('red', 'black'),
# Name of track = table with columns `gene` and `genome_coord`
NGG_NGA = filter(BRCA, has(sgNGG) | has(sgNGA)), # `|` = or
RFLP = filter(BRCA, match_any & has(RFLP_150)) # `&` = and
RFLP = filter(BRCA, match_any & has(RFLP_C_150)) # `&` = and
)
```

Expand Down
7 changes: 6 additions & 1 deletion man/add_RFLP.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 39 additions & 1 deletion tests/testthat/test-add_RFLP.R
Expand Up @@ -2,7 +2,7 @@ context('RFLP')

test_that('RFLP matching considers both strands', {

enzymes <- iSTOP:::process_enzymes()
enzymes <- iSTOP:::process_enzymes(recognizes = 'c')
enzymes <- enzymes[enzymes$enzyme == 'MnlI', ]

# The basic pattern should match if edited base is present
Expand Down Expand Up @@ -36,3 +36,41 @@ test_that('RFLP matching considers both strands', {
2
)
})


test_that('RFLP matching works for gained cutsite with t recognition', {

enzymes <- iSTOP:::process_enzymes(recognizes = 't')
enzymes <- enzymes[enzymes$enzyme == 'MnlI', ]

# The basic pattern should match if edited base is present
expect_equal(
iSTOP:::identify_RFLP_enzymes('CCtC', enzymes),
'MnlI'
)
# But should not match if no edited base is present
expect_equal(
iSTOP:::identify_RFLP_enzymes('CCTC', enzymes),
NA_character_
)
# If match on opposite strand then should match nothing and return an empty string
expect_equal(
iSTOP:::identify_RFLP_enzymes('GAGGCCtC', enzymes),
NA_character_
)
# If match two overlapping sites then should return nothing
expect_equal(
iSTOP:::identify_RFLP_enzymes('CCTCCtC', enzymes),
NA_character_
)
# If match two overlapping sites then should return nothing
expect_equal(
iSTOP:::identify_RFLP_enzymes('GGTtCAG', enzymes),
NA_character_
)
# Overlapping on opposite forward and reverse patterns should match either
expect_equal(
stringr::str_locate_all('GGTtCAG', regex('(?=GGTt)|(?=tCAG)', ignore_case = T)) %>% purrr::map_int(nrow),
2
)
})
11 changes: 8 additions & 3 deletions tests/testthat/test-search-off-target.R
Expand Up @@ -17,6 +17,11 @@ test_that('Off target search gets same results regardless of strand', {

expect_matching <- c(1, 2, 3, 6, 7, 8)

#genome <- BSgenome.Hsapiens.UCSC.hg38::Hsapiens
#chromosomes <- BSgenome::seqnames(genome)[1:25]

#full_search <- iSTOP::search_off_target(seqs, genome, chromosomes, cores = 6)

plus <-
iSTOP:::search_off_target_chr(
seqs,
Expand All @@ -43,12 +48,12 @@ test_that('Off target search gets same results regardless of strand', {
)

expect_equal(
plus$sequence,
minus$sequence
plus$guide,
minus$guide
)

expect_equal(
plus$sequence,
plus$guide,
seqs[expect_matching]
)

Expand Down

0 comments on commit dfa6e40

Please sign in to comment.