Skip to content

Commit

Permalink
address data.table R CMD check warnings
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Apr 10, 2024
1 parent 759940d commit 7371b00
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 39 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -2,6 +2,7 @@

* A new auxiliary function `segment()` generates overlapping segments of text for strings longer than the maximum nchar that can be processed by DBpedia Spotlight.
* Method `get_dbpedia_uris()` has new argument `overlap` passed into `segment()`.
* Method `get_dbpedia_uris()` has new argument `offset` to indicate base offset number other than 1.


## dbpedia v0.1.2.9004
Expand Down
50 changes: 34 additions & 16 deletions R/dbpedia.R
@@ -1,5 +1,5 @@
`:=` <- function(...) NULL
.SD <- NULL
.SD <- .GRP <- .I <- .N <- NULL

#' Set and report status of DBpedia Spotlight
#'
Expand Down Expand Up @@ -326,6 +326,7 @@ setGeneric(
#' @exportMethod get_dbpedia_uris
#' @rdname get_dbpedia_uris
#' @importFrom data.table data.table
#' @importFrom utils capture.output
#' @examples
#' \dontrun{
#' # Process AnnotatedPlainTextDocument (example available in NLP package)
Expand Down Expand Up @@ -361,6 +362,7 @@ setMethod(
language = getOption("dbpedia.lang"),
max_len = 5600L,
overlap = 500L,
offset = 1L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand All @@ -378,13 +380,14 @@ setMethod(
)
segs <- segment(x = x, max_len = max_len, overlap = overlap)
dts <- lapply(
segs,
function(seg){
seq_along(segs),
function(i){
get_dbpedia_uris(
x = seg,
x = segs[[i]],
language = language,
max_len = max_len, # input 'seg' must be below this threshold
overlap = overlap, # may not be needed
offset = as.integer(names(segs)[i]),
confidence = confidence,
api = api,
retry = retry,
Expand All @@ -398,18 +401,21 @@ setMethod(
}
)

pos <- as.integer(names(segs))
offset <- as.integer(names(segs))
for (i in seq_along(dts)){
if (i == 1){
breakpoint <- (nchar(dts[[1L]]) - pos[2L]) / 2
dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint]
if (i == 1L){
breakpoint_r <- offset[2L] + (nchar(segs[1L]) - offset[2L]) / 2
dts[[1L]] <- dts[[1L]][dts[[1L]][["start"]] < breakpoint_r]
} else if (i == length(dts)){
breakpoint <- ((pos[i - 1L] + nchar(segs[i - 1L]) - 1L) - pos[i]) / 2
dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint]
offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L
breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2)
dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l]
} else {
breakpoint_l <- ((pos[i - 1L] + nchar(segs[i - 1L] - 1L)) - pos[i]) / 2
breakpoint_r <- ((pos[i] + nchar(segs[i] - 1L)) - pos[i + 1]) / 2
dts[[i]] <- dts[[i]][dts[[i]][["start"]] > breakpoint_l]
offset_prev <- offset[i - 1L] + nchar(segs[i - 1L]) - 1L
breakpoint_l <- offset[i] + ((offset_prev - offset[i]) / 2)
rbound_current <- offset[i] + nchar(segs[i]) - 1L
breakpoint_r <- offset[i + 1] + ((rbound_current - offset[i + 1]) / 2)
dts[[i]] <- dts[[i]][dts[[i]][["start"]] >= breakpoint_l]
dts[[i]] <- dts[[i]][dts[[i]][["start"]] < breakpoint_r]
}
}
Expand All @@ -428,7 +434,7 @@ setMethod(
)

if (verbose) cli_progress_step("send request to DBpedia Spotlight")
request_max <- if (is.logical(retry)) as.integer(request) else retry
request_max <- if (is.logical(retry)) as.integer(retry) else retry
request_number <- 1L
proceed <- TRUE

Expand Down Expand Up @@ -523,8 +529,8 @@ setMethod(
)
setcolorder(resources_min, c("start", "text", "dbpedia_uri", "types"))

resources_min[, "start" := as.integer(resources_min[["start"]]) + 1L]
resources_min[, "start" := as.integer(resources_min[["start"]]) + offset]

# See issue 41.
types_list <- strsplit(x = resources_min[["types"]], split = ",")

Expand Down Expand Up @@ -581,6 +587,7 @@ setMethod(
x,
language = getOption("dbpedia.lang"),
max_len = 5600L,
overlap = 1000L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand All @@ -594,6 +601,7 @@ setMethod(
x = as.character(x[["content"]]),
language = language,
max_len = max_len,
overlap = overlap,
confidence = confidence,
api = api,
retry = retry,
Expand Down Expand Up @@ -626,6 +634,8 @@ setMethod(
#' threshold of 5600 characters is the default value.
#' @param overlap If the input string `x` is longer than `max_len`, the numnber
#' of overlapping characters (passed into `segment()`).
#' @param offset An integer value with the base offset position of the text to
#' be annotated.
#' @param language The language of the input text ("en", "fr", "de", ...) to
#' determine the stopwords used.
#' @param confidence A `numeric` value, the minimum similarity score that serves
Expand Down Expand Up @@ -706,6 +716,7 @@ setMethod(
p_attribute = "word",
s_attribute = NULL,
max_len = 5600L,
overlap = 1000L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand Down Expand Up @@ -744,6 +755,7 @@ setMethod(
x = doc,
language = language,
max_len = max_len,
overlap = overlap,
confidence = confidence,
api = api,
retry = retry,
Expand Down Expand Up @@ -890,6 +902,7 @@ setMethod(
types = character(),
support = 20,
max_len = 5600L,
overlap = 1000L,
expand_to_token = FALSE,
verbose = TRUE,
progress = FALSE
Expand All @@ -909,6 +922,7 @@ setMethod(
language = language,
s_attribute = s_attribute,
max_len = max_len,
overlap = overlap,
confidence = confidence,
api = api,
retry = retry,
Expand Down Expand Up @@ -956,6 +970,7 @@ setMethod(
x,
language = getOption("dbpedia.lang"),
max_len = 5600L,
overlap = 1000L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand Down Expand Up @@ -992,6 +1007,7 @@ setMethod(
x = docs[[docname]],
language = language,
max_len = max_len,
overlap = overlap,
confidence = confidence,
api = api,
retry = retry,
Expand Down Expand Up @@ -1032,6 +1048,7 @@ setMethod(
token_tags = c("w", "pc"),
text_tag = NULL,
max_len = 5600L,
overlap = 1000L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
Expand Down Expand Up @@ -1127,6 +1144,7 @@ setMethod(
x = doc,
language = language,
max_len = max_len,
overlap = overlap,
confidence = confidence,
api = api,
retry = retry,
Expand Down
34 changes: 17 additions & 17 deletions R/overlaps.R
Expand Up @@ -64,7 +64,7 @@ detect_overlap <- function(x,
}

if ("doc" %in% colnames(x)) {
x[, ovl_id := detect_overlap_aux(.SD,
x[, "ovl_id" := detect_overlap_aux(.SD,
group_id = .GRP,
start_col = start_col,
end_col = end_col,
Expand All @@ -73,7 +73,7 @@ detect_overlap <- function(x,

} else {

x[, ovl_id := detect_overlap_aux(input_dt = x,
x[, "ovl_id" := detect_overlap_aux(input_dt = x,
group_id = NULL,
start_col = start_col,
end_col = end_col,
Expand Down Expand Up @@ -192,7 +192,7 @@ detect_overlap_aux <- function(input_dt,
}

# merge to input
ovl_dt[overlaps_out_long, on = "row_idx", ovl_id := i.overlap_id]
ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := i.overlap_id]

retval <- ovl_dt[["ovl_id"]]
}
Expand Down Expand Up @@ -304,7 +304,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
}

# Create new column with document type "NA"
x[, ovl_type := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")]
x[, "ovl_type" := ifelse(is.na(x[["ovl_id"]]), NA_character_, "ovl_undetermined")]

# set key for later foverlaps
setkeyv(x, c(start_col, end_col))
Expand All @@ -320,7 +320,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
.SD[["text"]],
.SD[["types"]],
get_outer_inner_ovl_aux(.SD, start_col = start_col, end_col = end_col, verbose = verbose)),
by = ovl_id]
by = "ovl_id"]

# For "partial" matches, create an inner and an outer version of the
# annotation. This is currently experimental as it introduces annotations not
Expand All @@ -332,8 +332,8 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
cli_alert_info(text = "Finding outer and inner segments for partial matches. This is experimental.")
}

if (x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) {
overlaps_outer_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
if (x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"), .N] > 0) {
overlaps_outer_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
list(
doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA),
start = min(.SD[[start_col]]),
Expand All @@ -343,9 +343,9 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
types = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, unique(.SD[["types"]]), list(list())),
ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_outer", "ovl_partial|ovl_multiple|ovl_outer")
),
by = ovl_id]
by = "ovl_id"]

overlaps_inner_dt <- x[ovl_type %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
overlaps_inner_dt <- x[x[["ovl_type"]] %in% c("ovl_partial", "ovl_partial|ovl_distinct"),
list(
doc = ifelse("doc" %in% colnames(.SD), .SD[["doc"]], NA),
start = min(get_inner_overlap_range(.SD, start_col = start_col, end_col = end_col)),
Expand All @@ -357,7 +357,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
list(list())),
ovl_type = ifelse(length(unique(.SD[["dbpedia_uri"]])) == 1, "ovl_partial|ovl_inner", "ovl_partial|ovl_multiple|ovl_inner")
),
by = ovl_id]
by = "ovl_id"]

add_ents_dt <- rbind(overlaps_outer_dt, overlaps_inner_dt)

Expand All @@ -381,10 +381,10 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
cols <- c("ovl_longest", "ovl_shortest", "ovl_inner", "ovl_outer",
"ovl_partial", "ovl_multiple", "ovl_distinct", "ovl_undetermined")

x[!is.na(ovl_id), (cols) := lapply(cols, function(x) grepl(pattern = x, ovl_type)), by = .I]
x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(x) grepl(pattern = x, x[["ovl_type"]])), by = .I]

# after this, the ovl_type column is not needed anymore.
x[, ovl_type := NULL]
x[, "ovl_type" := NULL]

if ("doc" %in% colnames(x)) {
setorderv(x, c("doc", start_col))
Expand Down Expand Up @@ -558,15 +558,15 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
ovl_unique_before <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) # number of overlaps

# first, keep all non-overlapping entities
x[is.na(ovl_id), ovl_keep := 1L]
x[is.na(x[["ovl_id"]]), ovl_keep := 1L]

if (isTRUE(verbose)) {
cli_alert_info("Identifing entities to {.strong keep}.")
}

for (i in seq_along(keep)) {
keep_i <- paste0("ovl_", keep[i])
x[x[, .I[which(get(keep_i) == TRUE)], by = ovl_id]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])]
x[x[, .I[which(get(keep_i) == TRUE)], by = "ovl_id"]$V1, c("ovl_keep", "ovl_by") := list(i, keep[i])]
}

if (!is.null(omit)) {
Expand All @@ -577,7 +577,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {

for (i in seq_along(omit)) {
omit_i <- paste0("ovl_", omit[i])
x[x[, .I[which(get(omit_i) == TRUE)], by = ovl_id]$V1, ovl_keep := -1L]
x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, ovl_keep := -1L]
}
}

Expand Down Expand Up @@ -616,10 +616,10 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
)
}

x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = ovl_id]
x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = ovl_id][["V1"]], ]

ovl_unique_after <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]]))
ovl_unique_after <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]]))

if (isTRUE(verbose)) {
cli::cli_alert_info(
Expand Down
17 changes: 13 additions & 4 deletions R/segment.R
Expand Up @@ -29,13 +29,14 @@ segment <- function(x, max_len = 7900L, overlap = 500L){

df[["esc"]] <- curl::curl_escape(df[["src"]])
df[["begin_esc"]] <- cumsum(c(1L, (nchar(df$esc) + 3L)[1L:(nrow(df) - 1L)]))
df[["end_esc"]] <- df[["begin_esc"]] + nchar(df[["esc"]])

# The total number of characters of the escaped string is the beginning of
# the last offset plus the nchar of the last token
nchar_esc <- df$begin_esc[nrow(df)] + nchar(df$esc[nrow(df)]) - 1L

# based on paper & pencil math
n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap))
n_segments <- ceiling((nchar_esc - overlap) / (max_len - overlap)) + 2

if (n_segments > 1){
half <- floor(max_len / 2)
Expand All @@ -57,13 +58,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){
from <- if (i == 1L){
1L
} else {
max(which(df[["begin_esc"]] <= (anchors[i] - half)))
min(which(df[["begin_esc"]] > (anchors[i] - half)))
}

to <- if (i == length(anchors)){
nrow(df)
} else {
min(which(df[["begin_esc"]] >= (anchors[i] + half)))
max(which(df[["end_esc"]] < (anchors[i] + half)))
}
df[from:to,]
}
Expand All @@ -75,5 +76,13 @@ segment <- function(x, max_len = 7900L, overlap = 500L){
segments <- list(x)
names(segments) <- as.character(1)
}
as.character(segments)

nchar_seg_esc <- nchar(
unlist(lapply(lapply(y, `[[`, "esc"), paste, collapse = "%20"))
)

if (any(nchar_seg_esc > max_len))
cli_alert_warning("segments exceed `max_len`")

unlist(segments)
}

0 comments on commit 7371b00

Please sign in to comment.