Skip to content

Commit

Permalink
addressing NOTES, more robust tests and retry
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristophLeonhardt committed Apr 11, 2024
1 parent 7371b00 commit a8ca6e4
Show file tree
Hide file tree
Showing 8 changed files with 145 additions and 142 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -63,6 +63,7 @@ importFrom(stats,setNames)
importFrom(stringi,stri_c)
importFrom(tibble,as_tibble)
importFrom(utils,URLencode)
importFrom(utils,capture.output)
importFrom(xml2,read_xml)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_children)
Expand Down
8 changes: 4 additions & 4 deletions R/dbpedia.R
Expand Up @@ -365,7 +365,7 @@ setMethod(
offset = 1L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
retry = TRUE,
retry = 0L,
logfile = NULL,
types = character(),
support = 20,
Expand Down Expand Up @@ -434,7 +434,6 @@ setMethod(
)

if (verbose) cli_progress_step("send request to DBpedia Spotlight")
request_max <- if (is.logical(retry)) as.integer(retry) else retry
request_number <- 1L
proceed <- TRUE

Expand Down Expand Up @@ -465,7 +464,7 @@ setMethod(

if (httr::http_error(request)) {
cli_alert_danger("http error response")
if (request_number <= request_max){
if (request_number <= retry){
if (!is.null(logfile)){
cat(x, file = logfile, append = TRUE)
cat("\n", file = logfile, append = TRUE)
Expand Down Expand Up @@ -641,7 +640,8 @@ setMethod(
#' @param confidence A `numeric` value, the minimum similarity score that serves
#' as threshold before DBpedia Spotlight includes a link into the report.
#' @param api An URL of the DBpedia Spotlight API.
#' @param retry A `logical` value, whether to retry in case of a http error.
#' @param retry A `numeric` value, the number of times to retry in case of a http
#' error.
#' @param logfile Filename for writing logs (e.g. for debugging purposes).
#' @param types A `character` vector to restrict result returned to certain
#' entity types, such as 'Company' or 'Organization'. If the `character`
Expand Down
2 changes: 1 addition & 1 deletion R/entity_types.R
Expand Up @@ -115,6 +115,6 @@ setMethod(
"mapping values in column {.var types} to new column {.var category}"
)

x[, category := entity_types_map(x = x[["types"]], mapping_vector = mapping_vector, other = other, verbose = verbose)]
x[, "category" := entity_types_map(x = x[["types"]], mapping_vector = mapping_vector, other = other, verbose = verbose)]
x
})
45 changes: 23 additions & 22 deletions R/overlaps.R
Expand Up @@ -59,7 +59,7 @@ detect_overlap <- function(x,
"Argument {.var end} is NULL. Setting {.var end} to {.var start_col + nchar}.
This can be wrong in case of CWB corpora."
)
x[, end := get(start_col) + nchar(text)]
x[, "end" := get(start_col) + nchar(x[["text"]])]
end_col <- "end"
}

Expand All @@ -69,7 +69,7 @@ detect_overlap <- function(x,
start_col = start_col,
end_col = end_col,
verbose = verbose),
by = doc]
by = "doc"]

} else {

Expand All @@ -94,7 +94,7 @@ detect_overlap_aux <- function(input_dt,
with = FALSE]

# add temporary row idx for later join
ovl_dt[, row_idx := 1:nrow(ovl_dt)]
ovl_dt[, "row_idx" := 1:nrow(ovl_dt)]

# set keys for the following foverlaps. Should be start and end.
setkeyv(ovl_dt, c(start_col, end_col))
Expand All @@ -108,7 +108,8 @@ detect_overlap_aux <- function(input_dt,
# data.tables), subset by those in which the row idx in x is smaller than in
# y.

overlaps_out <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)[xid < yid]
overlaps_out_all <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)
overlaps_out <- overlaps_out_all[overlaps_out_all[["xid"]] < overlaps_out_all[["yid"]]]

if (nrow(overlaps_out) == 0) {

Expand All @@ -124,7 +125,7 @@ detect_overlap_aux <- function(input_dt,
# the same overlap. See which intersect.

if (nrow(overlaps_out) == 1) {
overlaps_out[, overlap_group_idx := 1]
overlaps_out[, "overlap_group_idx" := 1]
} else {
idx_ranges <- lapply(1:nrow(overlaps_out), function(i) {
overlaps_out[i, ][["xid"]]:overlaps_out[i, ][["yid"]]
Expand Down Expand Up @@ -165,16 +166,16 @@ detect_overlap_aux <- function(input_dt,

# overlapping entities should now all have the same region.
overlap_groups_unique <- unique(overlap_groups)
overlaps_out[, overlap_group_idx := which(sapply(overlap_groups_unique, function(x) .I %in% x)), by = .I]
overlaps_out[, "overlap_group_idx" := which(sapply(overlap_groups_unique, function(x) .I %in% x)), by = .I]
}

# add an ID for individual overlaps
if (!is.null(group_id)) {
overlaps_out[, overlap_id := sprintf("ovl_%s_%s",
overlaps_out[, "overlap_id" := sprintf("ovl_%s_%s",
group_id,
overlap_group_idx)]
overlaps_out[["overlap_group_idx"]])]
} else {
overlaps_out[, overlap_id := sprintf("ovl_%s", overlap_group_idx)]
overlaps_out[, "overlap_id" := sprintf("ovl_%s", overlaps_out[["overlap_group_idx"]])]
}

# and make from wide to long table for join
Expand All @@ -183,7 +184,7 @@ detect_overlap_aux <- function(input_dt,
measure.vars = c("xid", "yid"),
value.name = "row_idx")

overlaps_out_long[, variable := NULL]
overlaps_out_long[, "variable" := NULL]

if (isTRUE(verbose)) {
cli_alert_info(
Expand All @@ -192,7 +193,7 @@ detect_overlap_aux <- function(input_dt,
}

# merge to input
ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := i.overlap_id]
ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := overlaps_out_long[["overlap_id"]]]

retval <- ovl_dt[["ovl_id"]]
}
Expand Down Expand Up @@ -362,7 +363,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
add_ents_dt <- rbind(overlaps_outer_dt, overlaps_inner_dt)

if (all(is.na(add_ents_dt[["doc"]]))) {
add_ents_dt[, doc := NULL]
add_ents_dt[, "doc" := NULL]
}

if (!start_col %in% colnames(add_ents_dt)) {
Expand All @@ -381,7 +382,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
cols <- c("ovl_longest", "ovl_shortest", "ovl_inner", "ovl_outer",
"ovl_partial", "ovl_multiple", "ovl_distinct", "ovl_undetermined")

x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(x) grepl(pattern = x, x[["ovl_type"]])), by = .I]
x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(p) grepl(pattern = p, x[.I, ][["ovl_type"]])), by = .I]

# after this, the ovl_type column is not needed anymore.
x[, "ovl_type" := NULL]
Expand Down Expand Up @@ -439,7 +440,7 @@ get_outer_inner_ovl_aux = function(.SD, start_col, end_col, verbose = verbose) {
# check if there is a row in which the short index is in x and the long
# index is in y. This suggests that these are totally within each.

inner_idx <- overlap_dt[xid > yid][["xid"]]
inner_idx <- overlap_dt[overlap_dt[["xid"]] > overlap_dt[["yid"]]][["xid"]]

if (length(inner_idx) > 0) {
overlap_types[inner_idx] <- "ovl_inner"
Expand All @@ -449,7 +450,7 @@ get_outer_inner_ovl_aux = function(.SD, start_col, end_col, verbose = verbose) {
# entities)
}

outer_idx <- overlap_dt[xid > yid][["yid"]]
outer_idx <- overlap_dt[overlap_dt[["xid"]] > overlap_dt[["yid"]]][["yid"]]

if (length(outer_idx) > 0) {
overlap_types[outer_idx] <- "ovl_outer"
Expand Down Expand Up @@ -555,10 +556,10 @@ get_combined_text = function(.SD, start_col, end_col, segment, corpus = NULL) {
#' @export
resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {

ovl_unique_before <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) # number of overlaps
ovl_unique_before <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]])) # number of overlaps

# first, keep all non-overlapping entities
x[is.na(x[["ovl_id"]]), ovl_keep := 1L]
x[is.na(x[["ovl_id"]]), "ovl_keep" := 1L]

if (isTRUE(verbose)) {
cli_alert_info("Identifing entities to {.strong keep}.")
Expand All @@ -577,7 +578,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {

for (i in seq_along(omit)) {
omit_i <- paste0("ovl_", omit[i])
x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, ovl_keep := -1L]
x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, "ovl_keep" := -1L]
}
}

Expand Down Expand Up @@ -616,8 +617,8 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
)
}

x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = ovl_id][["V1"]], ]
x[!is.na(x[["ovl_id"]]), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = "ovl_id"][["V1"]], ]

ovl_unique_after <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]]))

Expand All @@ -627,10 +628,10 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
)
}

x[, ovl_keep := NULL]
x[, "ovl_keep" := NULL]

# as a result, all overlap IDs should only occur once
stopifnot(all(table(x$ovl_id) == 1))
stopifnot(all(table(x[["ovl_id"]]) == 1))

return(x)
}
5 changes: 3 additions & 2 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

102 changes: 102 additions & 0 deletions tests/testthat/test-categorize_overlaps.R
@@ -0,0 +1,102 @@
test_that(
"categorize_overlap() categorizes entity overlaps based on character vectors",
{

doc <- "Crude oil prices on the rise."

x <- get_dbpedia_uris(
x = doc,
max_len = 5600L,
confidence = 0.35,
api = "https://api.dbpedia-spotlight.org/en/annotate",
language = "en",
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
verbose = TRUE
) |>
detect_overlap(start_col = "start", verbose = TRUE)

y <- categorize_overlap(x,
start_col = "start",
end_col = "end",
experimental = TRUE,
verbose = TRUE)

# expect four rows
expect_equal(nrow(y), 4L)

# expect specific entity texts
expect_contains(y[["text"]], c("Crude oil", "Crude oil prices", "oil", "oil prices"))

# expect specific values in specific row
example_row <- y[text == "Crude oil prices"]

# concatenated uris
expect_equal(example_row[["dbpedia_uri"]],
"http://dbpedia.org/resource/Petroleum|http://dbpedia.org/resource/Price_of_oil")

expect_equal(
unname(
unlist(
example_row[, ovl_longest:ovl_undetermined]
)
),
c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE)
)
}
)

test_that(
"categorize_overlap() categorizes entity overlaps based on CWB corpora",
{

withr::local_package("polmineR")

use("RcppCWB")

reuters_anno <- corpus("REUTERS") |>
polmineR::subset(id == "353") |>
get_dbpedia_uris(
max_len = 5600L,
confidence = 0.35,
api = "https://api.dbpedia-spotlight.org/en/annotate",
language = "en",
types = character(),
support = 20,
verbose = TRUE
) |>
detect_overlap(start_col = "cpos_left", end_col = "cpos_right", verbose = TRUE)

y <- categorize_overlap(reuters_anno,
start_col = "cpos_left",
end_col = "cpos_right",
experimental = TRUE,
corpus = "REUTERS",
verbose = TRUE)

ymin <- y[!is.na(ovl_id)]

# expect four rows
expect_equal(nrow(ymin), 4L)

# expect specific entity texts
expect_contains(ymin[["text"]], c("Crude oil", "Crude oil prices fell", "oil prices fell", "oil"))

# expect specific values in specific row
example_row <- ymin[text == "Crude oil prices fell"]

# concatenated uris
expect_equal(example_row[["dbpedia_uri"]],
"http://dbpedia.org/resource/West_Texas_Intermediate|http://dbpedia.org/resource/1980s_oil_glut")

expect_equal(
unname(
unlist(
example_row[, ovl_longest:ovl_undetermined]
)
),
c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE)
)
}
)

0 comments on commit a8ca6e4

Please sign in to comment.