addressing NOTES, more robust tests and retry

PolMine · Apr 11, 2024 · a8ca6e4 · a8ca6e4
1 parent 7371b00
commit a8ca6e4
Show file tree

Hide file tree

Showing 8 changed files with 145 additions and 142 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -63,6 +63,7 @@ importFrom(stats,setNames)
 importFrom(stringi,stri_c)
 importFrom(tibble,as_tibble)
 importFrom(utils,URLencode)
+importFrom(utils,capture.output)
 importFrom(xml2,read_xml)
 importFrom(xml2,xml_attr)
 importFrom(xml2,xml_children)

diff --git a/R/dbpedia.R b/R/dbpedia.R
@@ -365,7 +365,7 @@ setMethod(
     offset = 1L,
     confidence = 0.35,
     api = getOption("dbpedia.endpoint"),
-    retry = TRUE,
+    retry = 0L,
     logfile = NULL,
     types = character(),
     support = 20,
@@ -434,7 +434,6 @@ setMethod(
     )
 
     if (verbose) cli_progress_step("send request to DBpedia Spotlight")
-    request_max <- if (is.logical(retry)) as.integer(retry) else retry
     request_number <- 1L
     proceed <- TRUE
 
@@ -465,7 +464,7 @@ setMethod(
 
       if (httr::http_error(request)) {
         cli_alert_danger("http error response")
-        if (request_number <= request_max){
+        if (request_number <= retry){
           if (!is.null(logfile)){
             cat(x, file = logfile, append = TRUE)
             cat("\n", file = logfile, append = TRUE)
@@ -641,7 +640,8 @@ setMethod(
 #' @param confidence A `numeric` value, the minimum similarity score that serves
 #'   as threshold before DBpedia Spotlight includes a link into the report.
 #' @param api An URL of the DBpedia Spotlight API.
-#' @param retry A `logical` value, whether to retry in case of a http error.
+#' @param retry A `numeric` value, the number of times to retry in case of a http
+#'   error.
 #' @param logfile Filename for writing logs (e.g. for debugging purposes).
 #' @param types A `character` vector to restrict result returned to certain
 #'   entity types, such as 'Company' or 'Organization'. If the `character`

diff --git a/R/entity_types.R b/R/entity_types.R
@@ -115,6 +115,6 @@ setMethod(
       "mapping values in column {.var types} to new column {.var category}"
     )
 
-  x[, category := entity_types_map(x = x[["types"]], mapping_vector = mapping_vector, other = other, verbose = verbose)]
+  x[, "category" := entity_types_map(x = x[["types"]], mapping_vector = mapping_vector, other = other, verbose = verbose)]
   x
 })
diff --git a/R/overlaps.R b/R/overlaps.R
@@ -59,7 +59,7 @@ detect_overlap <- function(x,
       "Argument {.var end} is NULL. Setting {.var end} to {.var start_col + nchar}.
       This can be wrong in case of CWB corpora."
     )
-    x[, end := get(start_col) + nchar(text)]
+    x[, "end" := get(start_col) + nchar(x[["text"]])]
     end_col <- "end"
   }
 
@@ -69,7 +69,7 @@ detect_overlap <- function(x,
                                      start_col = start_col,
                                      end_col = end_col,
                                      verbose = verbose),
-      by = doc]
+      by = "doc"]
 
   } else {
 
@@ -94,7 +94,7 @@ detect_overlap_aux <- function(input_dt,
                      with = FALSE]
 
   # add temporary row idx for later join
-  ovl_dt[, row_idx := 1:nrow(ovl_dt)]
+  ovl_dt[, "row_idx" := 1:nrow(ovl_dt)]
 
   # set keys for the following foverlaps. Should be start and end.
   setkeyv(ovl_dt, c(start_col, end_col))
@@ -108,7 +108,8 @@ detect_overlap_aux <- function(input_dt,
   # data.tables), subset by those in which the row idx in x is smaller than in
   # y.
 
-  overlaps_out <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)[xid < yid]
+  overlaps_out_all <- foverlaps(ovl_dt, ovl_dt, type = "any", which = TRUE)
+  overlaps_out <- overlaps_out_all[overlaps_out_all[["xid"]] < overlaps_out_all[["yid"]]]
 
   if (nrow(overlaps_out) == 0) {
 
@@ -124,7 +125,7 @@ detect_overlap_aux <- function(input_dt,
     # the same overlap. See which intersect.
 
     if (nrow(overlaps_out) == 1) {
-      overlaps_out[, overlap_group_idx := 1]
+      overlaps_out[, "overlap_group_idx" := 1]
     } else {
       idx_ranges <- lapply(1:nrow(overlaps_out), function(i) {
         overlaps_out[i, ][["xid"]]:overlaps_out[i, ][["yid"]]
@@ -165,16 +166,16 @@ detect_overlap_aux <- function(input_dt,
 
       # overlapping entities should now all have the same region.
       overlap_groups_unique <- unique(overlap_groups)
-      overlaps_out[, overlap_group_idx := which(sapply(overlap_groups_unique, function(x) .I %in% x)), by = .I]
+      overlaps_out[, "overlap_group_idx" := which(sapply(overlap_groups_unique, function(x) .I %in% x)), by = .I]
     }
 
     # add an ID for individual overlaps
     if (!is.null(group_id)) {
-      overlaps_out[, overlap_id := sprintf("ovl_%s_%s",
+      overlaps_out[, "overlap_id" := sprintf("ovl_%s_%s",
                                            group_id,
-                                           overlap_group_idx)]
+                                           overlaps_out[["overlap_group_idx"]])]
     } else {
-      overlaps_out[, overlap_id := sprintf("ovl_%s", overlap_group_idx)]
+      overlaps_out[, "overlap_id" := sprintf("ovl_%s", overlaps_out[["overlap_group_idx"]])]
     }
 
     # and make from wide to long table for join
@@ -183,7 +184,7 @@ detect_overlap_aux <- function(input_dt,
                               measure.vars = c("xid", "yid"),
                               value.name = "row_idx")
 
-    overlaps_out_long[, variable := NULL]
+    overlaps_out_long[, "variable" := NULL]
 
     if (isTRUE(verbose)) {
       cli_alert_info(
@@ -192,7 +193,7 @@ detect_overlap_aux <- function(input_dt,
     }
 
     # merge to input
-    ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := i.overlap_id]
+    ovl_dt[overlaps_out_long, on = "row_idx", "ovl_id" := overlaps_out_long[["overlap_id"]]]
 
     retval <- ovl_dt[["ovl_id"]]
   }
@@ -362,7 +363,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
       add_ents_dt <- rbind(overlaps_outer_dt, overlaps_inner_dt)
 
       if (all(is.na(add_ents_dt[["doc"]]))) {
-        add_ents_dt[, doc := NULL]
+        add_ents_dt[, "doc" := NULL]
       }
 
       if (!start_col %in% colnames(add_ents_dt)) {
@@ -381,7 +382,7 @@ categorize_overlap <- function(x, start_col, end_col, experimental = FALSE, corp
   cols <- c("ovl_longest", "ovl_shortest", "ovl_inner", "ovl_outer",
             "ovl_partial", "ovl_multiple", "ovl_distinct", "ovl_undetermined")
 
-  x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(x) grepl(pattern = x, x[["ovl_type"]])), by = .I]
+  x[!is.na(x[["ovl_id"]]), (cols) := lapply(cols, function(p) grepl(pattern = p, x[.I, ][["ovl_type"]])), by = .I]
 
   # after this, the ovl_type column is not needed anymore.
   x[, "ovl_type" := NULL]
@@ -439,7 +440,7 @@ get_outer_inner_ovl_aux = function(.SD, start_col, end_col, verbose = verbose) {
     # check if there is a row in which the short index is in x and the long
     # index is in y. This suggests that these are totally within each.
 
-    inner_idx <- overlap_dt[xid > yid][["xid"]]
+    inner_idx <- overlap_dt[overlap_dt[["xid"]] > overlap_dt[["yid"]]][["xid"]]
 
     if (length(inner_idx) > 0) {
       overlap_types[inner_idx] <- "ovl_inner"
@@ -449,7 +450,7 @@ get_outer_inner_ovl_aux = function(.SD, start_col, end_col, verbose = verbose) {
       # entities)
     }
 
-    outer_idx <- overlap_dt[xid > yid][["yid"]]
+    outer_idx <- overlap_dt[overlap_dt[["xid"]] > overlap_dt[["yid"]]][["yid"]]
 
     if (length(outer_idx) > 0) {
       overlap_types[outer_idx] <- "ovl_outer"
@@ -555,10 +556,10 @@ get_combined_text = function(.SD, start_col, end_col, segment, corpus = NULL) {
 #' @export
 resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
 
-  ovl_unique_before <- length(unique(x[!is.na(ovl_id), ][["ovl_id"]])) # number of overlaps
+  ovl_unique_before <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]])) # number of overlaps
 
   # first, keep all non-overlapping entities
-  x[is.na(x[["ovl_id"]]), ovl_keep := 1L]
+  x[is.na(x[["ovl_id"]]), "ovl_keep" := 1L]
 
   if (isTRUE(verbose)) {
     cli_alert_info("Identifing entities to {.strong keep}.")
@@ -577,7 +578,7 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
 
     for (i in seq_along(omit)) {
       omit_i <- paste0("ovl_", omit[i])
-      x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, ovl_keep := -1L]
+      x[x[, .I[which(get(omit_i) == TRUE)], by = "ovl_id"]$V1, "ovl_keep" := -1L]
     }
   }
 
@@ -616,8 +617,8 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
     )
   }
 
-  x[!is.na(ovl_id), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
-  x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = ovl_id][["V1"]], ]
+  x[!is.na(x[["ovl_id"]]), c("ovl_keep", "ovl_by") := tiebreak_fun(.SD, tiebreak_mode = tiebreak), by = "ovl_id"]
+  x <- x[x[, .I[which(.SD[["ovl_keep"]] > 0 & .SD[["ovl_keep"]] == min(.SD[["ovl_keep"]], na.rm = TRUE))], by = "ovl_id"][["V1"]], ]
 
   ovl_unique_after <- length(unique(x[!is.na(x[["ovl_id"]]), ][["ovl_id"]]))
 
@@ -627,10 +628,10 @@ resolve_overlap = function(x, keep, omit = NULL, tiebreak, verbose = TRUE) {
     )
   }
 
-  x[, ovl_keep := NULL]
+  x[, "ovl_keep" := NULL]
 
   # as a result, all overlap IDs should only occur once
-  stopifnot(all(table(x$ovl_id) == 1))
+  stopifnot(all(table(x[["ovl_id"]]) == 1))
 
   return(x)
 }
diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd
diff --git a/tests/testthat/test-categorize_overlaps.R b/tests/testthat/test-categorize_overlaps.R
@@ -0,0 +1,102 @@
+test_that(
+  "categorize_overlap() categorizes entity overlaps based on character vectors",
+  {
+
+    doc <- "Crude oil prices on the rise."
+
+    x <- get_dbpedia_uris(
+      x = doc,
+      max_len = 5600L,
+      confidence = 0.35,
+      api = "https://api.dbpedia-spotlight.org/en/annotate",
+      language = "en",
+      types = character(),
+      support = 20,
+      types_src = c("DBpedia", "Wikidata"),
+      verbose = TRUE
+    ) |>
+      detect_overlap(start_col = "start", verbose = TRUE)
+
+    y <- categorize_overlap(x,
+                            start_col = "start",
+                            end_col = "end",
+                            experimental = TRUE,
+                            verbose = TRUE)
+
+    # expect four rows
+    expect_equal(nrow(y), 4L)
+
+    # expect specific entity texts
+    expect_contains(y[["text"]], c("Crude oil", "Crude oil prices", "oil", "oil prices"))
+
+    # expect specific values in specific row
+    example_row <- y[text == "Crude oil prices"]
+
+    # concatenated uris
+    expect_equal(example_row[["dbpedia_uri"]],
+                 "http://dbpedia.org/resource/Petroleum|http://dbpedia.org/resource/Price_of_oil")
+
+    expect_equal(
+      unname(
+        unlist(
+          example_row[, ovl_longest:ovl_undetermined]
+        )
+      ),
+      c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE)
+    )
+  }
+)
+
+test_that(
+  "categorize_overlap() categorizes entity overlaps based on CWB corpora",
+  {
+
+    withr::local_package("polmineR")
+
+    use("RcppCWB")
+
+    reuters_anno <- corpus("REUTERS") |>
+      polmineR::subset(id == "353") |>
+      get_dbpedia_uris(
+        max_len = 5600L,
+        confidence = 0.35,
+        api = "https://api.dbpedia-spotlight.org/en/annotate",
+        language = "en",
+        types = character(),
+        support = 20,
+        verbose = TRUE
+      ) |>
+      detect_overlap(start_col = "cpos_left", end_col = "cpos_right", verbose = TRUE)
+
+    y <- categorize_overlap(reuters_anno,
+                            start_col = "cpos_left",
+                            end_col = "cpos_right",
+                            experimental = TRUE,
+                            corpus = "REUTERS",
+                            verbose = TRUE)
+
+    ymin <- y[!is.na(ovl_id)]
+
+    # expect four rows
+    expect_equal(nrow(ymin), 4L)
+
+    # expect specific entity texts
+    expect_contains(ymin[["text"]], c("Crude oil", "Crude oil prices fell", "oil prices fell", "oil"))
+
+    # expect specific values in specific row
+    example_row <- ymin[text == "Crude oil prices fell"]
+
+    # concatenated uris
+    expect_equal(example_row[["dbpedia_uri"]],
+                 "http://dbpedia.org/resource/West_Texas_Intermediate|http://dbpedia.org/resource/1980s_oil_glut")
+
+    expect_equal(
+      unname(
+        unlist(
+          example_row[, ovl_longest:ovl_undetermined]
+        )
+      ),
+      c(FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE)
+    )
+  }
+)