start working on #26
ChristophLeonhardt committed Feb 6, 2024
1 parent fc0e220 commit ef39573
Showing 4 changed files with 83 additions and 37 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -2,7 +2,7 @@ Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.1.9008
Date: 2024-01-26
Date: 2024-02-05
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
5 changes: 5 additions & 0 deletions NEWS.md
@@ -10,6 +10,11 @@ limited set of classes.
and in tandem with `map_types_to_class()` by avoiding hard-coding the column name
"ne_type". Also added a color code to entities which are not within "PERSON",
"LOCATION", "ORGANIZATION" and "MISC".
* started to implement the `expand_to_token` argument which ultimately should
resolve mismatches between DBpedia Spotlight's entity spans and CWB token spans
(issue #26)
* added a `drop_inexact_annotations` argument to `get_dbpedia_uris()` to control
whether inexact annotations are kept in or dropped from the output data.table
(see issue #26 and the usage sketch below)
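
A minimal usage sketch of the two new arguments, mirroring the REUTERS example
from the package documentation further down in this commit. It assumes the
`dbpedia` package is loaded and that a DBpedia Spotlight endpoint is reachable
via `getOption("dbpedia.endpoint")`; the values shown override the defaults
(`expand_to_token = FALSE`, `drop_inexact_annotations = TRUE`).

```r
library(polmineR)
library(dbpedia)
use("RcppCWB")

# expand entity spans to token boundaries where they do not align exactly,
# and keep any remaining inexact annotations in the output
uritab <- corpus("REUTERS") %>%
  subset(id == "127") %>%
  get_dbpedia_uris(
    language = "en",
    p_attribute = "word",
    expand_to_token = TRUE,           # experimental, see issue #26
    drop_inexact_annotations = FALSE  # keep annotations that cannot be mapped exactly
  )
```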

## dbpedia v0.1.1.9006

76 changes: 51 additions & 25 deletions R/dbpedia.R
@@ -201,35 +201,43 @@ setMethod("get_dbpedia_uris", "AnnotatedPlainTextDocument", function(x, language


#' Get DBpedia links.
#'
#'
#' @param x A `subcorpus` object. Will be coerced to
#' 'AnnotatedPlainTextDocument' from NLP package.
#' @param max_len An `integer` value. The text passed to DBpedia Spotlight may
#' not exceed a defined length. If it does, an HTTP error results. The known
#' threshold of 5680 characters is the default value.
#' @param language The language of the input text ("en", "fr", "de", ...) to
#' determine the stopwords used.
#' threshold of 5600 characters is the default value.
#' @param language The language of the input text ("en", "fr", "de", ...) to
#' determine the stopwords used.
#' @param confidence A `numeric` value, the minimum similarity score that serves
#' as threshold before DBpedia Spotlight includes a link into the report.
#' @param api A URL of the DBpedia Spotlight API.
#' @param verbose A `logical` value - whether to display messages.
#' @param progress A `logical` value - whether to show progress.
#' @param s_attribute A length-one `character` vector indicating an s-attribute.
#' DBpedia URIs will be mapped on this s-attribute. Only regions covered by
#' this s-attribute will be kept. If missing, URIs will be
#' mapped on the token stream, and all URIs suggested will be kept.
#' DBpedia URIs will be mapped on this s-attribute. Only regions covered by
#' this s-attribute will be kept. If missing, URIs will be mapped on the token
#' stream, and all URIs suggested will be kept.
#' @param p_attribute The p-attribute used for decoding a `subcorpus` object.
#' @param types A `logical` value - whether to return all types provided by
#' DBpedia Spotlight.
#' @param return_types A `logical` value - whether to return all types provided
#' by DBpedia Spotlight.
#' @param expand_to_token A `logical` value - whether the character offsets of
#' DBpedia Spotlight are mapped to the token boundaries of the `subcorpus` if
#' the two differ. Also see the explanation in `details`. Defaults to FALSE.
#' @param drop_inexact_annotations A `logical` value - whether to drop entity
#' annotations when entity spans and token spans do not align exactly.
#' @param ... Further arguments.
#' @details `expand_to_token` is a rather experimental feature that resolves
#' mismatches between entity spans and token spans by expanding the former to
#' the last character position of the corresponding token. See issue #26 in
#' the `dbpedia` GitHub repository and the illustrative sketch following the
#' method definition below.
#' @exportMethod get_dbpedia_uris
#' @importFrom cli cli_alert_warning cli_progress_step cli_alert_danger
#' cli_progress_done cli_alert_info
#' @importFrom polmineR punctuation
#' @importFrom jsonlite fromJSON
#' @importFrom httr GET http_error content
#' @importFrom data.table setnames `:=` setDT setcolorder
#' as.data.table
#' @importFrom data.table setnames `:=` setDT setcolorder as.data.table
#' @importFrom stats setNames
#' @importFrom grDevices heat.colors
#' @importFrom polmineR decode get_token_stream
@@ -241,19 +249,19 @@ setMethod("get_dbpedia_uris", "AnnotatedPlainTextDocument", function(x, language
#' @examples
#' library(polmineR)
#' use("RcppCWB")
#'
#' uritab <- corpus("REUTERS") %>%
#'
#' uritab <- corpus("REUTERS") %>%
#' subset(id == "127") %>%
#' get_dbpedia_uris(language = "en", p_attribute = "word")
#'
#'
#' use("GermaParl2")
#'
#' uritab2 <- corpus("GERMAPARL2MINI") %>%
#'
#' uritab2 <- corpus("GERMAPARL2MINI") %>%
#' subset(speaker_name == "Carlo Schmid") %>%
#' subset(p_type == "speech") %>%
#' subset(p_type == "speech") %>%
#' get_dbpedia_uris(language = "de", s_attribute = "ne", max_len = 5067)
#'
setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), types = FALSE, verbose = TRUE){
setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), return_types = FALSE, expand_to_token = FALSE, drop_inexact_annotations = TRUE, verbose = TRUE){

if (verbose) cli_progress_step("convert input to `AnnotatedPlainTextDocument`")
doc <- decode(
@@ -277,15 +285,25 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
api = api,
verbose = verbose
)


# helper to assign cpos_right: take the token whose end offset matches the
# entity span exactly; if there is no exact match and expand_to_token is TRUE,
# fall back to the first token ending beyond the entity span (see issue #26)
expand_fun = function(.SD) {
cpos_right <- dt[.SD[["end"]] == dt[["end"]]][["id"]]

if (length(cpos_right) == 0 & isTRUE(expand_to_token)) {
cpos_right <- dt[["id"]][which(dt[["end"]] > .SD[["end"]])[1]]
} else {
cpos_right
}
}

if (is.null(s_attribute)){
dt <- as.data.table(doc, what = s_attribute)
links[, "end" := links[["start"]] + nchar(links[["text"]]) - 1L]
tab <- links[,
list(
cpos_left = dt[.SD[["start"]] == dt[["start"]]][["id"]],
cpos_right = dt[.SD[["end"]] == dt[["end"]]][["id"]],
cpos_right = expand_fun(.SD),
dbpedia_uri = .SD[["dbpedia_uri"]],
text = .SD[["text"]],
types = .SD[["types"]]
@@ -346,9 +364,17 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
}

# return types (optionally)
if (isFALSE(types))
if (isFALSE(return_types))
tab[, types := NULL]

# drop entities from the output which cannot be mapped exactly to the
# tokenstream (see issue #26).
if (isTRUE(drop_inexact_annotations) & any(is.na(tab[["cpos_right"]]))) {
missing_cpos_idx <- which(is.na(tab[["cpos_right"]]))
cli_alert_warning("Cannot map {length(missing_cpos_idx)} entit{?y/ies} exactly to tokenstream. Dropping {?it/them} from the annotation.")
tab <- tab[-missing_cpos_idx, ]
}

tab
})
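
The expansion described in the `details` section above can be illustrated with a
small, self-contained sketch that mirrors the core logic of the `expand_fun()`
helper inside the method. The character offsets are invented for illustration;
the column names (`id`, `start`, `end`) follow the `dt` object created inside the
method.

```r
library(data.table)

# token-level character offsets, as in the data.table decoded from the subcorpus
dt <- data.table(
  id    = 0:3,                  # corpus positions (cpos)
  start = c(0L, 8L, 12L, 20L),  # first character offset of each token
  end   = c(6L, 10L, 18L, 24L)  # last character offset of each token
)

# an entity span reported by DBpedia Spotlight that ends at character 15,
# i.e. inside the token covering characters 12 to 18
entity_end <- 15L

# exact case: a token ends exactly where the entity span ends
cpos_right <- dt[end == entity_end][["id"]]

# mismatch: with expand_to_token = TRUE, fall back to the first token whose
# end offset lies beyond the entity span, i.e. expand to the token boundary
if (length(cpos_right) == 0L) {
  cpos_right <- dt[["id"]][which(dt[["end"]] > entity_end)[1]]
}

cpos_right  # 2: the entity is mapped onto the token covering characters 12 to 18
```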

@@ -359,7 +385,7 @@ setMethod("get_dbpedia_uris", "subcorpus", function(x, language = getOption("dbp
#' uritab <- corpus("REUTERS") %>%
#' split(s_attribute = "id", verbose = FALSE) %>%
#' get_dbpedia_uris(language = "en", p_attribute = "word", verbose = TRUE)
setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, types = FALSE, verbose = TRUE, progress = FALSE){
setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOption("dbpedia.lang"), p_attribute = "word", s_attribute = NULL, confidence = 0.35, api = getOption("dbpedia.endpoint"), max_len = 5600L, return_types = FALSE, verbose = TRUE, progress = FALSE){

if (progress){
env <- parent.frame()
@@ -377,7 +403,7 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti
max_len = max_len,
confidence = confidence,
api = api,
types = types,
return_types = return_types,
verbose = if (progress) FALSE else verbose
)
}
@@ -421,7 +447,7 @@ setMethod(
max_len = 5600L,
confidence = 0.35,
api = getOption("dbpedia.endpoint"),
types = FALSE,
return_types = FALSE,
verbose = TRUE,
progress = FALSE
){
@@ -465,7 +491,7 @@ setMethod(
setcolorder(retval, neworder = "doc")

# return types (optionally)
if (isFALSE(types))
if (isFALSE(return_types))
retval[, types := NULL]

retval
37 changes: 26 additions & 11 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default.
