Skip to content

Commit

Permalink
new fun dbpedia_lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed May 5, 2024
1 parent ff5bfdb commit 132e5a1
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 14 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: dbpedia
Type: Package
Title: R Wrapper for DBpedia Spotlight
Version: 0.1.2.9007
Date: 2024-05-04
Version: 0.1.2.9008
Date: 2024-05-05
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de", comment = c(ORCID = "0000-0001-8970-8010")),
person("Christoph", "Leonhardt", role = "aut")
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -5,6 +5,7 @@ export(as_annotation)
export(as_subcorpus)
export(categorize_overlap)
export(dbpedia_get_wikidata_uris)
export(dbpedia_lookup)
export(dbpedia_spotlight_status)
export(detect_overlap)
export(namespaced_xpath)
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
@@ -1,5 +1,9 @@
## dbpedia v0.1.2.9008
* `get_dbpedia_uris()`-methods have new argument `types_drop` to drop the 'types' column, which may result in nested data structures that cause errors with large data.
* New function `dbpedia_lookup()`.

## dbpedia v0.1.2.9007
* All `get_dbpedia_uris()`-methods have argument `types_src`. If it is used, the column "types" is removed #55.
* All `get_dbpedia_uris()`-methods have argument `types_src` #55.
* `sparql_query()` has three dots (...) option for passing arguments into
`httr::GET()` - specifying a timeout as intended usage #56.
* `dbpedia_get_wikidata_uris()` has three-dots-argument to pass timeout into `httr::GET()` via `sparql_query()` #57.
Expand Down
112 changes: 101 additions & 11 deletions R/dbpedia.R
Expand Up @@ -371,6 +371,7 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
types_drop = FALSE,
verbose = TRUE
) {

Expand Down Expand Up @@ -409,6 +410,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
verbose = verbose

)
Expand Down Expand Up @@ -594,9 +596,11 @@ setMethod(

resources_min[, (paste(src, "type", sep = "_")) := types_vec]
}
resources_min[, "types" := NULL]

}

if (isTRUE(types_drop)) resources_min[, "types" := NULL]

resources_min
}
)
Expand All @@ -620,6 +624,7 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
types_drop = FALSE,
verbose = TRUE
) {

Expand All @@ -636,6 +641,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
verbose = verbose
)
}
Expand Down Expand Up @@ -688,6 +694,10 @@ setMethod(
#' (e.g. "DBpedia_types") with entity types (`NA` if not available) will be
#' added to the table. Values are wrapped and separated by vertical bars.
#' `types_src` defaults to "DBpedia" and "Wikidata".
#' @param types_drop A `logical` value - whether to drop the "types" column with
#' lists of entity types in the knowledge bases. Dropping the column is
#' recommended for processing large data to avoid nested data structures and
#' errors. Defaults to `FALSE`.
#' @param verbose A `logical` value - whether to display messages.
#' @param progress A `logical` value - whether to show progress.
#' @param s_attribute A length-one `character` vector indicating a s-attribute.
Expand All @@ -710,8 +720,7 @@ setMethod(
#' If the request to the endpoint fails, `NULL` is returned.
#'
#' If argument `types_src` is specified, the information in the column 'types'
#' is dissolved into columns such as `types_DBpedia`, and the 'types'-column
#' is dropped.
#' is dissolved into columns such as `types_DBpedia`.
#' @exportMethod get_dbpedia_uris
#' @importFrom cli cli_alert_warning cli_progress_step cli_alert_danger
#' cli_progress_done cli_alert_info
Expand Down Expand Up @@ -763,6 +772,7 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
types_drop = FALSE,
expand_to_token = FALSE,
drop_inexact_annotations = TRUE,
verbose = TRUE
Expand All @@ -779,6 +789,7 @@ setMethod(

if (verbose)
cli_progress_step("convert input to `AnnotatedPlainTextDocument`")

doc <- decode(
x,
to = "AnnotatedPlainTextDocument",
Expand All @@ -790,6 +801,7 @@ setMethod(
),
verbose = FALSE
)

if (verbose) cli_progress_done()

links <- get_dbpedia_uris(
Expand All @@ -805,6 +817,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
verbose = verbose
)

Expand Down Expand Up @@ -844,8 +857,7 @@ setMethod(
by = c("start", "end"),
.SDcols = c("start", "end", "dbpedia_uri", "text", "types")
]
tab[, "start" := NULL]
tab[, "end" := NULL]
tab[, "start" := NULL][, "end" := NULL]

} else {

Expand Down Expand Up @@ -874,11 +886,8 @@ setMethod(
registry = x@registry_dir,
strucs = strucs
)
tab[["cpos_left"]] <- r[,1]
tab[["cpos_right"]] <- r[,2]
tab[["start"]] <- NULL
tab[["end"]] <- NULL
tab[["id"]] <- NULL
tab[, "cpos_left" := r[, 1]][, "cpos_right" := r[, 2]]
tab[, "start" := NULL][, "end" := NULL][, "id" := NULL]

setcolorder(
x = tab,
Expand Down Expand Up @@ -948,6 +957,8 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),

types_drop = FALSE,
max_len = 7990L,
overlap = 1000L,
expand_to_token = FALSE,
Expand Down Expand Up @@ -977,6 +988,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
expand_to_token = expand_to_token,
verbose = if (progress) FALSE else verbose
)
Expand Down Expand Up @@ -1026,6 +1038,7 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
types_drop = FALSE,
verbose = TRUE,
progress = FALSE
) {
Expand Down Expand Up @@ -1064,6 +1077,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
verbose = if (progress) FALSE else verbose
)[, "doc" := docname]
}
Expand Down Expand Up @@ -1106,6 +1120,7 @@ setMethod(
types = character(),
support = 20,
types_src = c("DBpedia", "Wikidata"),
types_drop = FALSE,
expand_to_token = FALSE,
drop_inexact_annotations = TRUE,
verbose = if (progress) FALSE else verbose,
Expand Down Expand Up @@ -1211,6 +1226,7 @@ setMethod(
types = types,
support = support,
types_src = types_src,
types_drop = types_drop,
verbose = verbose
)

Expand Down Expand Up @@ -1255,7 +1271,10 @@ setMethod(
tab[["id_left"]] <- NULL
tab[["id_right"]] <- NULL

setcolorder(x = tab, neworder = c("dbpedia_uri", "text", "types", "original_id"))
setcolorder(
x = tab,
neworder = c("dbpedia_uri", "text", "types", "original_id")
)

}

Expand All @@ -1282,6 +1301,77 @@ setMethod(
})


#' Use DBpedia Lookup service to get DBpedia URIs
#'
#' DBpedia Lookup is a service to match a query on a DBpedia URI. See the
#' explanation [here](https://www.dbpedia.org/resources/lookup/) and the
#' experimental GUI [here](https://lookup.dbpedia.org/index.html).
#'
#' If `query` has more than one element, the service is queried once per
#' element and the results are combined into one list.
#'
#' @param query A term/query to look up. May be a vector of several queries.
#' @param api The service to consult.
#' @param max_results The maximum number of results
#' @param progress A `logical` value, whether to show progress message. Only
#'   used if `query` has more than one element.
#' @return A named list of `character` vectors with the URIs found for each
#'   query (at most `max_results` per query); the names are the queries. For
#'   several queries, the element of a query that resulted in an HTTP error is
#'   `NA_character_`. For a single query that resulted in an HTTP error, a
#'   bare `NA_character_` is returned. A zero-length `query` yields an empty
#'   named list.
#' @importFrom httr GET http_error content
#' @importFrom xml2 read_xml xml_find_all xml_text
#' @importFrom cli cli_progress_bar cli_progress_update cli_progress_done
#' @export
#' @examples
#' uri <- dbpedia_lookup(query = "Berlin")
#' uris <- dbpedia_lookup(query = c("Berlin", "Paris", "London"))
dbpedia_lookup <- function(
    query,
    api = "https://lookup.dbpedia.org/api/search",
    max_results = 5,
    progress = TRUE
) {

  # Nothing to look up: return an empty named list instead of sending a
  # request without a search term.
  if (length(query) == 0L) return(setNames(list(), character()))

  if (length(query) > 1L) {

    # The progress bar is registered for this function's frame. Capture the
    # frame so that updates issued from within the closure below address it
    # explicitly rather than by counting frames on the call stack.
    bar_env <- environment()

    if (progress) {
      cli_progress_bar(
        "processing DBpedia Lookup queries",
        total = length(query),
        type = "tasks"
      )
    }

    uris <- lapply(
      seq_along(query),
      function(i) {
        if (progress) cli_progress_update(.envir = bar_env)
        res <- dbpedia_lookup(
          query = query[[i]],
          api = api,
          max_results = max_results,
          progress = FALSE
        )
        # A successful lookup is a one-element list holding the URIs, a
        # failed lookup is a bare `NA_character_`, which is kept as is.
        if (is.list(res)) res[[1L]] else res
      }
    )

    # Name results by the queries themselves: a failed lookup carries no
    # name, so names cannot be recovered from the individual results.
    retval <- setNames(uris, as.character(query))

    if (progress) cli_progress_done()

  } else {

    res <- GET(
      url = api,
      query = list(
        query = query,
        maxResults = max_results
      )
    )

    if (http_error(res)) return(NA_character_)

    txt <- content(res, as = "text", encoding = "UTF-8")
    xml <- read_xml(txt)
    uri_nodeset <- xml_find_all(xml, xpath = "/ArrayOfResults/Result/URI")
    retval <- setNames(list(xml_text(uri_nodeset)), query)
  }

  retval
}



#' Stopwords used by DBpedia Spotlight
#'
Expand Down
35 changes: 35 additions & 0 deletions man/dbpedia_lookup.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 132e5a1

Please sign in to comment.