Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
method
entity_types_map()
drafted #27
- Loading branch information
Andreas Blätte
authored and
Andreas Blätte
committed
Feb 27, 2024
1 parent
24c07f6
commit 8eae01e
Showing
3 changed files
with
120 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
#' Map types returned by DBpedia Spotlight to a limited set of classes | ||
#' | ||
#' This function takes the output of `get_dbpedia_uris()` and compares values in | ||
#' the `types` column with a named character vector. The main purpose of this | ||
#' function is to reduce the number of types to a limited set of classes. | ||
#' | ||
#' @param x A `data.table` with DBpedia URIs. | ||
#' @param mapping_vector A `named character vector` with desired class names (as | ||
#' names) and types from the DBpedia ontology as values. For example: | ||
#' c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and | ||
#' type. | ||
#' @param other a `character vector` with the name of the class of all types not | ||
#' matched by the `mapping_vector`. | ||
#' @param verbose A `logical` value - whether to display messages. | ||
#' @importFrom data.table is.data.table | ||
#' @importFrom cli format_error cli_alert_info | ||
#' @details If there is more than one match between the retrieved types and the | ||
#' `mapping vector`, unique classes are sorted alphabetically and collapsed. | ||
#' @return Function adds classes to input data.table by reference. | ||
#' @exportMethod generic | ||
#' @rdname entity_types_map | ||
setGeneric("entity_types_map", function(x, ...) | ||
standardGeneric("entity_types_map")) | ||
|
||
|
||
#' @rdname entity_types_map | ||
#' @examples | ||
#' inaugural_paragraphs <- data_corpus_inaugural %>% | ||
#' corpus_subset(Year == 2009) %>% # limit to Barack Obama 2009 | ||
#' corpus_reshape(to = "paragraphs") | ||
#' | ||
#' uritab_paragraphs <- get_dbpedia_uris( | ||
#' x = inaugural_paragraphs, | ||
#' language = "en", | ||
#' max_len = 5600L, | ||
#' confidence = 0.5, | ||
#' api = "http://api.dbpedia-spotlight.org/en/annotate", | ||
#' verbose = FALSE, | ||
#' progress = TRUE | ||
#' ) | ||
#' | ||
#' mapping_vector = c( | ||
#' "PERSON" = "DBpedia:Person", | ||
#' "ORGANIZATION" = "DBpedia:Organisation", | ||
#' "LOCATION" = "DBpedia:Place" | ||
#' ) | ||
#' | ||
#' entity_types_map( | ||
#' uritab_paragraphs[["types"]], | ||
#' mapping_vector = mapping_vector | ||
#' ) | ||
setMethod( | ||
"entity_types_map", "list", | ||
function(x, mapping_vector, other = "MISC", verbose = TRUE | ||
) { | ||
|
||
# there is not a check yet whether the vector is named | ||
if (!is.character(mapping_vector)) { | ||
stop(format_error(c( | ||
"{.var mapping_vector} is no character vector.", | ||
"i" = "The {.var mapping_vector} must be a named character vector." | ||
))) | ||
} | ||
|
||
if (!is.character(other) | length(other) > 1) | ||
stop(format_error("{.var other} not character vector of length {.val 1}.")) | ||
|
||
lapply( | ||
x, | ||
function(el){ | ||
# types is a list of lists. Transform to single character vector. | ||
type_list <- unlist(el, recursive = FALSE) | ||
|
||
# An unintended consequence here is that you may get DBpedia1, DBpedia2, ... | ||
|
||
types_with_class_raw <- lapply( | ||
seq_along(type_list), | ||
function(i) { | ||
list_name <- names(type_list)[[i]] | ||
list_elements <- type_list[[i]] | ||
paste0(list_name, ":", list_elements) | ||
}) | ||
types_with_class <- intersect(unlist(types_with_class_raw), mapping_vector) | ||
|
||
if (length(types_with_class) > 0L) { | ||
match_idx <- which(mapping_vector %in% types_with_class) | ||
|
||
class_name <- paste( | ||
sort(unique(names(mapping_vector)[match_idx])), | ||
collapse = "|" | ||
) | ||
} else { | ||
class_name <- other | ||
} | ||
} | ||
) | ||
} | ||
) | ||
|
||
#' @rdname entity_types_map | ||
setMethod( | ||
"entity_types_map", "data.table", | ||
function(x, mapping_vector, other = "MISC", verbose = TRUE) { | ||
|
||
if (!"types" %in% colnames(x)) { | ||
stop(format_error(c( | ||
"There is no {.var types} column in the input data.table.", | ||
"i" = "Types are returned by {.fn get_dbpedia_uri} only if the argument `types` is set to TRUE." | ||
))) | ||
} | ||
|
||
if (verbose) | ||
cli_alert_info( | ||
"mapping values in column {.var types} to new column {.var class}" | ||
) | ||
|
||
x[, class := entity_types_map(x = x[["types"]])] | ||
x | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters