Skip to content

Commit

Permalink
method entity_types_map() drafted #27
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Feb 27, 2024
1 parent 24c07f6 commit 8eae01e
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 82 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Expand Up @@ -49,6 +49,7 @@ Collate:
'wikidata.R'
'get_annotation_table.R'
'xml.R'
'entity_types.R'
'zzz.R'
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
Expand Down
119 changes: 119 additions & 0 deletions R/entity_types.R
@@ -0,0 +1,119 @@
#' Map types returned by DBpedia Spotlight to a limited set of classes
#'
#' This function takes the output of `get_dbpedia_uris()` and compares values in
#' the `types` column with a named character vector. The main purpose of this
#' function is to reduce the number of types to a limited set of classes.
#'
#' @param x A `data.table` with DBpedia URIs.
#' @param mapping_vector A `named character vector` with desired class names (as
#' names) and types from the DBpedia ontology as values. For example:
#' c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and
#' type.
#' @param other a `character vector` with the name of the class of all types not
#' matched by the `mapping_vector`.
#' @param verbose A `logical` value - whether to display messages.
#' @importFrom data.table is.data.table
#' @importFrom cli format_error cli_alert_info
#' @details If there is more than one match between the retrieved types and the
#' `mapping vector`, unique classes are sorted alphabetically and collapsed.
#' @return Function adds classes to input data.table by reference.
#' @exportMethod generic
#' @rdname entity_types_map
setGeneric("entity_types_map", function(x, ...)
standardGeneric("entity_types_map"))


#' @rdname entity_types_map
#' @examples
#' inaugural_paragraphs <- data_corpus_inaugural %>%
#' corpus_subset(Year == 2009) %>% # limit to Barack Obama 2009
#' corpus_reshape(to = "paragraphs")
#'
#' uritab_paragraphs <- get_dbpedia_uris(
#' x = inaugural_paragraphs,
#' language = "en",
#' max_len = 5600L,
#' confidence = 0.5,
#' api = "http://api.dbpedia-spotlight.org/en/annotate",
#' verbose = FALSE,
#' progress = TRUE
#' )
#'
#' mapping_vector = c(
#' "PERSON" = "DBpedia:Person",
#' "ORGANIZATION" = "DBpedia:Organisation",
#' "LOCATION" = "DBpedia:Place"
#' )
#'
#' entity_types_map(
#' uritab_paragraphs[["types"]],
#' mapping_vector = mapping_vector
#' )
setMethod(
"entity_types_map", "list",
function(x, mapping_vector, other = "MISC", verbose = TRUE
) {

# there is not a check yet whether the vector is named
if (!is.character(mapping_vector)) {
stop(format_error(c(
"{.var mapping_vector} is no character vector.",
"i" = "The {.var mapping_vector} must be a named character vector."
)))
}

if (!is.character(other) | length(other) > 1)
stop(format_error("{.var other} not character vector of length {.val 1}."))

lapply(
x,
function(el){
# types is a list of lists. Transform to single character vector.
type_list <- unlist(el, recursive = FALSE)

# An unintended consequence here is that you may get DBpedia1, DBpedia2, ...

types_with_class_raw <- lapply(
seq_along(type_list),
function(i) {
list_name <- names(type_list)[[i]]
list_elements <- type_list[[i]]
paste0(list_name, ":", list_elements)
})
types_with_class <- intersect(unlist(types_with_class_raw), mapping_vector)

if (length(types_with_class) > 0L) {
match_idx <- which(mapping_vector %in% types_with_class)

class_name <- paste(
sort(unique(names(mapping_vector)[match_idx])),
collapse = "|"
)
} else {
class_name <- other
}
}
)
}
)

#' @rdname entity_types_map
setMethod(
"entity_types_map", "data.table",
function(x, mapping_vector, other = "MISC", verbose = TRUE) {

if (!"types" %in% colnames(x)) {
stop(format_error(c(
"There is no {.var types} column in the input data.table.",
"i" = "Types are returned by {.fn get_dbpedia_uri} only if the argument `types` is set to TRUE."
)))
}

if (verbose)
cli_alert_info(
"mapping values in column {.var types} to new column {.var class}"
)

x[, class := entity_types_map(x = x[["types"]])]
x
}
82 changes: 0 additions & 82 deletions R/utils.R
Expand Up @@ -170,85 +170,3 @@ unique_msg <- function(x, verbose = TRUE){
cli_alert_info("{.val {length(y)}} unique values to process")
y
}


#' Map types returned by DBpedia Spotlight to a limited set of classes
#'
#' This function takes the output of `get_dbpedia_uris()` and compares values in
#' the `types` column with a named character vector. The main purpose of this
#' function is to reduce the number of types to a limited set of classes.
#'
#' @param x A `data.table` with DBpedia URIs.
#' @param mapping_vector A `named character vector` with desired class names (as
#' names) and types from the DBpedia ontology as values. For example:
#' c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and
#' type.
#' @param other a `character vector` with the name of the class of all types not
#' matched by the `mapping_vector`.
#' @param verbose A `logical` value - whether to display messages.
#' @importFrom data.table is.data.table
#' @importFrom cli format_error cli_alert_info
#' @details If there is more than one match between the retrieved types and the
#' `mapping vector`, unique classes are sorted alphabetically and collapsed.
#' @return Function adds classes to input data.table by reference.
#' @export
map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE) {

if (!is.data.table(x))
stop(format_error("input {.var x} is no data.table."))

if (!is.character(mapping_vector)) {
stop(format_error(c(
"{.var mapping_vector} is no character vector.",
"i" = "The {.var mapping_vector} must be a named character vector."
)))
}

if (!is.character(other) | length(other) > 1) {

stop(format_error("{.var other} not character vector of length {.val 1}."))
}

if (!"types" %in% colnames(x)) {

stop(format_error(c(
"There is no {.var types} column in the input data.table.",
"i" = "Types are returned by {.fn get_dbpedia_uri} only if the argument `types` is set to TRUE."
)))
}

types_to_class_fun <- function(types) {

# types is a list of lists. Transform to single character vector.
type_list <- unlist(types, recursive = FALSE)

types_with_class_raw <- lapply(seq_along(type_list), function(i) {
list_name <- names(type_list)[[i]]
list_elements <- type_list[[i]]
paste0(list_name, ":", list_elements)
})
types_with_class <- intersect(unlist(types_with_class_raw), mapping_vector)

if (length(types_with_class) > 0L) {
match_idx <- which(mapping_vector %in% types_with_class)

class_name <- paste(
sort(unique(names(mapping_vector)[match_idx])),
collapse = "|"
)

} else {
class_name <- other
}

return(class_name)
}

if (verbose)
cli_alert_info(
"mapping values in column {.var types} to new column {.var class}"
)

x[, class := types_to_class_fun(types = x[["types"]]), by = 1:nrow(x)]

}

0 comments on commit 8eae01e

Please sign in to comment.