Commit
pipes dropped for oldrel compatibility #38
Andreas Blätte authored and committed Feb 26, 2024
1 parent 12d01fc commit a52d1fb
Showing 7 changed files with 75 additions and 62 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -57,6 +57,7 @@ importFrom(tibble,as_tibble)
importFrom(utils,URLencode)
importFrom(xml2,read_xml)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_children)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_set_attrs)
importFrom(xml2,xml_text)
65 changes: 32 additions & 33 deletions R/dbpedia.R
@@ -155,6 +155,7 @@ as_annotation <- function(x){
#' @param feature_tag ...
#' @importFrom stringi stri_c
#' @importFrom NLP Annotation
#' @importFrom xml2 xml_children
to_annotation = function(nodes, xml, token_tags, feature_tag) {

if (inherits(nodes, "xml_nodeset")) {
@@ -169,8 +170,10 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) {

} else {

token_elements <- nodes |>
xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags))
token_elements <- xml2::xml_find_all(
nodes,
xpath = namespaced_xpath(xml = xml, tags = token_tags)
)

# make token annotation data

@@ -210,9 +213,10 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) {
# data.frame split to rows

token_feat_dataframe <- data.frame(word = toks, id = tok_ids)
token_feat_list <- split(token_feat_dataframe, seq(nrow(token_feat_dataframe))) |>
unname()

token_feat_list <- unname(
split(token_feat_dataframe, seq(nrow(token_feat_dataframe)))
)

token_annotation <- NLP::Annotation(
seq_along(tok_ids), # IDs must be integer, which is a bit unfortunate
rep("word", length(tok_ids)),
@@ -224,49 +228,45 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) {
# and add feature elements if chosen

if (!is.null(feature_tag)) {
feature_elements <- nodes |>
xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = feature_tag))
feature_elements <- xml2::xml_find_all(
nodes,
xpath = namespaced_xpath(xml = xml, tags = feature_tag)
)
} else {
feature_elements <- NULL
}

if (length(feature_elements) > 0) {


feature_ids <- sapply(feature_elements, function(element) {
xml2::xml_find_first(element,
xpath = namespaced_xpath(xml = xml, tags = token_tags)) |>
xml2::xml_attr("id")
}
)
feature_ids <- sapply(
feature_elements,
function(element) {
el <- xml2::xml_find_first(
element,
xpath = namespaced_xpath(xml = xml, tags = token_tags)
)
xml2::xml_attr(el, "id")
})

feature_ids <- sprintf("%s_%s", feature_ids, feature_tag)

# get attributes of features. The feature (<name>) element has no ID of its
# own, so the first word ID is used (assuming that there are no overlaps).
feature_kinds <- xml2::xml_attr(feature_elements, "type")
feature_texts <- sapply(feature_elements, function(feat) {
xml2::xml_children(feat) |>
xml2::xml_text() |>
paste(collapse = " ")
}
feature_texts <- sapply(
feature_elements,
function(feat) paste(xml_text(xml_children(feat)), collapse = " ")
)

# get spans for features

entity_spans <- sapply(feature_elements, function(element) {
child_id <- element |>
xml2::xml_children() |>
xml2::xml_attr("id")

entity_spans <- t(sapply(feature_elements, function(element) {
child_id <- xml_attr(xml_children(element), "id")
child_idx <- which(tok_ids %in% child_id)
child_start <- min(start_positions[child_idx])
child_end <- max(end_positions[child_idx])

matrix(c(child_start, child_end), nrow = 1, ncol = 2)

}
) |> t()
matrix(c(child_start, child_end), nrow = 1L, ncol = 2L)
}))


feature_annotation <- NLP::Annotation(
@@ -300,7 +300,7 @@ to_annotation = function(nodes, xml, token_tags, feature_tag) {

# make string
word_with_ws <- paste(toks, ifelse(is.na(tok_joins), " ", ""), sep = "")
s <- stringi::stri_c(word_with_ws, collapse = "") |> trimws()
s <- trimws(stringi::stri_c(word_with_ws, collapse = ""))

# add segment id as metadata (should work if segment is NULL as the TEI has
# an ID as well).
@@ -705,13 +705,12 @@ setMethod("get_dbpedia_uris", "subcorpus_bundle", function(x, language = getOpti
#'
#' # Process quanteda corpus
#' library(quanteda)
#' uritab <- data_char_ukimmig2010 |>
#' corpus() |>
#' uritab <- data_char_ukimmig2010 %>%
#' corpus() %>%
#' get_dbpedia_uris(
#' verbose = FALSE,
#' config = httr::config(http_version = 1.1)
#' )
#'
#' @rdname get_dbpedia_uris
setMethod(
"get_dbpedia_uris",
19 changes: 8 additions & 11 deletions R/utils.R
@@ -222,23 +222,20 @@ map_types_to_class <- function(x, mapping_vector, other = "MISC", verbose = TRUE
# types is a list of lists. Transform to a single character vector.
type_list <- unlist(types, recursive = FALSE)

types_with_class <- lapply(seq_along(type_list), function(i) {
types_with_class_raw <- lapply(seq_along(type_list), function(i) {
list_name <- names(type_list)[[i]]
list_elements <- type_list[[i]]
paste0(list_name, ":", list_elements)
}) |>
unlist() |>
intersect(mapping_vector)
})
types_with_class <- intersect(unlist(types_with_class_raw), mapping_vector)

if (length(types_with_class) > 0) {
if (length(types_with_class) > 0L) {
match_idx <- which(mapping_vector %in% types_with_class)

class_name <- mapping_vector |>
names() |>
_[match_idx] |>
unique() |>
sort() |>
paste(collapse = "|")
class_name <- paste(
sort(unique(names(mapping_vector)[match_idx])),
collapse = "|"
)

} else {
class_name <- other
15 changes: 11 additions & 4 deletions R/wikidata.R
@@ -263,10 +263,17 @@ setGeneric(
#'
#' httr::set_config(httr::config(ssl_verifypeer = 0L))
#'
#' uritab <- data_char_ukimmig2010 |>
#' corpus() |>
#' get_dbpedia_uris(progress = TRUE) %>%
#' add_wikidata_uris(endpoint = "https://dbpedia.org/sparql/", progress = TRUE, chunksize = 100) %>%
#' uritab <- data_char_ukimmig2010 %>%
#' corpus() %>%
#' get_dbpedia_uris(
#' progress = TRUE,
#' config = httr::config(http_version = 1.1)
#' ) %>%
#' add_wikidata_uris(
#' endpoint = "https://dbpedia.org/sparql/",
#' progress = TRUE,
#' chunksize = 100
#' ) %>%
#' wikidata_query(id = "P31")
#' }
#'
17 changes: 10 additions & 7 deletions R/xml.R
@@ -26,11 +26,12 @@ xml_enrich <- function(xml,
) {

# get all nodes which might contain entities
nodes <- xml |>
xml2::xml_find_all(xpath = namespaced_xpath(xml = xml, tags = token_tags))
nodes <- xml2::xml_find_all(
xml,
xpath = namespaced_xpath(xml = xml, tags = token_tags)
)

node_ids <- nodes |>
xml2::xml_attr("id")
node_ids <- xml2::xml_attr(nodes, "id")

# for each annotation, extract identified words

@@ -46,9 +47,11 @@ xml_enrich <- function(xml,
# if there is no feature tag, pre-annotated named entities weren't
# provided. Add identified named entities to tokens.

annotation_id <- annotation_dt[i, ][["original_id"]] |>
strsplit(split = "\\|") |>
unlist()
annotation_id <- unlist(strsplit(
  annotation_dt[i, ][["original_id"]],
  split = "\\|"
))

# there could be additional values such as the type?
nodes_idx <- which(node_ids %in% annotation_id)
5 changes: 2 additions & 3 deletions man/get_dbpedia_uris.Rd

Some generated files are not rendered by default.

15 changes: 11 additions & 4 deletions man/wikidata_uris.Rd

Some generated files are not rendered by default.
