Skip to content

Commit

Permalink
Merge branch 'devel' of https://github.com/PolMine/dbpedia into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Apr 12, 2024
2 parents cfecd18 + 5766f6b commit ce0cf32
Show file tree
Hide file tree
Showing 7 changed files with 207 additions and 52 deletions.
7 changes: 6 additions & 1 deletion NAMESPACE
Expand Up @@ -66,8 +66,13 @@ importFrom(tibble,as_tibble)
importFrom(utils,URLencode)
importFrom(utils,capture.output)
importFrom(xml2,read_xml)
importFrom(xml2,xml_add_parent)
importFrom(xml2,xml_add_sibling)
importFrom(xml2,xml_attr)
importFrom(xml2,xml_children)
importFrom(xml2,xml_find_all)
importFrom(xml2,xml_set_attrs)
importFrom(xml2,xml_name)
importFrom(xml2,xml_ns)
importFrom(xml2,xml_parent)
importFrom(xml2,xml_set_attr)
importFrom(xml2,xml_text)
6 changes: 5 additions & 1 deletion NEWS.md
Expand Up @@ -14,7 +14,11 @@
## dbpedia v0.1.2.9004

* Method `get_dbpedia_uris()` has new argument `retry` to retry if the API is stalled (#45) and new argument `logfile` for tracking and debugging long-running annotation tasks. If the annotation fails, `NULL` is returned (no abort).

* improved verbosity of XML processing by implementing the approach used when processing subcorpus bundles for XML nodes as well
* addressed issue with overlapping annotations in XML, described for CWB corpora in issue #43
* added the possibility to enrich previously annotated named entities without restricting the initial annotation process to them only (i.e. enriching existing entities and adding new ones is now possible)
* changed encoding of URIs in XML from "dbpedia_uri" to "type" and "ref"
* made the annotation of nested pre-annotated data more robust

## dbpedia v0.1.2.9003

Expand Down
51 changes: 31 additions & 20 deletions R/dbpedia.R
Expand Up @@ -1096,32 +1096,33 @@ setMethod(
support = 20,
expand_to_token = FALSE,
drop_inexact_annotations = TRUE,
verbose = TRUE
verbose = if (progress) FALSE else verbose,
progress = FALSE
) {

# Sometimes there are nodes of the same name in different parts of the
# document. In ParlaMint, for example, <name> describes persons in the TEI
# header and named entities in the text body. It can be useful to focus on the
# text part.

if (!is.null(text_tag)) {
nodes <- xml2::xml_find_all(
x,
xpath = namespaced_xpath(xml = x, tags = text_tag)
)
} else {
nodes <- x

# Note: these two nodes objects are different since the first is a nodeset,
# the second is an xml_document.
}

# get units which should be sent to DBpedia Spotlight (to account for
# max_len, etc.). This can be the entire text or a paragraph or a sentence,
# depending on the structure. Provided by "segment" argument.

# get both tokens and features (NEs, etc.)

if (is.null(segment)) {
nodes_to_process <- nodes
} else {
Expand All @@ -1130,19 +1131,19 @@ setMethod(
xpath = namespaced_xpath(xml = x, tags = segment)
)
}

if (verbose)
cli_progress_step("preparing {.val {length(nodes_to_process)}} annotation tables.")
cli_progress_step("preparing {.val {length(nodes_to_process)}} segments to process.")

docs <- to_annotation(
nodes = nodes_to_process,
xml = x,
token_tags = token_tags,
feature_tag = feature_tag
)

if (verbose) cli_progress_done()

# prepare function to assign ID depending on value and arguments
expand_fun = function(.SD, dt) {
id_right <- dt[.SD[["end"]] == dt[["end"]]][["id"]]
Expand All @@ -1153,10 +1154,10 @@ setMethod(
id_right
}
}

# Note: The following function should probably overload the existing
# dbpedia:::as.data.table.AnnotatedPlainTextDocument() function.

AnnotatedPlainTextDocument_to_datatable2 = function (x, what = NULL) {
dt <- setDT(as.data.frame(x[["annotation"]]))
if (!is.null(what)) {
Expand All @@ -1176,9 +1177,16 @@ setMethod(
}
dt
}


if (progress) {
env <- parent.frame()
cli_progress_bar("Tasks", total = length(docs), type = "tasks", .envir = env)
}

annotations <- lapply(docs, function(doc) {


if (progress) cli_progress_update(.envir = env)

links <- get_dbpedia_uris(
x = doc,
language = language,
Expand All @@ -1192,9 +1200,9 @@ setMethod(
support = support,
verbose = verbose
)

if (nrow(links) == 0) return(NULL) # no entities in this segment

if (is.null(feature_tag)) {
dt <- AnnotatedPlainTextDocument_to_datatable2(doc, what = feature_tag)
links[, "end" := links[["start"]] + nchar(links[["text"]]) - 1L]
Expand All @@ -1207,10 +1215,11 @@ setMethod(
text = .SD[["text"]],
types = .SD[["types"]]
),
by = "start",
by = c("start", "end"),
.SDcols = c("start", "end", "dbpedia_uri", "text", "types")
]
tab[, "start" := NULL]
tab[, "end" := NULL]

} else {

Expand Down Expand Up @@ -1253,7 +1262,9 @@ setMethod(

}
)


if (progress) cli_progress_done(.envir = env)

data.table::rbindlist(annotations)
})

Expand Down

0 comments on commit ce0cf32

Please sign in to comment.