Skip to content

Commit

Permalink
corpus() handles corpora defined in multiple registry files #267
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Oct 27, 2023
1 parent 0ab1878 commit da44ca7
Show file tree
Hide file tree
Showing 14 changed files with 148 additions and 61 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Expand Up @@ -223,6 +223,7 @@ importFrom(cli,cli_alert_danger)
importFrom(cli,cli_alert_info)
importFrom(cli,cli_alert_success)
importFrom(cli,cli_alert_warning)
importFrom(cli,cli_bullets)
importFrom(cli,cli_process_done)
importFrom(cli,cli_process_start)
importFrom(cli,cli_progress_bar)
Expand Down
24 changes: 13 additions & 11 deletions R/context.R
Expand Up @@ -149,7 +149,8 @@ setMethod("context", "slice", function(
left = left, right = right,
p_attribute = p_attribute, region = region,
boundary = boundary,
corpus = .Object@corpus
corpus = .Object@corpus,
registry = .Object@registry_dir
)

ctxt@query <- query
Expand Down Expand Up @@ -225,12 +226,15 @@ setMethod("context", "subcorpus", function(
#' `RcppCWB::region_matrix_context()`, the worker behind the
#' `context()`-method.
#' @param corpus A length-one `character` vector stating a corpus ID.
#' @param registry The registry directory with the registry file for `corpus`.
#' @rdname context-method
#' @importFrom data.table between
#' @importFrom RcppCWB region_matrix_context corpus_s_attributes
setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribute, region = NULL, boundary = NULL){
setMethod("context", "matrix", function(.Object, corpus, registry = Sys.getenv("CORPUS_REGISTRY"), left, right, p_attribute, region = NULL, boundary = NULL){
if (ncol(.Object) != 2L) stop("context,matrix-method: .Object is required to be a two-column matrix")

stopifnot(is.character(corpus), length(corpus) == 1L)

if (inherits(left, "numeric"))
left <- setNames(as.integer(left), nm = names(left))

Expand All @@ -242,11 +246,8 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
if (is.null(region)){
s_attr <- NULL
} else {
s_attr_present <- corpus_s_attributes(
corpus = corpus,
registry = corpus_registry_dir(corpus)
)
if (region %in% s_attr_present){
s_attrs <- corpus_s_attributes(corpus = corpus, registry = registry)
if (region %in% s_attrs){
s_attr <- region
} else {
warning(sprintf("s-attribute '%s' not defined", region))
Expand All @@ -265,15 +266,14 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
right <- 0L
}

regdir <- corpus_registry_dir(corpus)
cpos_matrix <- region_matrix_context(
corpus = corpus,
matrix = .Object,
s_attribute = s_attr,
p_attribute = p_attribute[1],
left = left, right = right,
boundary = boundary,
registry = regdir
registry = registry
)
cpos_dt <- as.data.table(cpos_matrix)

Expand All @@ -286,8 +286,10 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
if (length(p_attribute) > 1L){
for (i in 2L:length(p_attribute)){
ids <- cl_cpos2id(
corpus = corpus, registry = regdir,
p_attribute = p_attribute[i], cpos = cpos_dt[["cpos"]]
corpus = corpus,
registry = registry,
p_attribute = p_attribute[i],
cpos = cpos_dt[["cpos"]]
)
cpos_dt[, (paste(p_attribute[i], "id", sep = "_")) := ids]
}
Expand Down
17 changes: 12 additions & 5 deletions R/corpus.R
Expand Up @@ -22,6 +22,7 @@ setGeneric("corpus", function(.Object, ...) standardGeneric("corpus"))
#' @importFrom RcppCWB cqp_list_corpora corpus_data_dir corpus_registry_dir
#' corpus_info_file corpus_full_name
#' @importFrom fs path path_expand
#' @importFrom cli cli_bullets
setMethod("corpus", "character", function(
.Object, registry_dir,
server = NULL, restricted
Expand Down Expand Up @@ -63,11 +64,17 @@ setMethod("corpus", "character", function(
c_regdir <- path(corpus_registry_dir(.Object))
if (missing(registry_dir)){
if (length(c_regdir) > 1L){
print(c_regdir)
stop(
"Cannot initialize corpus object - ",
"corpus defined by two different registry files."
)
c_regdir <- path(unique(path(c_regdir)))
if (length(c_regdir) > 1L){
cli_alert_warning(
"corpus loaded multiple times with following registries:"
)
cli_bullets(setNames(c_regdir, rep("*", times = length(c_regdir))))
cli_alert_info(
"using {c_regdir[1]}"
)
}
registry_dir <- c_regdir[1]
} else if (is.na(c_regdir)){
stop(
"Cannot initialize corpus object - ",
Expand Down
37 changes: 30 additions & 7 deletions R/count.R
Expand Up @@ -150,14 +150,17 @@ setMethod("count", "subcorpus", function(
function(x){
region_matrix <- cpos(
.Object = .Object,
query = x, p_attribute = p_attribute,
query = x,
p_attribute = p_attribute,
cqp = cqp,
check = check
)
if (is.null(region_matrix)) return(NULL)
token <- get_token_stream(
ranges_to_cpos(region_matrix),
corpus = .Object@corpus, p_attribute = p_attribute,
corpus = .Object@corpus,
registry = .Object@registry_dir,
p_attribute = p_attribute,
encoding = .Object@encoding
)
ids <- unlist(
Expand Down Expand Up @@ -287,7 +290,12 @@ setMethod("count", "partition_bundle", function(.Object, query = NULL, cqp = FAL
)@stat
.message("rearranging table", verbose = verbose)
if (nrow(DT) > 0L){
DT_cast <- dcast.data.table(DT, partition ~ query, value.var = "count", fill = 0)
DT_cast <- dcast.data.table(
DT,
partition ~ query,
value.var = "count",
fill = 0
)
} else {
# If there are no matches, we generate a data.table directly that will be
# filled later on.
Expand Down Expand Up @@ -338,18 +346,27 @@ setMethod("count", "partition_bundle", function(.Object, query = NULL, cqp = FAL

enc <- encoding(.Object)
corpus <- get_corpus(.Object)
if (length(corpus) > 1L) stop("The objects in the bundle must be derived from the same corpus.")
if (length(corpus) > 1L)
stop("The objects in the bundle must be derived from the same corpus.")

if (verbose) message("... creating data.table with corpus positions")
DT <- data.table(
cpos = ranges_to_cpos(
do.call(rbind, lapply(.Object@objects, slot, "cpos"))
),
name_id = do.call(c, Map(rep, 1:length(.Object@objects), unname(sapply(.Object@objects, slot, "size"))))
name_id = do.call(
c,
Map(
rep,
1:length(.Object@objects),
unname(sapply(.Object@objects, slot, "size"))
)
)
)

if (is.null(phrases)){
if (verbose) message(sprintf("... adding ids for p-attribute '%s'", p_attribute))
if (verbose)
message(sprintf("... adding ids for p-attribute '%s'", p_attribute))
DT[, "id" := cl_cpos2id(
corpus = corpus, registry = corpus_registry_dir(corpus),
p_attribute = p_attribute, cpos = DT[["cpos"]]
Expand Down Expand Up @@ -560,7 +577,13 @@ setMethod("count", "corpus", function(.Object, query = NULL, cqp = is.cqp, check
# call the method for the slice class. There is an additional check whether
# hits are within the regions defined by the subcorpus, but this extra
# cost is minimal.
retval <- count(as(.Object, "subcorpus"), query = query, cqp = cqp, p_attribute = p_attribute, breakdown = TRUE)
retval <- count(
as(.Object, "subcorpus"),
query = query,
cqp = cqp,
p_attribute = p_attribute,
breakdown = TRUE
)
return( retval )
}
}
Expand Down
6 changes: 4 additions & 2 deletions R/hits.R
Expand Up @@ -233,7 +233,8 @@ setMethod("hits", "partition_bundle", function(
if ("pAttribute" %in% names(list(...))) p_attribute <- list(...)[["pAttribute"]]

corpus_id <- unique(unlist(lapply(.Object@objects, function(x) x@corpus)))
if (length(corpus_id) > 1L) stop("partiton_bundle not derived from one corpus")
if (length(corpus_id) > 1L)
stop("partiton_bundle not derived from one corpus")
corpus_obj <- corpus(corpus_id)
s_attribute_strucs <- unique(unlist(
lapply(.Object@objects, slot, "s_attribute_strucs")
Expand Down Expand Up @@ -274,7 +275,8 @@ setMethod("hits", "partition_bundle", function(
.message("finalizing tables", verbose = verbose)
if (nrow(count_dt) > 0L){
strucs <- cl_cpos2struc(
corpus = corpus_id, registry = corpus_registry_dir(corpus_id),
corpus = corpus_id,
registry = corpus_obj@registry_dir,
s_attribute = s_attribute_strucs, cpos = count_dt[["V1"]]
)
count_dt[, "struc" := strucs, with = TRUE][, "V1" := NULL][, "V2" := NULL]
Expand Down
21 changes: 18 additions & 3 deletions R/kwic.R
Expand Up @@ -94,7 +94,11 @@ setMethod("knit_print", "kwic", function(x, options = knitr::opts_chunk){
#'
setMethod("as.character", "kwic", function(x, fmt = "<i>%s</i>"){
if (!is.null(fmt)) x@stat[, "node" := sprintf(fmt, x@stat[["node"]])]
apply(x@stat, 1L, function(r) paste(r[["left"]], r[["node"]], r[["right"]], sep = " "))
apply(
x@stat,
1L,
function(r) paste(r[["left"]], r[["node"]], r[["right"]], sep = " ")
)
})

#' @docType methods
Expand Down Expand Up @@ -150,7 +154,13 @@ setMethod("as.data.frame", "kwic", function(x){
)
if (length(x@metadata) > 0L){
df <- data.frame(
meta = do.call(paste, c(lapply(x@metadata, function(s_attr) x@stat[[s_attr]]), sep = "<br/>")),
meta = do.call(
paste,
c(
lapply(x@metadata, function(s_attr) x@stat[[s_attr]]),
sep = "<br/>"
)
),
df,
stringsAsFactors = FALSE
)
Expand Down Expand Up @@ -426,12 +436,17 @@ setMethod("kwic", "corpus", function(
left = left, right = right,
p_attribute = p_attribute,
corpus = .Object@corpus,
registry = .Object@registry_dir,
boundary = boundary,
region = region,
...
)

ids <- cpos2id(x = .Object, p_attribute = p_attribute, cpos = ctxt@cpos[["cpos"]])
ids <- cpos2id(
x = .Object,
p_attribute = p_attribute,
cpos = ctxt@cpos[["cpos"]]
)

ctxt@cpos[, paste(p_attribute, "id", sep = "_") := ids, with = TRUE]

Expand Down
20 changes: 15 additions & 5 deletions R/token_stream.R
Expand Up @@ -105,7 +105,7 @@ setMethod("get_token_stream", "numeric", function(.Object, corpus, registry = NU

# apply cutoff if length of cpos exceeds maximum number of tokens specified by cutoff
if (!is.null(cutoff)) if (cutoff < length(.Object)) .Object <- .Object[1L:cutoff]
if (is.null(registry)) registry <- corpus_registry_dir(corpus)
if (is.null(registry)) registry <- corpus_registry_dir(corpus)[1]

ids <- cl_cpos2id(
corpus = corpus, registry = registry,
Expand Down Expand Up @@ -152,8 +152,13 @@ setMethod("get_token_stream", "numeric", function(.Object, corpus, registry = NU
})

#' @rdname get_token_stream-method
setMethod("get_token_stream", "matrix", function(.Object, split = FALSE, ...){
ts_vec <- get_token_stream(ranges_to_cpos(.Object), ...)
setMethod("get_token_stream", "matrix", function(.Object, corpus, registry = NULL, split = FALSE, ...){
ts_vec <- get_token_stream(
ranges_to_cpos(.Object),
corpus = corpus,
registry = registry,
...
)

if (isFALSE(is.logical(split))) stop("'split' needs to be a logical value.")
if (isFALSE(split)){
Expand Down Expand Up @@ -190,8 +195,13 @@ setMethod("get_token_stream", "character", function(.Object, left = NULL, right
#' @rdname get_token_stream-method
setMethod("get_token_stream", "slice", function(.Object, p_attribute, collapse = NULL, cpos = FALSE, ...){
get_token_stream(
.Object = .Object@cpos, corpus = .Object@corpus, p_attribute = p_attribute,
encoding = .Object@encoding, collapse = collapse, cpos = cpos,
.Object = .Object@cpos,
corpus = .Object@corpus,
registry = .Object@registry_dir,
p_attribute = p_attribute,
encoding = .Object@encoding,
collapse = collapse,
cpos = cpos,
...
)
})
Expand Down
15 changes: 12 additions & 3 deletions R/trim.R
Expand Up @@ -190,8 +190,11 @@ setMethod("trim", "context", function(.Object, s_attribute = NULL, positivelist
.Object@cpos <- .Object@cpos[.Object@cpos[["match_id"]] %in% matches_to_keep]
} else {
positivelist_ids <- .token2id(
corpus = .Object@corpus, p_attribute = p_attribute,
token = positivelist, regex = regex
corpus = .Object@corpus,
registry = .Object@registry_dir,
p_attribute = p_attribute,
token = positivelist,
regex = regex
)
.fn <- function(.SD){
neighbors <- .SD[[paste(p_attribute[1], "id", sep = "_")]][.SD[["position"]] != 0]
Expand All @@ -218,7 +221,13 @@ setMethod("trim", "context", function(.Object, s_attribute = NULL, positivelist
if (!is.null(stoplist)){
.message("applying stoplist", verbose = verbose)
before <- length(unique(.Object@cpos[["match_id"]]))
stoplist_ids <- .token2id(corpus = .Object@corpus, p_attribute = p_attribute, token = stoplist, regex = regex)
stoplist_ids <- .token2id(
corpus = .Object@corpus,
registry = .Object@registry_dir,
p_attribute = p_attribute,
token = stoplist,
regex = regex
)
.fn <- function(.SD){
p_attr <- paste(p_attribute[1], "id", sep = "_")
negatives <- which(.SD[[p_attr]] %in% stoplist_ids)
Expand Down
22 changes: 13 additions & 9 deletions R/utils.R
Expand Up @@ -229,16 +229,16 @@ s_attr_has_values <- function(s_attribute, x){
#'
#' Helper function for context method. Get ids for tokens.
#'
#' @param corpus the CWB corpus to use
#' @param p_attribute the p-attribute to use
#' @param corpus CWB corpus to use.
#' @param registry Registry directory with registry file describing `corpus`.
#' @param p_attribute The p-attribute to use.
#' @param token character tokens to turn into ids (character vector length >= 1)
#' @param regex logical
#' @noRd
.token2id <- function(corpus, p_attribute, token = NULL, regex = FALSE){
regdir <- corpus_registry_dir(corpus)
.token2id <- function(corpus, registry, p_attribute, token = NULL, regex = FALSE){
stopifnot(
corpus %in% cqp_list_corpora(),
p_attribute %in% corpus_p_attributes(corpus, regdir)
p_attribute %in% corpus_p_attributes(corpus, registry)
)

if (is.null(token)) return( NULL )
Expand All @@ -251,14 +251,18 @@ s_attr_has_values <- function(s_attribute, x){
token,
function(x)
cl_regex2id(
corpus = corpus, registry = regdir,
p_attribute = p_attribute, regex = x
corpus = corpus,
registry = registry,
p_attribute = p_attribute,
regex = x
)
))
} else {
retval <- cl_str2id(
corpus = corpus, registry = regdir,
p_attribute = p_attribute, str = token
corpus = corpus,
registry = registry,
p_attribute = p_attribute,
str = token
)
}
return( retval )
Expand Down
17 changes: 4 additions & 13 deletions cran-comments.md
@@ -1,22 +1,13 @@
## General remarks

This release catches up with markdown >= 1.3 such that markdown maintainers
will not have to rely on workarounds to ensure that markdown improvements
to not break polmineR reverse dependency tests.

The first, second and this submission of this update was rejected because some
examples ran too long (> 2.5 secs).

I have minimized long-running examples and replaced the (bigger) test data
"GERMAPARLMINI" with the (smaller) "REUTERS" data, which reduces execution time
for executing examples by 1/3. I also put long-running examples into donttest{}
sections.
Fedora R-devel checks indicated an encoding issue in a plot. I have resolved
the underlying issue and checked all plots.


## Test environments

* local OS X install, R 4.2.2
* GithubActions (Windows, macOS, Linux), R 4.2.3
* local OS X installation, R 4.3.1
* GithubActions (Windows, macOS, Linux), R 4.3.1
* win-builder (devel, release, oldrel)


Expand Down

0 comments on commit da44ca7

Please sign in to comment.