corpus() handles corpora defined in multiple registry files #267

PolMine · Oct 27, 2023 · da44ca7 · da44ca7
1 parent 0ab1878
commit da44ca7
Show file tree

Hide file tree

Showing 14 changed files with 148 additions and 61 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -223,6 +223,7 @@ importFrom(cli,cli_alert_danger)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(cli,cli_alert_warning)
+importFrom(cli,cli_bullets)
 importFrom(cli,cli_process_done)
 importFrom(cli,cli_process_start)
 importFrom(cli,cli_progress_bar)

diff --git a/R/context.R b/R/context.R
@@ -149,7 +149,8 @@ setMethod("context", "slice", function(
     left = left, right = right,
     p_attribute = p_attribute, region = region,
     boundary = boundary,
-    corpus = .Object@corpus
+    corpus = .Object@corpus,
+    registry = .Object@registry_dir
   )
 
   ctxt@query <- query
@@ -225,12 +226,15 @@ setMethod("context", "subcorpus", function(
 #'   `RcppCWB::region_matrix_context()`, the worker behind the
 #'   `context()`-method.
 #' @param corpus A length-one `character` vector stating a corpus ID.
+#' @param registry The registry directory with the registry file for `corpus`.
 #' @rdname context-method
 #' @importFrom data.table between
 #' @importFrom RcppCWB region_matrix_context corpus_s_attributes
-setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribute, region = NULL, boundary = NULL){
+setMethod("context", "matrix", function(.Object, corpus, registry = Sys.getenv("CORPUS_REGISTRY"), left, right, p_attribute, region = NULL, boundary = NULL){
   if (ncol(.Object) != 2L) stop("context,matrix-method: .Object is required to be a two-column matrix")
 
+  stopifnot(is.character(corpus), length(corpus) == 1L)
+
   if (inherits(left, "numeric"))
     left <- setNames(as.integer(left), nm = names(left))
 
@@ -242,11 +246,8 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
       if (is.null(region)){
         s_attr <- NULL
       } else {
-        s_attr_present <- corpus_s_attributes(
-          corpus = corpus,
-          registry = corpus_registry_dir(corpus)
-        )
-        if (region %in% s_attr_present){
+        s_attrs <- corpus_s_attributes(corpus = corpus, registry = registry)
+        if (region %in% s_attrs){
           s_attr <- region
         } else {
           warning(sprintf("s-attribute '%s' not defined", region))
@@ -265,15 +266,14 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
     right <- 0L
   }
 
-  regdir <- corpus_registry_dir(corpus)
   cpos_matrix <- region_matrix_context(
     corpus = corpus,
     matrix = .Object,
     s_attribute = s_attr,
     p_attribute = p_attribute[1],
     left = left, right = right,
     boundary = boundary,
-    registry = regdir
+    registry = registry
   )
   cpos_dt <- as.data.table(cpos_matrix)
 
@@ -286,8 +286,10 @@ setMethod("context", "matrix", function(.Object, corpus, left, right, p_attribut
   if (length(p_attribute) > 1L){
     for (i in 2L:length(p_attribute)){
       ids <- cl_cpos2id(
-        corpus = corpus, registry = regdir,
-        p_attribute = p_attribute[i], cpos = cpos_dt[["cpos"]]
+        corpus = corpus,
+        registry = registry,
+        p_attribute = p_attribute[i],
+        cpos = cpos_dt[["cpos"]]
       )
       cpos_dt[, (paste(p_attribute[i], "id", sep = "_")) := ids]
     }

diff --git a/R/corpus.R b/R/corpus.R
@@ -22,6 +22,7 @@ setGeneric("corpus", function(.Object, ...) standardGeneric("corpus"))
 #' @importFrom RcppCWB cqp_list_corpora corpus_data_dir corpus_registry_dir
 #'   corpus_info_file corpus_full_name
 #' @importFrom fs path path_expand
+#' @importFrom cli cli_bullets
 setMethod("corpus", "character", function(
     .Object, registry_dir,
     server = NULL, restricted
@@ -63,11 +64,17 @@ setMethod("corpus", "character", function(
     c_regdir <- path(corpus_registry_dir(.Object))
     if (missing(registry_dir)){
       if (length(c_regdir) > 1L){
-        print(c_regdir)
-        stop(
-          "Cannot initialize corpus object - ",
-          "corpus defined by two different registry files."
-        )
+        c_regdir <- path(unique(path(c_regdir)))
+        if (length(c_regdir) > 1L){
+          cli_alert_warning(
+            "corpus loaded multiple times with following registries:"
+          )
+          cli_bullets(setNames(c_regdir, rep("*", times = length(c_regdir))))
+          cli_alert_info(
+            "using {c_regdir[1]}"
+          )
+        }
+        registry_dir <- c_regdir[1]
       } else if (is.na(c_regdir)){
         stop(
           "Cannot initialize corpus object - ",

diff --git a/R/count.R b/R/count.R
@@ -150,14 +150,17 @@ setMethod("count", "subcorpus", function(
           function(x){
             region_matrix <- cpos(
               .Object = .Object,
-              query = x, p_attribute = p_attribute,
+              query = x,
+              p_attribute = p_attribute,
               cqp = cqp,
               check = check
             )
             if (is.null(region_matrix)) return(NULL)
             token <- get_token_stream(
               ranges_to_cpos(region_matrix),
-              corpus = .Object@corpus, p_attribute = p_attribute,
+              corpus = .Object@corpus,
+              registry = .Object@registry_dir,
+              p_attribute = p_attribute,
               encoding = .Object@encoding
             )
             ids <- unlist(
@@ -287,7 +290,12 @@ setMethod("count", "partition_bundle", function(.Object, query = NULL, cqp = FAL
     )@stat
     .message("rearranging table", verbose = verbose)
     if (nrow(DT) > 0L){
-      DT_cast <- dcast.data.table(DT, partition ~ query, value.var = "count", fill = 0)
+      DT_cast <- dcast.data.table(
+        DT,
+        partition ~ query,
+        value.var = "count",
+        fill = 0
+      )
     } else {
       # If there are no matches, we generate a data.table directly that will be 
       # filled later on.
@@ -338,18 +346,27 @@ setMethod("count", "partition_bundle", function(.Object, query = NULL, cqp = FAL
 
     enc <- encoding(.Object)
     corpus <- get_corpus(.Object)
-    if (length(corpus) > 1L) stop("The objects in the bundle must be derived from the same corpus.")
+    if (length(corpus) > 1L)
+      stop("The objects in the bundle must be derived from the same corpus.")
 
     if (verbose) message("... creating data.table with corpus positions")
     DT <- data.table(
       cpos = ranges_to_cpos(
         do.call(rbind, lapply(.Object@objects, slot, "cpos"))
       ),
-      name_id = do.call(c, Map(rep, 1:length(.Object@objects), unname(sapply(.Object@objects, slot, "size"))))
+      name_id = do.call(
+        c,
+        Map(
+          rep,
+          1:length(.Object@objects),
+          unname(sapply(.Object@objects, slot, "size"))
+        )
+      )
     )
 
     if (is.null(phrases)){
-      if (verbose) message(sprintf("... adding ids for p-attribute '%s'", p_attribute))
+      if (verbose)
+        message(sprintf("... adding ids for p-attribute '%s'", p_attribute))
       DT[, "id" :=  cl_cpos2id(
         corpus = corpus, registry = corpus_registry_dir(corpus),
         p_attribute = p_attribute, cpos = DT[["cpos"]]
@@ -560,7 +577,13 @@ setMethod("count", "corpus", function(.Object, query = NULL, cqp = is.cqp, check
         # call the method for the slice class. There is an additional check whether
         # hits are within the regions defined by the subcorpus, but this extra 
         # cost is minimal.
-        retval <- count(as(.Object, "subcorpus"), query = query, cqp = cqp, p_attribute = p_attribute, breakdown = TRUE)
+        retval <- count(
+          as(.Object, "subcorpus"),
+          query = query,
+          cqp = cqp,
+          p_attribute = p_attribute,
+          breakdown = TRUE
+        )
         return( retval )
       }
     }

diff --git a/R/hits.R b/R/hits.R
@@ -233,7 +233,8 @@ setMethod("hits", "partition_bundle", function(
   if ("pAttribute" %in% names(list(...))) p_attribute <- list(...)[["pAttribute"]]
 
   corpus_id <- unique(unlist(lapply(.Object@objects, function(x) x@corpus)))
-  if (length(corpus_id) > 1L) stop("partiton_bundle not derived from one corpus")
+  if (length(corpus_id) > 1L)
+    stop("partiton_bundle not derived from one corpus")
   corpus_obj <- corpus(corpus_id)
   s_attribute_strucs <- unique(unlist(
     lapply(.Object@objects, slot, "s_attribute_strucs")
@@ -274,7 +275,8 @@ setMethod("hits", "partition_bundle", function(
   .message("finalizing tables", verbose = verbose)
   if (nrow(count_dt) > 0L){
     strucs <- cl_cpos2struc(
-      corpus = corpus_id, registry = corpus_registry_dir(corpus_id),
+      corpus = corpus_id,
+      registry = corpus_obj@registry_dir,
       s_attribute = s_attribute_strucs, cpos = count_dt[["V1"]]
     )
     count_dt[, "struc" := strucs, with = TRUE][, "V1" := NULL][, "V2" := NULL]

diff --git a/R/kwic.R b/R/kwic.R
@@ -94,7 +94,11 @@ setMethod("knit_print", "kwic", function(x, options = knitr::opts_chunk){
 #' 
 setMethod("as.character", "kwic", function(x, fmt = "<i>%s</i>"){
   if (!is.null(fmt)) x@stat[, "node" := sprintf(fmt, x@stat[["node"]])]
-  apply(x@stat, 1L, function(r) paste(r[["left"]], r[["node"]], r[["right"]], sep = " "))
+  apply(
+    x@stat,
+    1L,
+    function(r) paste(r[["left"]], r[["node"]], r[["right"]], sep = " ")
+  )
 })
 
 #' @docType methods
@@ -150,7 +154,13 @@ setMethod("as.data.frame", "kwic", function(x){
     )
     if (length(x@metadata) > 0L){
       df <- data.frame(
-        meta = do.call(paste, c(lapply(x@metadata, function(s_attr) x@stat[[s_attr]]), sep = "<br/>")),
+        meta = do.call(
+          paste,
+          c(
+            lapply(x@metadata, function(s_attr) x@stat[[s_attr]]),
+            sep = "<br/>"
+          )
+        ),
         df,
         stringsAsFactors = FALSE
       )
@@ -426,12 +436,17 @@ setMethod("kwic", "corpus", function(
     left = left, right = right,
     p_attribute = p_attribute,
     corpus = .Object@corpus,
+    registry = .Object@registry_dir,
     boundary = boundary,
     region = region,
     ...
   )
 
-  ids <- cpos2id(x = .Object, p_attribute = p_attribute, cpos = ctxt@cpos[["cpos"]])
+  ids <- cpos2id(
+    x = .Object,
+    p_attribute = p_attribute,
+    cpos = ctxt@cpos[["cpos"]]
+  )
 
   ctxt@cpos[, paste(p_attribute, "id", sep = "_") := ids, with = TRUE]
 

diff --git a/R/token_stream.R b/R/token_stream.R
@@ -105,7 +105,7 @@ setMethod("get_token_stream", "numeric", function(.Object, corpus, registry = NU
 
   # apply cutoff if length of cpos exceeds maximum number of tokens specified by cutoff
   if (!is.null(cutoff)) if (cutoff < length(.Object)) .Object <- .Object[1L:cutoff]
-  if (is.null(registry)) registry <- corpus_registry_dir(corpus)
+  if (is.null(registry)) registry <- corpus_registry_dir(corpus)[1]
 
   ids <- cl_cpos2id(
     corpus = corpus, registry = registry,
@@ -152,8 +152,13 @@ setMethod("get_token_stream", "numeric", function(.Object, corpus, registry = NU
 })
 
 #' @rdname get_token_stream-method
-setMethod("get_token_stream", "matrix", function(.Object, split = FALSE, ...){
-  ts_vec <- get_token_stream(ranges_to_cpos(.Object), ...)
+setMethod("get_token_stream", "matrix", function(.Object, corpus, registry = NULL, split = FALSE, ...){
+  ts_vec <- get_token_stream(
+    ranges_to_cpos(.Object),
+    corpus = corpus, 
+    registry = registry,
+    ...
+  )
 
   if (isFALSE(is.logical(split))) stop("'split' needs to be a logical value.")
   if (isFALSE(split)){
@@ -190,8 +195,13 @@ setMethod("get_token_stream", "character", function(.Object, left = NULL, right
 #' @rdname get_token_stream-method
 setMethod("get_token_stream", "slice", function(.Object, p_attribute, collapse = NULL, cpos = FALSE, ...){
   get_token_stream(
-    .Object = .Object@cpos, corpus = .Object@corpus, p_attribute = p_attribute,
-    encoding = .Object@encoding, collapse = collapse, cpos = cpos,
+    .Object = .Object@cpos,
+    corpus = .Object@corpus,
+    registry = .Object@registry_dir,
+    p_attribute = p_attribute,
+    encoding = .Object@encoding,
+    collapse = collapse,
+    cpos = cpos,
     ...
     )
 })

diff --git a/R/trim.R b/R/trim.R
@@ -190,8 +190,11 @@ setMethod("trim", "context", function(.Object, s_attribute = NULL, positivelist
       .Object@cpos <- .Object@cpos[.Object@cpos[["match_id"]] %in% matches_to_keep]
     } else {
       positivelist_ids <- .token2id(
-        corpus = .Object@corpus, p_attribute = p_attribute,
-        token = positivelist, regex = regex
+        corpus = .Object@corpus,
+        registry = .Object@registry_dir,
+        p_attribute = p_attribute,
+        token = positivelist,
+        regex = regex
       )
       .fn <- function(.SD){
         neighbors <- .SD[[paste(p_attribute[1], "id", sep = "_")]][.SD[["position"]] != 0]
@@ -218,7 +221,13 @@ setMethod("trim", "context", function(.Object, s_attribute = NULL, positivelist
   if (!is.null(stoplist)){
     .message("applying stoplist", verbose = verbose)
     before <- length(unique(.Object@cpos[["match_id"]]))
-    stoplist_ids <- .token2id(corpus = .Object@corpus, p_attribute = p_attribute, token = stoplist, regex = regex)
+    stoplist_ids <- .token2id(
+      corpus = .Object@corpus,
+      registry = .Object@registry_dir,
+      p_attribute = p_attribute,
+      token = stoplist,
+      regex = regex
+    )
     .fn <- function(.SD){
       p_attr <- paste(p_attribute[1], "id", sep = "_")
       negatives <- which(.SD[[p_attr]] %in% stoplist_ids)

diff --git a/R/utils.R b/R/utils.R
@@ -229,16 +229,16 @@ s_attr_has_values <- function(s_attribute, x){
 #' 
 #' Helper function for context method. Get ids for tokens.
 #' 
-#' @param corpus the CWB corpus to use
-#' @param p_attribute the p-attribute to use
+#' @param corpus CWB corpus to use.
+#' @param registry Registry directory with registry file describing `corpus`.
+#' @param p_attribute The p-attribute to use.
 #' @param token character tokens to turn into ids (character vector length >= 1)
 #' @param regex logical
 #' @noRd
-.token2id <- function(corpus, p_attribute, token = NULL, regex = FALSE){
-  regdir <- corpus_registry_dir(corpus)
+.token2id <- function(corpus, registry, p_attribute, token = NULL, regex = FALSE){
   stopifnot(
     corpus %in% cqp_list_corpora(),
-    p_attribute %in% corpus_p_attributes(corpus, regdir)
+    p_attribute %in% corpus_p_attributes(corpus, registry)
   )
 
   if (is.null(token)) return( NULL )
@@ -251,14 +251,18 @@ s_attr_has_values <- function(s_attribute, x){
         token,
         function(x)
           cl_regex2id(
-            corpus = corpus, registry = regdir,
-            p_attribute = p_attribute, regex = x
+            corpus = corpus,
+            registry = registry,
+            p_attribute = p_attribute,
+            regex = x
           )
       ))
     } else {
       retval <- cl_str2id(
-        corpus = corpus, registry = regdir,
-        p_attribute = p_attribute, str = token
+        corpus = corpus,
+        registry = registry,
+        p_attribute = p_attribute, 
+        str = token
       )
     }
     return( retval )

diff --git a/cran-comments.md b/cran-comments.md
@@ -1,22 +1,13 @@
 ## General remarks
 
-This release catches up with markdown >= 1.3 such that markdown maintainers
-will not have to rely on workarounds to ensure that markdown improvements 
-to not break polmineR reverse dependency tests.
-
-The first, second and this submission of this update was rejected because some
-examples ran too long (> 2.5 secs).
-
-I have minimized long-running examples and replaced the (bigger) test data 
-"GERMAPARLMINI" with the (smaller) "REUTERS" data, which reduces execution time
-for executing examples by 1/3. I also put long-running examples into donttest{}
-sections.
+Fedora R-devel checks indicated an encoding issue in a plot. I have resolved 
+the underlying issue and checked all plots.
 
 
 ## Test environments
 
-* local OS X install, R 4.2.2
-* GithubActions (Windows, macOS, Linux), R 4.2.3
+* local OS X installation, R 4.3.1
+* GithubActions (Windows, macOS, Linux), R 4.3.1
 * win-builder (devel, release, oldrel)