Skip to content

Commit

Permalink
split() for corpus if s_attr does not have values #263
Browse files Browse the repository at this point in the history
  • Loading branch information
Andreas Blätte authored and Andreas Blätte committed Sep 7, 2023
1 parent 4b12314 commit 204330f
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 18 deletions.
49 changes: 40 additions & 9 deletions R/split.R
Expand Up @@ -77,10 +77,11 @@ setMethod("split", "subcorpus", function(
stop(sprintf("s-attribute '%s' not available", s_attribute))

if (missing(values))
# prospecitvely use RcppCWB::cl_struc_values()
# prospectvely use RcppCWB::cl_struc_values()
values <- s_attr_has_values(s_attribute = s_attribute, x = x)
if (is.null(values))
values <- TRUE
# result is reported, yet later with other info on s-attribute

history <- rev(sapply(sys.calls(), function(x) deparse(x[[1]])))
pb_call <- if (5L %in% which(history == "partition_bundle")) TRUE else FALSE
Expand Down Expand Up @@ -128,6 +129,12 @@ setMethod("split", "subcorpus", function(
"of s-attribute {.val {x@s_attribute_strucs}}"
)
)

# report here, together with further info on s-attribute
cli_alert_info(
's-attribute has values: {col_cyan({if (isFALSE(values)) "no" else "yes"})}'
)

}

if (relation %in% c("sibling", "ancestor")){
Expand Down Expand Up @@ -218,15 +225,28 @@ setMethod("split", "subcorpus", function(
#' @rdname subcorpus_bundle
#' @inheritParams partition_bundle
setMethod("split", "corpus", function(
x, s_attribute, values = NULL, prefix = "",
x, s_attribute, values, prefix = "",
mc = getOption("polmineR.mc"), verbose = TRUE, progress = FALSE,
type = get_type(x), xml = "flat"
) {

stopifnot(is.character(s_attribute), length(s_attribute) == 1L)
stopifnot(
is.character(s_attribute),
length(s_attribute) == 1L
)
if (!s_attribute %in% s_attributes(x))
stop(sprintf("s-attribute '%s' not available", s_attribute))

if (missing(values))
# prospectively use RcppCWB::cl_struc_values()
values <- s_attr_has_values(s_attribute = s_attribute, x = x)
if (is.null(values)) values <- TRUE

if (verbose) cli_alert_info(
's-attribute {.val {s_attribute}} has values: {col_cyan({if (isFALSE(values)) "no" else "yes"})}'
)


# Ensure that when split() is called within partition_bundle(), the resulting
# object is a partition_bundle and the objects in the slot 'object' are
# partition objects, not subcorpus objects.
Expand All @@ -249,13 +269,22 @@ setMethod("split", "corpus", function(
corpus = x@corpus, registry = x@registry_dir,
s_attribute = s_attribute, strucs = strucs
)
strucs_values <- struc2str(x = x, s_attr = s_attribute, struc = strucs)
cpos_list <- split(cpos_matrix, strucs_values)
struc_list <- split(strucs, strucs_values)

if (isFALSE(values)){
cpos_list <- split(cpos_matrix, strucs)
struc_list <- split(strucs, strucs)
} else {
strucs_values <- struc2str(x = x, s_attr = s_attribute, struc = strucs)
cpos_list <- split(cpos_matrix, strucs_values)
struc_list <- split(strucs, strucs_values)
}

if (verbose) cli_progress_done()

if (!is.null(values)){
if (verbose) cli_progress_step("drop values")
if (!is.null(values) && is.character(values)){
if (verbose) cli_progress_step(
"keep only matches for {col_cyan({length(values)})} values provided"
)
drop <- which(!names(cpos_list) %in% values)
if (length(drop) > 0L){
cpos_list <- cpos_list[-drop]
Expand All @@ -279,7 +308,9 @@ setMethod("split", "corpus", function(
y
}

if (verbose) cli_progress_step("instantiate objects")
if (verbose)
cli_progress_step("instantiate objects (n = {.val {length(cpos_list)}})")

y@objects <- if (progress)
pblapply(seq_along(cpos_list), .fn, cl = mc)
else
Expand Down
43 changes: 34 additions & 9 deletions tests/testthat/test_split.R
Expand Up @@ -15,6 +15,40 @@ test_that(
y2 <- merge(y)
expect_identical(as.integer(merge(y)@cpos), as.integer(x@cpos))
expect_identical(as.vector(merge(y)@cpos), as.vector(x@cpos))

# check that argument values works as intended
speakers <- c("Volker Kauder", "Norbert Lammert", "Wolfgang Thierse")
sb_speakers <- corpus("GERMAPARLMINI") %>%
split(s_attribute = "speaker", values = speakers)
expect_true(all(speakers %in% names(sb_speakers)))


# the following tests require that GERMAPARL2MINI is available
# It is wrapped into the GermaParl2 package, which can be installed as
# follows:
# install.packages(
# pkgs = "GermaParl2",
# contriburl = "https://polmine.github.io/drat/src/contrib",
# type = "source"
# )

skip_if_not(use("GermaParl2"))

gparl2 <- corpus("GERMAPARL2MINI")

n_sentences <- gparl2 %>%
split(s_attribute = "p", values = FALSE, verbose = FALSE) %>%
length()

attr_size <- RcppCWB::cl_attribute_size(
corpus = "GERMAPARL2MINI",
attribute = "p",
attribute_type = "s",
registry = gparl2@registry_dir
)

expect_identical(n_sentences, attr_size)

}
)

Expand Down Expand Up @@ -46,15 +80,6 @@ test_that(
pp3@cpos
)

# the following tests require that GERMAPARL2MINI is available
# It is wrapped into the GermaParl2 package, which can be installed as
# follows:
# install.packages(
# pkgs = "GermaParl2",
# contriburl = "https://polmine.github.io/drat/src/contrib",
# type = "source"
# )

skip_if_not(use("GermaParl2"))

gparl2 <- corpus("GERMAPARL2MINI")
Expand Down

0 comments on commit 204330f

Please sign in to comment.