Commit

as_docgroups() tested
Andreas Blätte authored and Andreas Blätte committed Jul 7, 2023
1 parent 7b66113 commit 0a3e089
Showing 16 changed files with 231 additions and 203 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -31,8 +31,9 @@ Encoding: UTF-8
License: GPL-3
Collate:
'duplicates_package.R'
'nchars.R'
'charcount.R'
'detect_duplicates.R'
'utils.R'
'encode.R'
RoxygenNote: 7.2.3
Roxygen: list(markdown = TRUE)
3 changes: 2 additions & 1 deletion NAMESPACE
@@ -1,9 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(as_docgroups)
export(charfilter)
export(detect_duplicates)
exportMethods(charcount)
exportMethods(detect_duplicates)
exportMethods(nchars)
import(data.table)
importFrom(Matrix,triu)
importFrom(R6,R6Class)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,6 +1,11 @@
## v0.1.3

- Function `minimize_vocabulary()` made more generic and renamed to `charfilter()`.
- Method `nchars()` renamed to `charcount()`.
- Function `duplicates_get_groups()` renamed to `as_docgroups()`.
- Argument `s_attribute` of method `detect_duplicates()` is now used generically: a new
  column named after the s-attribute carries the document metadata in the result.

## v0.1.2

File renamed without changes.
60 changes: 51 additions & 9 deletions R/detect_duplicates.R
@@ -56,7 +56,7 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
#' use(pkg = "duplicates")
#'
#' charcount <- corpus("REUTERS2") %>%
#' nchars(
#' charcount(
#' p_attribute = "word",
#' char_regex = "[a-zA-Z]",
#' lowercase = TRUE,
@@ -77,6 +77,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
#' mc = parallel::detectCores() - 2L,
#' vocab = vocab
#' )
#'
#' docgrps <- as_docgroups(dupl)
setMethod("detect_duplicates", "partition_bundle",
function(
x, n = 5L, min_shingle_length = n,
@@ -87,10 +89,10 @@ setMethod("detect_duplicates", "partition_bundle",
){
started <- Sys.time()

if (verbose) cli_progress_step("get sizes and dates")
if (verbose) cli_progress_step("get sizes and metadata")
sizes <- sapply(x@objects, slot, "size")
dates <- s_attributes(x, s_attribute = s_attribute, unique = TRUE)
dates <- lapply(dates, `[[`, 1L) # a sanity measure
s_attr <- s_attributes(x, s_attribute = s_attribute, unique = TRUE)
s_attr <- lapply(s_attr, `[[`, 1L) # a sanity measure

if (verbose) cli_progress_step("make ngram matrix")
ngrams <- ngrams(
@@ -113,10 +115,11 @@ setMethod("detect_duplicates", "partition_bundle",
dt[, "duplicate_size" := sizes[dt[["duplicate_name"]]]]

if (nrow(dt) > 0L){
dt[, "date" := unlist(dates[dt[["name"]]])]
dt[, "date_duplicate" := unlist(dates[dt[["duplicate_name"]]])]
dt[, (s_attribute) := unlist(s_attr[dt[["name"]]])]
dt[, (paste("duplicate", s_attribute, sep = "_")) := unlist(s_attr[dt[["duplicate_name"]]])]
} else {
dt[, "date" := character()][, "date_duplicate" := character()]
dt[, (s_attribute) := character()]
dt[, (paste("duplicate", s_attribute, sep = "_")) := character()]
}
dt
}
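
# Illustrative sketch (not part of the commit): the metadata columns are now
# named after the s-attribute, e.g. for s_attribute = "text_id" the result
# gains "text_id" and "duplicate_text_id" instead of the formerly hard-coded
# "date" and "date_duplicate".
s_attribute <- "text_id"
paste("duplicate", s_attribute, sep = "_")
# [1] "duplicate_text_id"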
@@ -136,7 +139,9 @@ setMethod("detect_duplicates", "partition_bundle",
#' chars <- chars[grep("[a-zA-Z]", names(chars))]
#' char <- names(chars[order(chars, decreasing = FALSE)][1:20])
#'
#' detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
#'
#' docgrps <- as_docgroups(dupl)
#' @rdname detect_duplicates
setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
started <- Sys.time()
@@ -203,4 +208,41 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, t
duplicate_name = sim_min@Dimnames[[2]][sim_min@j + 1],
similarity = sim_min@x
)
})
})
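
# Side note (illustrative, not package code): in the triplet representation of
# a sparse matrix, the slots @i and @j hold 0-based row/column indices and @x
# the non-zero values, which is why 1 is added before indexing into Dimnames.
library(Matrix)
m <- Matrix::sparseMatrix(i = 1L, j = 2L, x = 0.9, dims = c(3L, 3L), repr = "T")
m@j + 1 # 1-based column index of the non-zero entry, i.e. 2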


#' Get groups of near-duplicate documents
#'
#' @param x A `data.table` with detected duplicates, as returned by
#'   `detect_duplicates()`.
#' @return A `data.table` with a `group` id and the document `name`, followed by
#'   the document metadata.
#' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute
#' @export as_docgroups
#' @rdname docgroups
as_docgroups <- function(x){
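# Decompose the graph of pairwise duplicate relations into connected
# components; each component is one group of near-duplicate documents.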

ids <- x[, c("name", "duplicate_name")] |>
as.data.frame() |>
igraph::graph_from_data_frame() |>
igraph::decompose() |>
lapply(igraph::get.vertex.attribute, name = "name")

dt <- data.table(
name = unlist(ids),
group = unlist(
mapply(rep, seq_along(ids), sapply(ids, length), SIMPLIFY = FALSE),
recursive = FALSE
)
)

duplcols <- grep("duplicate_", colnames(x), value = TRUE)
metadata <- unique(rbindlist(
list(
x[, setdiff(colnames(x), c(duplcols, "similarity")), with = FALSE],
x[, duplcols, with = FALSE]
),
use.names = FALSE
))

y <- metadata[dt, on = "name"]
setcolorder(y, neworder = c("group", "name"))
y
}
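
# Illustrative sketch (not part of the package): a toy result in the shape
# produced by detect_duplicates() to show how as_docgroups() merges pairwise
# matches into groups. Document names and the "date" metadata are made up.
library(data.table)
dupl <- data.table(
  name = c("doc1", "doc2"),
  size = c(100L, 120L),
  date = c("2023-01-01", "2023-01-02"),
  duplicate_name = c("doc2", "doc3"),
  duplicate_size = c(120L, 90L),
  duplicate_date = c("2023-01-02", "2023-01-03"),
  similarity = c(0.95, 0.91)
)
as_docgroups(dupl)
# doc1, doc2 and doc3 end up in a single group, each row keeping its metadata.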
118 changes: 118 additions & 0 deletions R/encode.R
@@ -0,0 +1,118 @@

#' Encode annotation data
#'
#' Add structural attributes to a CWB corpus based on the annotation data that
#' has been generated.
#' @param x Annotation data: a `data.table` with columns `cpos_left`,
#'   `cpos_right`, `is_duplicate` and `duplicates`, as generated by
#'   `duplicates_as_annotation_data()`.
#' @param corpus ID of CWB corpus.
#' @param method Method passed to `cwbtools::s_attribute_encode()`, defaults to "R".
#' @importFrom data.table setDT
#' @importFrom cwbtools s_attribute_encode
duplicates_encode <- function(x, corpus, method = "R"){

corpus_obj <- corpus(corpus)

for (s_attr in c("is_duplicate", "duplicates")){
s_attribute_encode(
values = as.character(x[[s_attr]]),
data_dir = corpus_obj@data_dir,
s_attribute = s_attr,
corpus = corpus,
region_matrix = as.matrix(x[, c("cpos_left", "cpos_right")]),
method = method,
registry_dir = corpus_obj@registry_dir,
encoding = corpus_obj@encoding,
delete = TRUE,
verbose = TRUE
)
}
invisible(TRUE)
}
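
# Minimal sketch (assumption, not shipped with the commit) of the annotation
# data duplicates_encode() expects: one row per region, with corpus positions
# and the two columns that are written as structural attributes above.
library(data.table)
anno <- data.table(
  cpos_left = c(0L, 150L),
  cpos_right = c(149L, 310L),
  is_duplicate = c(FALSE, TRUE),
  duplicates = c("doc2", "doc1")
)
# duplicates_encode(x = anno, corpus = "REUTERS2", method = "R") # writes to the corpus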

.N <- NULL # to avoid warnings

#' Make annotation data
#'
#' @description
#' Turn the `data.table` with detected duplicates into annotation data with
#' corpus positions and a duplicate annotation.
#' @param drop A character vector of document IDs to be removed from the
#'   annotation data, useful for dropping documents known to be noise that
#'   should not enter the analysis.
#' @param cols Column(s) used to order documents within a group (passed to
#'   `data.table::setorderv()`); the top-ranking document of each group is
#'   treated as the original.
#' @param order Sort order(s) corresponding to `cols`, passed to
#'   `data.table::setorderv()` (`1L` ascending, `-1L` descending).
#' @param x Input `data.table`.
#' @param corpus ID of CWB corpus.
#' @param s_attribute Structural attribute to annotate.
#' @importFrom data.table setDT setnames setkeyv
#' @importFrom polmineR corpus
duplicates_as_annotation_data <- function(x, corpus, s_attribute, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){

groups <- as_docgroups(x)

if (!is.null(drop)){
groups <- groups[!groups[["name"]] %in% drop]
groups_n <- groups[, .N, by = "group"]
groups[groups_n, "group_size" := i.N, on = "group"]
groups <- groups[groups[["group_size"]] > 1L][, "group_size" := NULL]
}
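# Within each group, order the documents by `cols` and keep the top-ranking
# one as the original; all other group members are flagged as duplicates.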

original <- groups[,
setorderv(x = .SD, cols = cols, order = order)[1,],
by = "group", .SDcols = cols
][, "is_duplicate" := FALSE]
groups[original, "is_duplicate" := i.is_duplicate, on = "name"]
groups[, "is_duplicate" := ifelse(is.na(groups[["is_duplicate"]]), TRUE, groups[["is_duplicate"]])]
duplicates_dt <- groups[,
list(
name = .SD[["name"]],
is_duplicate = .SD[["is_duplicate"]],
duplicates = sapply(
1L:nrow(.SD),
function(i) paste(setdiff(.SD[["name"]], .SD[["name"]][i]), collapse = "|")
)
),
by = "group", .SDcols = c("name", "is_duplicate")
][, "group" := NULL]

# get regions ------------------------------------------------------------

corpus_obj <- corpus(corpus)
regions <- setDT(
RcppCWB::s_attribute_decode(
corpus = corpus,
data_dir = corpus_obj@data_dir,
s_attribute = s_attribute,
encoding = corpus_obj@encoding,
registry = corpus_obj@registry_dir,
method = "Rcpp"
)
)
setnames(regions, old = "value", new = s_attribute)
setkeyv(regions, s_attribute)

# finalize annotation data -----------------------------------------------

setnames(duplicates_dt, old = "name", new = s_attribute)
anno <- duplicates_dt[regions, on = s_attribute]
anno[,
"is_duplicate" := ifelse(
is.na(anno[["is_duplicate"]]),
FALSE,
anno[["is_duplicate"]]
)
]
anno[,
"duplicates" := ifelse(
is.na(anno[["duplicates"]]),
"",
anno[["duplicates"]]
)]
setcolorder(
anno,
c("cpos_left", "cpos_right", s_attribute, "is_duplicate", "duplicates")
)
setorderv(anno, cols = "cpos_left")
anno
}
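
# Illustrative end-to-end sketch (assumption, not shipped with this commit):
# from detected duplicates (`dupl`, as in the examples in R/detect_duplicates.R)
# to an encoded annotation of the REUTERS2 demo corpus. The s-attribute name
# "id" is a placeholder for whatever s-attribute identifies documents.
anno <- duplicates_as_annotation_data(
  x = dupl,
  corpus = "REUTERS2",
  s_attribute = "id"
)
duplicates_encode(x = anno, corpus = "REUTERS2", method = "R")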
