Commit

as_docgroups() tested
Andreas Blätte authored and Andreas Blätte committed Jul 7, 2023
1 parent 7b66113 commit 0a3e089
Showing 16 changed files with 231 additions and 203 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -31,8 +31,9 @@ Encoding: UTF-8
License: GPL-3
Collate:
'duplicates_package.R'
'nchars.R'
'charcount.R'
'detect_duplicates.R'
'utils.R'
'encode.R'
RoxygenNote: 7.2.3
Roxygen: list(markdown = TRUE)
3 changes: 2 additions & 1 deletion NAMESPACE
@@ -1,9 +1,10 @@
# Generated by roxygen2: do not edit by hand

export(as_docgroups)
export(charfilter)
export(detect_duplicates)
exportMethods(charcount)
exportMethods(detect_duplicates)
exportMethods(nchars)
import(data.table)
importFrom(Matrix,triu)
importFrom(R6,R6Class)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,6 +1,11 @@
## v0.1.3

- Function `minimize_vocabulary()` made more generic and renamed to `charfilter()`.
- Method `nchars()` renamed to `charcount()`.
- Function `duplicates_get_groups()` renamed to `as_docgroups()`.
- Argument `s_attribute` of method `detect_duplicates()` is now used generically: a new
  column named after the s-attribute carries the document metadata in the result.

## v0.1.2

File renamed without changes.
60 changes: 51 additions & 9 deletions R/detect_duplicates.R
@@ -56,7 +56,7 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
#' use(pkg = "duplicates")
#'
#' charcount <- corpus("REUTERS2") %>%
#' nchars(
#' charcount(
#' p_attribute = "word",
#' char_regex = "[a-zA-Z]",
#' lowercase = TRUE,
@@ -77,6 +77,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplica
#' mc = parallel::detectCores() - 2L,
#' vocab = vocab
#' )
#'
#' docgrps <- as_docgroups(dupl)
setMethod("detect_duplicates", "partition_bundle",
function(
x, n = 5L, min_shingle_length = n,
@@ -87,10 +89,10 @@ setMethod("detect_duplicates", "partition_bundle",
){
started <- Sys.time()

if (verbose) cli_progress_step("get sizes and dates")
if (verbose) cli_progress_step("get sizes and metadata")
sizes <- sapply(x@objects, slot, "size")
dates <- s_attributes(x, s_attribute = s_attribute, unique = TRUE)
dates <- lapply(dates, `[[`, 1L) # a sanity measure
s_attr <- s_attributes(x, s_attribute = s_attribute, unique = TRUE)
s_attr <- lapply(s_attr, `[[`, 1L) # a sanity measure

if (verbose) cli_progress_step("make ngram matrix")
ngrams <- ngrams(
@@ -113,10 +115,11 @@ setMethod("detect_duplicates", "partition_bundle",
dt[, "duplicate_size" := sizes[dt[["duplicate_name"]]]]

if (nrow(dt) > 0L){
dt[, "date" := unlist(dates[dt[["name"]]])]
dt[, "date_duplicate" := unlist(dates[dt[["duplicate_name"]]])]
dt[, (s_attribute) := unlist(s_attr[dt[["name"]]])]
dt[, (paste("duplicate", s_attribute, sep = "_")) := unlist(s_attr[dt[["duplicate_name"]]])]
} else {
dt[, "date" := character()][, "date_duplicate" := character()]
dt[, (s_attribute) := character()]
dt[, (paste("duplicate", s_attribute, sep = "_")) := character()]
}
dt
}
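
# Illustrative sketch (not part of the commit): the metadata columns are now
# named after the s-attribute, e.g. for s_attribute = "text_id" the result
# gains "text_id" and "duplicate_text_id" instead of the formerly hard-coded
# "date" and "date_duplicate".
s_attribute <- "text_id"
paste("duplicate", s_attribute, sep = "_")
# [1] "duplicate_text_id"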
@@ -136,7 +139,9 @@ setMethod("detect_duplicates", "partition_bundle",
#' chars <- chars[grep("[a-zA-Z]", names(chars))]
#' char <- names(chars[order(chars, decreasing = FALSE)][1:20])
#'
#' detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
#'
#' docgrps <- as_docgroups(dupl)
#' @rdname detect_duplicates
setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
started <- Sys.time()
@@ -203,4 +208,41 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, t
duplicate_name = sim_min@Dimnames[[2]][sim_min@j + 1],
similarity = sim_min@x
)
})
})
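
# Side note (illustrative, not package code): in the triplet representation of
# a sparse matrix, the slots @i and @j hold 0-based row/column indices and @x
# the non-zero values, which is why 1 is added before indexing into Dimnames.
library(Matrix)
m <- Matrix::sparseMatrix(i = 1L, j = 2L, x = 0.9, dims = c(3L, 3L), repr = "T")
m@j + 1 # 1-based column index of the non-zero entry, i.e. 2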


#' Get groups of near-duplicate documents
#'
#' @param x A `data.table` with detected duplicates, as returned by
#'   `detect_duplicates()`.
#' @return A `data.table` with a `group` id and the document `name`, followed by
#'   the document metadata.
#' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute
#' @export as_docgroups
#' @rdname docgroups
as_docgroups <- function(x){
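# Decompose the graph of pairwise duplicate relations into connected
# components; each component is one group of near-duplicate documents.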

ids <- x[, c("name", "duplicate_name")] |>
as.data.frame() |>
igraph::graph_from_data_frame() |>
igraph::decompose() |>
lapply(igraph::get.vertex.attribute, name = "name")

dt <- data.table(
name = unlist(ids),
group = unlist(
mapply(rep, seq_along(ids), sapply(ids, length), SIMPLIFY = FALSE),
recursive = FALSE
)
)

duplcols <- grep("duplicate_", colnames(x), value = TRUE)
metadata <- unique(rbindlist(
list(
x[, setdiff(colnames(x), c(duplcols, "similarity")), with = FALSE],
x[, duplcols, with = FALSE]
),
use.names = FALSE
))

y <- metadata[dt, on = "name"]
setcolorder(y, neworder = c("group", "name"))
y
}
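
# Illustrative sketch (not part of the package): a toy result in the shape
# produced by detect_duplicates() to show how as_docgroups() merges pairwise
# matches into groups. Document names and the "date" metadata are made up.
library(data.table)
dupl <- data.table(
  name = c("doc1", "doc2"),
  size = c(100L, 120L),
  date = c("2023-01-01", "2023-01-02"),
  duplicate_name = c("doc2", "doc3"),
  duplicate_size = c(120L, 90L),
  duplicate_date = c("2023-01-02", "2023-01-03"),
  similarity = c(0.95, 0.91)
)
as_docgroups(dupl)
# doc1, doc2 and doc3 end up in a single group, each row keeping its metadata.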
118 changes: 118 additions & 0 deletions R/encode.R
@@ -0,0 +1,118 @@

#' Encode annotation data
#'
#' Add structural attributes to a CWB corpus based on the annotation data that
#' has been generated.
#' @param x Annotation data: a `data.table` with columns `cpos_left`,
#'   `cpos_right`, `is_duplicate` and `duplicates`, as generated by
#'   `duplicates_as_annotation_data()`.
#' @param corpus ID of CWB corpus.
#' @param method Method passed to `cwbtools::s_attribute_encode()`, defaults to "R".
#' @importFrom data.table setDT
#' @importFrom cwbtools s_attribute_encode
duplicates_encode <- function(x, corpus, method = "R"){

corpus_obj <- corpus(corpus)

for (s_attr in c("is_duplicate", "duplicates")){
s_attribute_encode(
values = as.character(x[[s_attr]]),
data_dir = corpus_obj@data_dir,
s_attribute = s_attr,
corpus = corpus,
region_matrix = as.matrix(x[, c("cpos_left", "cpos_right")]),
method = method,
registry_dir = corpus_obj@registry_dir,
encoding = corpus_obj@encoding,
delete = TRUE,
verbose = TRUE
)
}
invisible(TRUE)
}
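
# Minimal sketch (assumption, not shipped with the commit) of the annotation
# data duplicates_encode() expects: one row per region, with corpus positions
# and the two columns that are written as structural attributes above.
library(data.table)
anno <- data.table(
  cpos_left = c(0L, 150L),
  cpos_right = c(149L, 310L),
  is_duplicate = c(FALSE, TRUE),
  duplicates = c("doc2", "doc1")
)
# duplicates_encode(x = anno, corpus = "REUTERS2", method = "R") # writes to the corpus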

.N <- NULL # to avoid warnings

#' Make annotation data
#'
#' @description
#' Turn the `data.table` with detected duplicates into annotation data with
#' corpus positions and a duplicate annotation.
#' @param drop A character vector of document IDs to be removed from the
#'   annotation data, useful for dropping documents known to be noise that
#'   should not enter the analysis.
#' @param cols Column(s) used to order documents within a group (passed to
#'   `data.table::setorderv()`); the top-ranking document of each group is
#'   treated as the original.
#' @param order Sort order(s) corresponding to `cols`, passed to
#'   `data.table::setorderv()` (`1L` ascending, `-1L` descending).
#' @param x Input `data.table`.
#' @param corpus ID of CWB corpus.
#' @param s_attribute Structural attribute to annotate.
#' @importFrom data.table setDT setnames setkeyv
#' @importFrom polmineR corpus
duplicates_as_annotation_data <- function(x, corpus, s_attribute, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){

groups <- as_docgroups(x)

if (!is.null(drop)){
groups <- groups[!groups[["name"]] %in% drop]
groups_n <- groups[, .N, by = "group"]
groups[groups_n, "group_size" := i.N, on = "group"]
groups <- groups[groups[["group_size"]] > 1L][, "group_size" := NULL]
}
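# Within each group, order the documents by `cols` and keep the top-ranking
# one as the original; all other group members are flagged as duplicates.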

original <- groups[,
setorderv(x = .SD, cols = cols, order = order)[1,],
by = "group", .SDcols = cols
][, "is_duplicate" := FALSE]
groups[original, "is_duplicate" := i.is_duplicate, on = "name"]
groups[, "is_duplicate" := ifelse(is.na(groups[["is_duplicate"]]), TRUE, groups[["is_duplicate"]])]
duplicates_dt <- groups[,
list(
name = .SD[["name"]],
is_duplicate = .SD[["is_duplicate"]],
duplicates = sapply(
1L:nrow(.SD),
function(i) paste(setdiff(.SD[["name"]], .SD[["name"]][i]), collapse = "|")
)
),
by = "group", .SDcols = c("name", "is_duplicate")
][, "group" := NULL]

# get regions ------------------------------------------------------------

corpus_obj <- corpus(corpus)
regions <- setDT(
RcppCWB::s_attribute_decode(
corpus = corpus,
data_dir = corpus_obj@data_dir,
s_attribute = s_attribute,
encoding = corpus_obj@encoding,
registry = corpus_obj@registry_dir,
method = "Rcpp"
)
)
setnames(regions, old = "value", new = s_attribute)
setkeyv(regions, s_attribute)

# finalize annotation data -----------------------------------------------

setnames(duplicates_dt, old = "name", new = s_attribute)
anno <- duplicates_dt[regions, on = s_attribute]
anno[,
"is_duplicate" := ifelse(
is.na(anno[["is_duplicate"]]),
FALSE,
anno[["is_duplicate"]]
)
]
anno[,
"duplicates" := ifelse(
is.na(anno[["duplicates"]]),
"",
anno[["duplicates"]]
)]
setcolorder(
anno,
c("cpos_left", "cpos_right", s_attribute, "is_duplicate", "duplicates")
)
setorderv(anno, cols = "cpos_left")
anno
}
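
# Illustrative end-to-end sketch (assumption, not shipped with this commit):
# from detected duplicates (`dupl`, as in the examples in R/detect_duplicates.R)
# to an encoded annotation of the REUTERS2 demo corpus. The s-attribute name
# "id" is a placeholder for whatever s-attribute identifies documents.
anno <- duplicates_as_annotation_data(
  x = dupl,
  corpus = "REUTERS2",
  s_attribute = "id"
)
duplicates_encode(x = anno, corpus = "REUTERS2", method = "R")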
