Commit b54f663: as_docgroups() renamed to docgroups()

Andreas Blätte authored and committed Jul 9, 2023
1 parent 2ac291b · commit b54f663
Showing 11 changed files with 61 additions and 56 deletions.
8 changes: 4 additions & 4 deletions DESCRIPTION
@@ -1,8 +1,8 @@
 Package: duplicates
 Type: Package
 Title: Near duplicate detection
-Version: 0.1.3
-Date: 2023-07-03
+Version: 0.1.4
+Date: 2023-07-09
 Author: Andreas Blaette
 Maintainer: Andreas Blaette <andreas.blaette@uni-due.de>
 Additional_repositories: https://polmine.github.io/drat
@@ -32,8 +32,8 @@ License: GPL-3
 Collate:
     'duplicates_package.R'
     'charcount.R'
-    'detect_duplicates.R'
+    'docsimil.R'
     'utils.R'
-    'encode.R'
+    'annodata.R'
 RoxygenNote: 7.2.3
 Roxygen: list(markdown = TRUE)
6 changes: 3 additions & 3 deletions NAMESPACE
@@ -1,11 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
-export(as_docgroups)
 export(charfilter)
-export(detect_duplicates)
+export(docgroups)
+export(docsimil)
 export(duplicates_as_annotation_data)
 exportMethods(charcount)
-exportMethods(detect_duplicates)
+exportMethods(docsimil)
 import(data.table)
 importFrom(Matrix,triu)
 importFrom(R6,R6Class)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,3 +1,8 @@
+## v0.1.4
+
+- Method `detect_duplicates()` renamed to `docsimil()`.
+- Function `as_docgroups()` renamed to `docgroups()`.
+
 ## v0.1.3
 
 - Function `minimize_vocabulary()` more generic and renamed as `charfilter()`.
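Since this commit is purely a rename, migration is mechanical. A minimal sketch of the updated workflow, following the package's own examples (it assumes the REUTERS2 demo corpus registered via `use("duplicates")`, and a pruned vocabulary `vocab` prepared beforehand, e.g. with `charfilter()`; the vocabulary construction is elided here):

```r
library(polmineR)
library(duplicates)
use("duplicates")

x <- corpus("REUTERS2") |>
  split(s_attribute = "doc_id")

# docsimil() is the new name of detect_duplicates(); arguments are unchanged.
dupl <- docsimil(
  x = x,
  p_attribute = "word",
  s_attribute = "doc_id",
  mc = parallel::detectCores() - 2L,
  vocab = vocab  # assumed: pruned vocabulary built earlier, e.g. via charfilter()
)

# docgroups() is the new name of as_docgroups().
grps <- docgroups(dupl)
```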
4 changes: 2 additions & 2 deletions R/encode.R → R/annodata.R
@@ -30,15 +30,15 @@
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #'
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'   x = x,
 #'   p_attribute = "word",
 #'   s_attribute = "doc_id",
 #'   mc = parallel::detectCores() - 2L,
 #'   vocab = vocab
 #' )
 #'
-#' grps <- as_docgroups(dupl)
+#' grps <- docgroups(dupl)
 #'
 #' annodata <- duplicates_as_annotation_data(
 #'   x = grps,
34 changes: 17 additions & 17 deletions R/detect_duplicates.R → R/docsimil.R
@@ -1,5 +1,5 @@
-#' @rdname detect_duplicates
-setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
+#' @rdname docsimil
+setGeneric("docsimil", function(x, ...) standardGeneric("docsimil"))
 
 
 #' Detect Duplicates
@@ -34,8 +34,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
 #' @param verbose A `logical` value, whether to be verbose.
 #' @param ... Further arguments (unused).
 #' @param vocab Pruned vocabulary.
-#' @export detect_duplicates
-#' @rdname detect_duplicates
+#' @export docsimil
+#' @rdname docsimil
 #' @importFrom parallel mclapply
 #' @importFrom pbapply pblapply
 #' @importFrom stats setNames
@@ -70,16 +70,16 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #'
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'   x = x,
 #'   p_attribute = "word",
 #'   s_attribute = "doc_id",
 #'   mc = parallel::detectCores() - 2L,
 #'   vocab = vocab
 #' )
 #'
-#' docgrps <- as_docgroups(dupl)
-setMethod("detect_duplicates", "partition_bundle",
+#' docgrps <- docgroups(dupl)
+setMethod("docsimil", "partition_bundle",
   function(
     x, n = 5L, min_shingle_length = n,
     p_attribute = "word", s_attribute = "text_date",
@@ -103,7 +103,7 @@ setMethod("detect_duplicates", "partition_bundle",
       weigh(method = "tfidf") |>
       as.sparseMatrix()
 
-    dt <- detect_duplicates(
+    dt <- docsimil(
       x = m,
       n = n,
       min_shingle_length = min_shingle_length,
@@ -139,11 +139,11 @@ setMethod("detect_duplicates", "partition_bundle",
 #' chars <- chars[grep("[a-zA-Z]", names(chars))]
 #' char <- names(chars[order(chars, decreasing = FALSE)][1:20])
 #'
-#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
+#' dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6)
 #'
-#' docgrps <- as_docgroups(dupl, cols = "name", order = 1L)
-#' @rdname detect_duplicates
-setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
+#' docgrps <- docgroups(dupl, cols = "name", order = 1L)
+#' @rdname docsimil
+setMethod("docsimil", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
   started <- Sys.time()
 
   stopifnot(is.character(char))
@@ -173,7 +173,7 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
 
   m <- weigh(tdm, method = "tfidf") |> as.sparseMatrix()
 
-  detect_duplicates(
+  docsimil(
     x = m,
     n = n,
     min_shingle_length = min_shingle_length,
@@ -183,9 +183,9 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
   }
 )
 
-#' @rdname detect_duplicates
+#' @rdname docsimil
 #' @export
-setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
+setMethod("docsimil", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
 
   # Very short documents may result in shingle lengths below n, and this
   # may result in an undesired complete similarity. So drop short
@@ -220,9 +220,9 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
 #' @param cols XXX.
 #' @param order XXX.
 #' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute
-#' @export as_docgroups
+#' @export docgroups
 #' @rdname docgroups
-as_docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
+docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
 
   ids <- x[, c("name", "duplicate_name")] |>
     as.data.frame() |>
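The comment in the `dgCMatrix` method above explains why very short documents are dropped: a document shorter than the shingle size yields only one (or no) shingle, so two trivially short texts can score as completely similar. A generic character-shingling illustration of this effect, not the package's actual implementation (whose body is elided in this diff):

```r
# Hypothetical illustration: character shingles of length n.
shingles <- function(txt, n = 5L) {
  if (nchar(txt) < n) return(character(0L))  # nothing to compare
  unique(substring(txt, seq_len(nchar(txt) - n + 1L), seq(n, nchar(txt))))
}

jaccard <- function(a, b) length(intersect(a, b)) / length(union(a, b))

# A document exactly n characters long has a single shingle: itself. Any
# identical short snippet then scores a "perfect" 1, which is why documents
# whose shingle count falls below min_shingle_length are filtered out.
jaccard(shingles("oil!!"), shingles("oil!!"))        # 1: complete similarity
jaccard(shingles("crude oil prices rose in Kuwait"),
        shingles("crude oil prices fell in Kuwait")) # partial similarity
```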
8 changes: 4 additions & 4 deletions man/docgroups.Rd

Some generated files are not rendered by default.

28 changes: 14 additions & 14 deletions man/detect_duplicates.Rd → man/docsimil.Rd

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions man/duplicates_as_annotation_data.Rd

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions tests/testthat/test_detect_duplicates.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "run duplicate detection",
@@ -24,7 +24,7 @@ test_that(
     x <- corpus("REUTERS2") |>
       polmineR::split(s_attribute = "doc_id")
 
-    dupl <- detect_duplicates(
+    dupl <- docsimil(
       x = x,
       p_attribute = "word",
       s_attribute = "doc_id",
@@ -41,7 +41,7 @@ test_that(
       polmineR::split(s_attribute = "doc_id") %>%
       get_token_stream(p_attribute = "word", collapse = "")
 
-    dupl2 <- detect_duplicates(
+    dupl2 <- docsimil(
       x = x,
       n = 5L,
       char = names(charcount[1:20]),
@@ -58,8 +58,8 @@ test_that(
 
     # --------------------------------------------------------------------------
 
-    docgroups1 <- as_docgroups(dupl)
-    docgroups2 <- as_docgroups(dupl2, cols = "name", order = 1L)
+    docgroups1 <- docgroups(dupl)
+    docgroups2 <- docgroups(dupl2, cols = "name", order = 1L)
 
     expect_identical(docgroups1[["group"]], docgroups2[["group"]])
     expect_identical(docgroups1[["name"]], docgroups2[["name"]])
2 changes: 1 addition & 1 deletion tests/testthat/test_nchars.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "crosscheck charcount",
6 changes: 3 additions & 3 deletions vignettes/vignette.Rmd
@@ -37,7 +37,7 @@ vocab <- corpus("REUTERS2") %>%
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
   x = x,
   p_attribute = "word",
   s_attribute = "doc_id",
@@ -54,7 +54,7 @@ dupl <- detect_duplicates(
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
   x = x,
   p_attribute = "word",
   s_attribute = "doc_id",
@@ -67,7 +67,7 @@ dupl <- detect_duplicates(
 
 ## Write to corpus
 
 ```{r}
-groups <- as_docgroups(dupl)
+groups <- docgroups(dupl)
 annodata <- duplicates_as_annotation_data(
   x = groups,
