Commit b54f663: as_docgroups() renamed to docgroups()

Andreas Blätte authored and committed Jul 9, 2023
1 parent 2ac291b · commit b54f663
Showing 11 changed files with 61 additions and 56 deletions.
8 changes: 4 additions & 4 deletions DESCRIPTION
@@ -1,8 +1,8 @@
 Package: duplicates
 Type: Package
 Title: Near duplicate detection
-Version: 0.1.3
-Date: 2023-07-03
+Version: 0.1.4
+Date: 2023-07-09
 Author: Andreas Blaette
 Maintainer: Andreas Blaette <andreas.blaette@uni-due.de>
 Additional_repositories: https://polmine.github.io/drat
@@ -32,8 +32,8 @@ License: GPL-3
 Collate:
     'duplicates_package.R'
     'charcount.R'
-    'detect_duplicates.R'
+    'docsimil.R'
     'utils.R'
-    'encode.R'
+    'annodata.R'
 RoxygenNote: 7.2.3
 Roxygen: list(markdown = TRUE)
6 changes: 3 additions & 3 deletions NAMESPACE
@@ -1,11 +1,11 @@
 # Generated by roxygen2: do not edit by hand
 
-export(as_docgroups)
 export(charfilter)
-export(detect_duplicates)
+export(docgroups)
+export(docsimil)
 export(duplicates_as_annotation_data)
 exportMethods(charcount)
-exportMethods(detect_duplicates)
+exportMethods(docsimil)
 import(data.table)
 importFrom(Matrix,triu)
 importFrom(R6,R6Class)
5 changes: 5 additions & 0 deletions NEWS.md
@@ -1,3 +1,8 @@
+## v0.1.4
+
+- Method `detect_duplicates()` renamed to `docsimil()`.
+- Function `as_docgroups()` renamed to `docgroups()`.
+
 ## v0.1.3
 
 - Function `minimize_vocabulary()` more generic and renamed as `charfilter()`.
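Since this commit is purely a rename, migration is mechanical. A minimal sketch of the updated workflow, following the package's own examples (it assumes the REUTERS2 demo corpus registered via `use("duplicates")`, and a pruned vocabulary `vocab` prepared beforehand, e.g. with `charfilter()`; the vocabulary construction is elided here):

```r
library(polmineR)
library(duplicates)
use("duplicates")

x <- corpus("REUTERS2") |>
  split(s_attribute = "doc_id")

# docsimil() is the new name of detect_duplicates(); arguments are unchanged.
dupl <- docsimil(
  x = x,
  p_attribute = "word",
  s_attribute = "doc_id",
  mc = parallel::detectCores() - 2L,
  vocab = vocab  # assumed: pruned vocabulary built earlier, e.g. via charfilter()
)

# docgroups() is the new name of as_docgroups().
grps <- docgroups(dupl)
```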
4 changes: 2 additions & 2 deletions R/encode.R → R/annodata.R
@@ -30,15 +30,15 @@
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #'
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'   x = x,
 #'   p_attribute = "word",
 #'   s_attribute = "doc_id",
 #'   mc = parallel::detectCores() - 2L,
 #'   vocab = vocab
 #' )
 #'
-#' grps <- as_docgroups(dupl)
+#' grps <- docgroups(dupl)
 #'
 #' annodata <- duplicates_as_annotation_data(
 #'   x = grps,
34 changes: 17 additions & 17 deletions R/detect_duplicates.R → R/docsimil.R
@@ -1,5 +1,5 @@
-#' @rdname detect_duplicates
-setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
+#' @rdname docsimil
+setGeneric("docsimil", function(x, ...) standardGeneric("docsimil"))
 
 
 #' Detect Duplicates
@@ -34,8 +34,8 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
 #' @param verbose A `logical` value, whether to be verbose.
 #' @param ... Further arguments (unused).
 #' @param vocab Pruned vocabulary.
-#' @export detect_duplicates
-#' @rdname detect_duplicates
+#' @export docsimil
+#' @rdname docsimil
 #' @importFrom parallel mclapply
 #' @importFrom pbapply pblapply
 #' @importFrom stats setNames
@@ -70,16 +70,16 @@ setGeneric("detect_duplicates", function(x, ...) standardGeneric("detect_duplicates"))
 #' x <- corpus("REUTERS2") |>
 #'   split(s_attribute = "doc_id")
 #'
-#' dupl <- detect_duplicates(
+#' dupl <- docsimil(
 #'   x = x,
 #'   p_attribute = "word",
 #'   s_attribute = "doc_id",
 #'   mc = parallel::detectCores() - 2L,
 #'   vocab = vocab
 #' )
 #'
-#' docgrps <- as_docgroups(dupl)
-setMethod("detect_duplicates", "partition_bundle",
+#' docgrps <- docgroups(dupl)
+setMethod("docsimil", "partition_bundle",
   function(
     x, n = 5L, min_shingle_length = n,
     p_attribute = "word", s_attribute = "text_date",
@@ -103,7 +103,7 @@ setMethod("detect_duplicates", "partition_bundle",
       weigh(method = "tfidf") |>
       as.sparseMatrix()
 
-    dt <- detect_duplicates(
+    dt <- docsimil(
       x = m,
       n = n,
       min_shingle_length = min_shingle_length,
@@ -139,11 +139,11 @@ setMethod("detect_duplicates", "partition_bundle",
 #' chars <- chars[grep("[a-zA-Z]", names(chars))]
 #' char <- names(chars[order(chars, decreasing = FALSE)][1:20])
 #'
-#' dupl <- detect_duplicates(x = x, n = 5L, char = char, threshold = 0.6)
+#' dupl <- docsimil(x = x, n = 5L, char = char, threshold = 0.6)
 #'
-#' docgrps <- as_docgroups(dupl, cols = "name", order = 1L)
-#' @rdname detect_duplicates
-setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
+#' docgrps <- docgroups(dupl, cols = "name", order = 1L)
+#' @rdname docsimil
+setMethod("docsimil", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
   started <- Sys.time()
 
   stopifnot(is.character(char))
@@ -173,7 +173,7 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
 
   m <- weigh(tdm, method = "tfidf") |> as.sparseMatrix()
 
-  detect_duplicates(
+  docsimil(
     x = m,
     n = n,
     min_shingle_length = min_shingle_length,
@@ -183,9 +183,9 @@ setMethod("detect_duplicates", "list", function(x, n = 5L, min_shingle_length = n, char = "", threshold = 0.9, verbose = TRUE, mc = FALSE){
   }
 )
 
-#' @rdname detect_duplicates
+#' @rdname docsimil
 #' @export
-setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
+setMethod("docsimil", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
 
   # Very short documents may result in shingle lengths below n, and this
   # may result in an undesired complete similarity. So drop short
@@ -220,9 +220,9 @@ setMethod("detect_duplicates", "dgCMatrix", function(x, n, min_shingle_length, threshold, verbose){
 #' @param cols XXX.
 #' @param order XXX.
 #' @importFrom igraph graph_from_data_frame decompose get.vertex.attribute
-#' @export as_docgroups
+#' @export docgroups
 #' @rdname docgroups
-as_docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
+docgroups <- function(x, drop = NULL, cols = c("size", "name"), order = c(1L, 1L)){
 
   ids <- x[, c("name", "duplicate_name")] |>
     as.data.frame() |>
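The comment in the `dgCMatrix` method above explains why very short documents are dropped: a document shorter than the shingle size yields only one (or no) shingle, so two trivially short texts can score as completely similar. A generic character-shingling illustration of this effect, not the package's actual implementation (whose body is elided in this diff):

```r
# Hypothetical illustration: character shingles of length n.
shingles <- function(txt, n = 5L) {
  if (nchar(txt) < n) return(character(0L))  # nothing to compare
  unique(substring(txt, seq_len(nchar(txt) - n + 1L), seq(n, nchar(txt))))
}

jaccard <- function(a, b) length(intersect(a, b)) / length(union(a, b))

# A document exactly n characters long has a single shingle: itself. Any
# identical short snippet then scores a "perfect" 1, which is why documents
# whose shingle count falls below min_shingle_length are filtered out.
jaccard(shingles("oil!!"), shingles("oil!!"))        # 1: complete similarity
jaccard(shingles("crude oil prices rose in Kuwait"),
        shingles("crude oil prices fell in Kuwait")) # partial similarity
```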
8 changes: 4 additions & 4 deletions man/docgroups.Rd

Some generated files are not rendered by default.

28 changes: 14 additions & 14 deletions man/detect_duplicates.Rd → man/docsimil.Rd

Some generated files are not rendered by default.

6 changes: 3 additions & 3 deletions man/duplicates_as_annotation_data.Rd

Some generated files are not rendered by default.

10 changes: 5 additions & 5 deletions tests/testthat/test_detect_duplicates.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "run duplicate detection",
@@ -24,7 +24,7 @@ test_that(
     x <- corpus("REUTERS2") |>
       polmineR::split(s_attribute = "doc_id")
 
-    dupl <- detect_duplicates(
+    dupl <- docsimil(
       x = x,
       p_attribute = "word",
       s_attribute = "doc_id",
@@ -41,7 +41,7 @@ test_that(
       polmineR::split(s_attribute = "doc_id") %>%
       get_token_stream(p_attribute = "word", collapse = "")
 
-    dupl2 <- detect_duplicates(
+    dupl2 <- docsimil(
       x = x,
       n = 5L,
       char = names(charcount[1:20]),
@@ -58,8 +58,8 @@ test_that(
 
     # --------------------------------------------------------------------------
 
-    docgroups1 <- as_docgroups(dupl)
-    docgroups2 <- as_docgroups(dupl2, cols = "name", order = 1L)
+    docgroups1 <- docgroups(dupl)
+    docgroups2 <- docgroups(dupl2, cols = "name", order = 1L)
 
     expect_identical(docgroups1[["group"]], docgroups2[["group"]])
     expect_identical(docgroups1[["name"]], docgroups2[["name"]])
2 changes: 1 addition & 1 deletion tests/testthat/test_nchars.R
@@ -2,7 +2,7 @@ library(polmineR)
 library(duplicates)
 use("duplicates")
 
-testthat::context("detect_duplicates")
+testthat::context("docsimil")
 
 test_that(
   "crosscheck charcount",
6 changes: 3 additions & 3 deletions vignettes/vignette.Rmd
@@ -37,7 +37,7 @@ vocab <- corpus("REUTERS2") %>%
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
   x = x,
   p_attribute = "word",
   s_attribute = "doc_id",
@@ -54,7 +54,7 @@ dupl <- detect_duplicates(
 x <- corpus("REUTERS2") |>
   split(s_attribute = "doc_id")
 
-dupl <- detect_duplicates(
+dupl <- docsimil(
   x = x,
   p_attribute = "word",
   s_attribute = "doc_id",
@@ -67,7 +67,7 @@ dupl <- detect_duplicates(
 
 ## Write to corpus
 
 ```{r}
-groups <- as_docgroups(dupl)
+groups <- docgroups(dupl)
 annodata <- duplicates_as_annotation_data(
   x = groups,
