Overhaul the entire documentation with markdown
chainsawriot committed Nov 7, 2023
1 parent 1c79527 commit 2a72c79
Showing 30 changed files with 157 additions and 239 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -12,7 +12,7 @@ License: GPL (>= 3)
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.0
RoxygenNote: 7.2.3
URL: https://github.com/gesistsa/sweater
BugReports: https://github.com/gesistsa/sweater/issues
LinkingTo:
21 changes: 8 additions & 13 deletions R/ect.R
@@ -3,14 +3,12 @@
#' This function estimates the Embedding Coherence Test (ECT) of word embeddings (Dev & Phillips, 2019). If possible, please use [query()] instead.
#'
#' @inheritParams weat
#' @return A list with class \code{"ect"} containing the following components:
#' \describe{
#' \item{\code{$A_words}}{the input A_words}
#' \item{\code{$B_words}}{the input B_words}
#' \item{\code{$S_words}}{the input S_words}
#' \item{\code{$u_a}}{Cosine similarity between each word vector of S_words and average vector of A_words}
#' \item{\code{$u_b}}{Cosine similarity between each word vector of S_words and average vector of B_words}
#' }
#' @return A list with class `"ect"` containing the following components:
#' * `$A_words` the input A_words
#' * `$B_words` the input B_words
#' * `$S_words` the input S_words
#' * `$u_a` Cosine similarity between each word vector of S_words and average vector of A_words
#' * `$u_b` Cosine similarity between each word vector of S_words and average vector of B_words
#' @seealso
#' [ect_es()] can be used to obtain the effect size of the test.
#' [plot_ect()] can be used to visualize the result.
@@ -37,9 +35,8 @@
#' "females", "sisters", "aunt", "aunts", "niece", "nieces")
#' garg_f1 <- ect(googlenews, S1, A1, B1)
#' plot_ect(garg_f1)
#' @author Chung-hong Chan
#' @references
#' Dev, S., & Phillips, J. (2019, April). [Attenuating bias in word vectors.](https://proceedings.mlr.press/v89/dev19a.html) In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 879-887). PMLR.
#' @export
ect <- function(w, S_words, A_words, B_words, verbose = FALSE) {
w_lab <- rownames(w)
@@ -65,7 +62,6 @@ ect <- function(w, S_words, A_words, B_words, verbose = FALSE) {
#' This function calculates the Spearman Coefficient of an Embedding Coherence Test. The value ranges from -1 to +1 and a larger value indicates less bias. If possible, please use [calculate_es()] instead.
#' @param x an ect object from the [ect()] function.
#' @return Spearman Coefficient
#' @author Chung-hong Chan
#' @references
#' Dev, S., & Phillips, J. (2019, April). [Attenuating bias in word vectors.](https://proceedings.mlr.press/v89/dev19a.html) In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 879-887). PMLR.
#' @export
@@ -80,10 +76,9 @@ ect_es <- function(x) {
#' Plot an ECT result on a two-dimensional plane
#'
#' This function plots the words in `S_words` on a 2D plane according to their association with the average vectors of `A_words` and `B_words`. An equality line is also added. Words along the equality line have less bias. Words located on the upper side of the equality line have a stronger association with `A_words` and vice versa.
#' @param x an ect object from the \link{ect} function.
#' @param x an ect object from the [ect] function.
#' @param ... additional parameters to the underlying [plot()] function
#' @return a plot
#' @author Chung-hong Chan
#' @export
plot_ect <- function(x, ...) {
if (!"ect" %in% class(x)) {
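A minimal sketch of how the effect size described in the `ect()` documentation above can be recovered from a result's `$u_a` and `$u_b` components. The similarity values below are made up for illustration; this is not part of the commit or the package's internals.

```r
# Toy cosine-similarity values standing in for ect()$u_a and ect()$u_b
u_a <- c(0.31, 0.12, 0.58, 0.44)  # S_words vs. average A_words vector
u_b <- c(0.29, 0.10, 0.61, 0.40)  # S_words vs. average B_words vector
# ect_es() reports the Spearman Coefficient; closer to +1 means less bias
stats::cor(u_a, u_b, method = "spearman")
```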
17 changes: 7 additions & 10 deletions R/mac.R
@@ -3,14 +3,11 @@
#' This function calculates the mean average cosine similarity (MAC) score proposed in Manzini et al (2019). If possible, please use [query()] instead.
#'
#' @inheritParams weat
#' @return A list with class \code{"mac"} containing the following components:
#' \describe{
#' \item{\code{$P}}{a vector of cosine similarity values for every word in S_words}
#' \item{\code{$S_words}}{the input S_words}
#' \item{\code{$A_words}}{the input A_words}
#' }
#' \code{\link{mac_es}} can be used to obtain the effect size of the test.
#' @author Chung-hong Chan
#' @return A list with class `"mac"` containing the following components:
#' * `$P` a vector of cosine similarity values for every word in S_words
#' * `$S_words` the input S_words
#' * `$A_words` the input A_words
#' [mac_es()] can be used to obtain the effect size of the test.
#' @examples
#' data(googlenews)
#' S1 <- c("janitor", "statistician", "midwife", "bailiff", "auctioneer",
@@ -49,8 +46,8 @@ mac <- function(w, S_words, A_words, verbose = FALSE) {
#' Calculation of MAC Effect Size
#'
#' This function calculates the mean of cosine distance values. If possible, please use [calculate_es()] instead.
#'
#' @param x an object from the function \link{mac}
#'
#' @param x an object from the function [mac]
#' @return Mean of all cosine similarity values
#' @author Chung-hong Chan
#' @export
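As a hedged illustration of the `$P` component documented above (for each word in `S_words`, the average cosine similarity to the `A_words` vectors), the following uses toy random vectors and base R only; it is a sketch, not the package's implementation.

```r
cos_sim <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
set.seed(42)
S <- matrix(rnorm(6), nrow = 2)  # two target word vectors (rows), toy data
A <- matrix(rnorm(9), nrow = 3)  # three attribute word vectors, toy data
# One value per word in S_words: mean cosine similarity to all A_words
P <- apply(S, 1, function(s) mean(apply(A, 1, function(a) cos_sim(s, a))))
mean(P)  # cf. mac_es(): the mean over all values in $P
```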
25 changes: 11 additions & 14 deletions R/misc.R
@@ -16,7 +16,6 @@ NULL
#' The file may or may not have a "verification line", i.e. a first line containing the dimensionality of the matrix. If the verification line exists, the function will check the returned matrix for correctness.
#' @param x path to your text file
#' @return a dense matrix
#' @author Chung-hong Chan
#' @export
read_word2vec <- function(x) {
init_lines <- strsplit(readLines(x, n = 2), " ")
@@ -60,7 +59,6 @@ read_word2vec <- function(x) {
#' For `ect`, this function calls [plot_ect()]. For other tests (except `weat`), this function plots the bias of words in `S` as a Cleveland Dot Plot. Plotting the result of `weat` is not supported.
#' @param x an S3 object returned from mac, rnd, semaxis, nas or rnsb
#' @return a plot
#' @author Chung-hong Chan
#' @export
plot_bias <- function(x) {
if ("ect" %in% class(x)) {
@@ -82,27 +80,26 @@ plot_bias <- function(x) {
#' This function calculates the effect of a query.
#' @param x an S3 object returned from a query, either by the function [query()] or underlying functions such as [mac()]
#' @param ... additional parameters for the effect size functions
#' \describe{
#' \item{\code{r}}{for `weat`: a boolean to denote whether convert the effect size to biserial correlation coefficient.}
#' \item{\code{standardize}}{for `weat`: a boolean to denote whether to correct the difference by the standard division. The standardized version can be interpreted the same way as Cohen's d. }
#' }
#' * `r` for `weat`: a boolean to denote whether to convert the effect size to a biserial correlation coefficient.
#' * `standardize` for `weat`: a boolean to denote whether to correct the difference by the standard deviation. The standardized version can be interpreted the same way as Cohen's d.
#' @return effect size
#' @author Chung-hong Chan
#' @details
#' The following methods are supported.
#' \describe{
#' \item{\code{mac}}{mean cosine distance value. The value makes sense only for comparison (e.g. before and after debiasing). But a lower value indicates greater association between the target words and the attribute words.}
#' \item{\code{rnd}}{sum of all relative norm distances. It equals to zero when there is no bias.}
#' \item{\code{rnsb}}{Kullback-Leibler divergence of the predicted negative probabilities, P, from the uniform distribution. A lower value indicates less bias.}
#' \item{\code{ect}}{Spearman Coefficient of an Embedding Coherence Test. The value ranges from -1 to +1 and a larger value indicates less bias.}
#' \item{\code{weat}}{The standardized effect size (default) can be interpreted the same way as Cohen's D.}
#' }
#' * `mac` mean cosine distance value. The value is meaningful only in comparison (e.g. before and after debiasing), but a lower value indicates a greater association between the target words and the attribute words.
#' * `rnd` sum of all relative norm distances. It equals zero when there is no bias.
#' * `rnsb` Kullback-Leibler divergence of the predicted negative probabilities, P, from the uniform distribution. A lower value indicates less bias.
#' * `ect` Spearman Coefficient of an Embedding Coherence Test. The value ranges from -1 to +1 and a larger value indicates less bias.
#' * `weat` The standardized effect size (default) can be interpreted the same way as Cohen's d.
#' @seealso [weat_es()], [mac_es()], [rnd_es()], [rnsb_es()], [ect_es()]
#' @references
#' Caliskan, A., Bryson, J. J., & Narayanan, A. (2017). Semantics derived automatically from language corpora contain human-like biases. Science, 356(6334), 183-186. \doi{10.1126/science.aal4230}
#'
#' Dev, S., & Phillips, J. (2019, April). [Attenuating bias in word vectors.](https://proceedings.mlr.press/v89/dev19a.html) In The 22nd International Conference on Artificial Intelligence and Statistics (pp. 879-887). PMLR.
#'
#' Garg, N., Schiebinger, L., Jurafsky, D., & Zou, J. (2018). Word embeddings quantify 100 years of gender and ethnic stereotypes. Proceedings of the National Academy of Sciences, 115(16), E3635-E3644. \doi{10.1073/pnas.1720347115}
#'
#' Manzini, T., Lim, Y. C., Tsvetkov, Y., & Black, A. W. (2019). [Black is to criminal as caucasian is to police: Detecting and removing multiclass bias in word embeddings.](https://arxiv.org/abs/1904.04047) arXiv preprint arXiv:1904.04047.
#'
#' Sweeney, C., & Najafian, M. (2019, July). [A transparent framework for evaluating unintended demographic bias in word embeddings.](https://aclanthology.org/P19-1162/) In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (pp. 1662-1667).
#' @export
calculate_es <- function(x, ...) {
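The "verification line" mentioned in the `read_word2vec()` documentation above can be illustrated with a toy file. This is a sketch of the word2vec text format under assumed toy values, not the package's actual parser.

```r
lines <- c("2 3",                  # verification line: 2 words, 3 dimensions
           "apple 0.1 0.2 0.3",
           "pear 0.4 0.5 0.6")
tf <- tempfile(fileext = ".txt")
writeLines(lines, tf)
# Parse the remaining lines into a dense matrix with words as rownames
parsed <- strsplit(readLines(tf)[-1], " ")
m <- do.call(rbind, lapply(parsed, function(p) as.numeric(p[-1])))
rownames(m) <- vapply(parsed, `[[`, character(1), 1)
stopifnot(identical(dim(m), c(2L, 3L)))  # matches the verification line
```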
15 changes: 6 additions & 9 deletions R/nas.R
@@ -16,15 +16,12 @@
#'
#' This function quantifies the bias in a set of word embeddings by Caliskan et al (2017). In comparison to WEAT introduced in the same paper, this method is more suitable for continuous ground truth data. See Figure 1 and Figure 2 of the original paper. If possible, please use [query()] instead.
#' @inheritParams weat
#' @author Chung-hong Chan
#' @return A list with class \code{"nas"} containing the following components:
#' \describe{
#' \item{\code{$P}}{a vector of normalized association score for every word in S}
#' \item{\code{$raw}}{a list of raw results used for calculating normalized association scores}
#' \item{\code{$S_words}}{the input S_words}
#' \item{\code{$A_words}}{the input A_words}
#' \item{\code{$B_words}}{the input B_words}
#' }
#' @return A list with class `"nas"` containing the following components:
#' * `$P` a vector of normalized association scores for every word in S_words
#' * `$raw` a list of raw results used for calculating normalized association scores
#' * `$S_words` the input S_words
#' * `$A_words` the input A_words
#' * `$B_words` the input B_words
#' @export
#' @references
#' Caliskan, A., Bryson, J. J., & Narayanan, A. (2017). Semantics derived automatically from language corpora contain human-like biases. Science, 356(6334), 183-186. \doi{10.1126/science.aal4230}
25 changes: 10 additions & 15 deletions R/query.R
@@ -2,14 +2,14 @@
if (missing(w)) {
stop("w must be provided.")
}
if (missing(S_words) | missing(A_words)) {
if (missing(S_words) || missing(A_words)) {
stop("S_words and A_words must be provided.")
}
if (!method %in% c("guess", "weat", "mac", "nas", "semaxis", "rnsb", "rnd", "ect")) {
stop("Unknown method. Available methods are: guess, weat, mac, nas, semaxis, rnsb, rnd, ect.")
}
if (method == "guess") {
if (missing(T_words) & missing(B_words)) {
if (missing(T_words) && missing(B_words)) {
method <- "mac"
} else if (missing(T_words)) {
if (verbose) {
@@ -27,21 +27,16 @@
#'
#' This function makes a query based on the supplied parameters. The object can then be displayed by the S3 method [print.sweater()] and plotted by [plot.sweater()].
#' @param ... additional parameters for the underlying function
#' \describe{
#' \item{\code{l}}{for "semaxis": an integer indicates the number of words to augment each word in A and B based on cosine , see An et al (2018). Default to 0 (no augmentation).}
#' \item{\code{levels}}{for "rnsb": levels of entries in a hierarchical dictionary that will be applied (see [quanteda::dfm_lookup()])}
#' }
#' * `l` for "semaxis": an integer indicating the number of words to augment each word in A and B based on cosine similarity; see An et al (2018). Defaults to 0 (no augmentation).
#' * `levels` for "rnsb": levels of entries in a hierarchical dictionary that will be applied (see [quanteda::dfm_lookup()])
#' @param method string, the method to be used to make the query. Available options are: `weat`, `mac`, `nas`, `semaxis`, `rnsb`, `rnd`, `ect` and `guess`. If "guess", the function selects one of the following methods based on your provided wordsets.
#' \itemize{
#' \item{S_words & A_words - }{"mac"}
#' \item{S_words, A_words & B_words - }{"rnd"}
#' \item{S_words, T_words, A_words & B_words - }{"weat"}
#' }
#' * S_words & A_words - "mac"
#' * S_words, A_words & B_words - "rnd"
#' * S_words, T_words, A_words & B_words - "weat"
#' @inheritParams weat
#' @param x a sweater S3 object
#' @return a sweater S3 object
#' @seealso [weat()], [mac()], [nas()], [semaxis()], [rnsb()], [rnd()], [ect()]
#' @author Chung-hong Chan
#' @examples
#' data(googlenews)
#' S1 <- c("janitor", "statistician", "midwife", "bailiff", "auctioneer",
@@ -68,11 +63,11 @@
#' plot(garg_f1)
#' @export
query <- function(w, S_words, T_words, A_words, B_words, method = "guess", verbose = FALSE, ...) {
method <- .guess(w = w, S_words= S_words, T_words= T_words,
A_words = A_words, B_words= B_words, method = method,
method <- .guess(w = w, S_words = S_words, T_words= T_words,
A_words = A_words, B_words = B_words, method = method,
verbose = verbose)
switch(method,
"weat" = weat(w = w, S_words= S_words, T_words = T_words, A_words = A_words, B_words = B_words, verbose = verbose),
"weat" = weat(w = w, S_words = S_words, T_words = T_words, A_words = A_words, B_words = B_words, verbose = verbose),
"mac" = mac(w = w, S_words = S_words, A_words = A_words, verbose = verbose),
"nas" = nas(w = w, S_words = S_words, A_words = A_words, B_words = B_words, verbose = verbose),
"semaxis" = semaxis(w = w, S_words = S_words, A_words = A_words, B_words = B_words, verbose = verbose, ...),
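The "guess" rules listed in the `method` documentation above can be sketched as a small dispatcher. The real `.guess()` shown in this diff works on `missing()` arguments and performs more validation; the function below is only an illustration of the selection rule.

```r
# Sketch: which method "guess" would pick from the supplied wordsets
guess_method <- function(S_words, A_words, B_words = NULL, T_words = NULL) {
  if (is.null(T_words) && is.null(B_words)) return("mac")  # S & A
  if (is.null(T_words)) return("rnd")                      # S, A & B
  "weat"                                                   # S, T, A & B
}
guess_method(S_words = "nurse", A_words = "he")                   # "mac"
guess_method("nurse", "he", B_words = "she")                      # "rnd"
guess_method("nurse", "he", B_words = "she", T_words = "doctor")  # "weat"
```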
20 changes: 8 additions & 12 deletions R/rnd.R
@@ -7,17 +7,14 @@
#' Relative Norm Distance
#'
#' This function calculates the relative norm distance (RND) of word embeddings. If possible, please use [query()] instead.
#'
#'
#' @inheritParams weat
#' @return A list with class \code{"rnd"} containing the following components:
#' \describe{
#' \item{\code{$norm_diff}}{a vector of relative norm distances for every word in S_words}
#' \item{\code{$S_words}}{the input S_words}
#' \item{\code{$A_words}}{the input A_words}
#' \item{\code{$B_words}}{the input B_words}
#' }
#' \code{\link{rnd_es}} can be used to obtain the effect size of the test.
#' @author Chung-hong Chan
#' @return A list with class `"rnd"` containing the following components:
#' * `$norm_diff` a vector of relative norm distances for every word in S_words
#' * `$S_words` the input S_words
#' * `$A_words` the input A_words
#' * `$B_words` the input B_words
#' [rnd_es()] can be used to obtain the effect size of the test.
#' @examples
#' data(googlenews)
#' S1 <- c("janitor", "statistician", "midwife", "bailiff", "auctioneer",
@@ -64,9 +61,8 @@ rnd <- function(w, S_words, A_words, B_words, verbose = FALSE) {
#' Calculation of sum of all relative norm distances
#'
#' This function calculates the sum of all relative norm distances from the relative norm distance test. If possible, please use [calculate_es()] instead.
#' @param x an object from the function \link{rnd}
#' @param x an object from the function [rnd]
#' @return Sum of all relative norm distances
#' @author Chung-hong Chan
#' @export
#' @references
#' Garg, N., Schiebinger, L., Jurafsky, D., & Zou, J. (2018). Word embeddings quantify 100 years of gender and ethnic stereotypes. Proceedings of the National Academy of Sciences, 115(16), E3635-E3644. \doi{10.1073/pnas.1720347115}
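The relative norm distance documented above can be sketched for a single word with toy vectors: one entry of `$norm_diff` is the Euclidean distance of the word to the average `A_words` vector minus its distance to the average `B_words` vector, and `rnd_es()` sums these over `S_words`. All values below are made up.

```r
euc <- function(x) sqrt(sum(x^2))
v     <- c(0.2, 0.1, 0.7)    # a word vector from S_words (made up)
avg_A <- c(0.3, 0.0, 0.6)    # average vector of A_words (made up)
avg_B <- c(-0.1, 0.4, 0.2)   # average vector of B_words (made up)
# One entry of $norm_diff; a negative value means v is closer to A_words
euc(v - avg_A) - euc(v - avg_B)
```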
23 changes: 9 additions & 14 deletions R/rnsb.R
@@ -4,15 +4,13 @@
#'
#' @inheritParams weat
#' @param levels levels of entries in a hierarchical dictionary that will be applied (see [quanteda::dfm_lookup()])
#' @return A list with class \code{"rnsb"} containing the following components:
#' \describe{
#' \item{\code{$classifer}}{ a logistic regression model with L2 regularization trained with LiblineaR}
#' \item{\code{$A_words}}{the input A_words}
#' \item{\code{$B_words}}{the input B_words}
#' \item{\code{$S_words}}{the input S_words}
#' \item{\code{$P}}{the predicted negative sentiment probabilities}
#' }
#' \code{\link{rnsb_es}} can be used to obtain the effect size of the test.
#' @return A list with class `"rnsb"` containing the following components:
#' * `$classifer` a logistic regression model with L2 regularization trained with LiblineaR
#' * `$A_words` the input A_words
#' * `$B_words` the input B_words
#' * `$S_words` the input S_words
#' * `$P` the predicted negative sentiment probabilities
#' [rnsb_es()] can be used to obtain the effect size of the test.
#' @examples
#' data(googlenews)
#' S1 <- c("janitor", "statistician", "midwife", "bailiff", "auctioneer",
@@ -36,7 +34,6 @@
#' "females", "sisters", "aunt", "aunts", "niece", "nieces")
#' garg_f1 <- rnsb(googlenews, S1, A1, B1)
#' plot_bias(garg_f1)
#' @author Chung-hong Chan
#' @references
#' Sweeney, C., & Najafian, M. (2019, July). [A transparent framework for evaluating unintended demographic bias in word embeddings.](https://aclanthology.org/P19-1162/) In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (pp. 1662-1667).
#' @export
@@ -73,9 +70,8 @@ rnsb <- function(w, S_words, A_words, B_words, levels = 1, verbose = FALSE) {
#' Calculation of the Kullback-Leibler divergence
#'
#' This function calculates the Kullback-Leibler divergence of the predicted negative probabilities, P, from the uniform distribution. If possible, please use [calculate_es()] instead.
#' @param x an rnsb object from the \link{rnsb} function.
#' @param x an rnsb object from the [rnsb] function.
#' @return the Kullback-Leibler divergence.
#' @author Chung-hong Chan
#' @references
#' Sweeney, C., & Najafian, M. (2019, July). [A transparent framework for evaluating unintended demographic bias in word embeddings.](https://aclanthology.org/P19-1162/) In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (pp. 1662-1667).
#' @export
@@ -90,7 +86,7 @@ rnsb_es <- function(x) {
}
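The Kullback-Leibler divergence that `rnsb_es()` computes — divergence of the predicted negative-sentiment probabilities `P` from the uniform distribution — can be sketched with toy probabilities (assumed values, base R only; not the package code).

```r
P <- c(0.4, 0.3, 0.2, 0.1)           # toy predicted negative probabilities
P <- P / sum(P)                      # normalize to a probability distribution
Q <- rep(1 / length(P), length(P))   # uniform reference distribution
sum(P * log(P / Q))                  # KL(P || Q); 0 would mean no bias
```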

## plot_rnsbs <- function(rnsb1, rnsb2, rnsb1_label = "rnsb1", rnsb2_label = "rnsb2") {
## groupnames <- c(names(rnsb1$P), names(rnsb2$P))
## groupnames <- c(names(rnsb1$P), names(rnsb2$P))
## values <- c(rnsb1$P, rnsb2$P)
## labels <- c(rep(rnsb1_label, length(rnsb1$P)), rep(rnsb2_label, length(rnsb2$P)))
## diff <- values - c(rnsb1$P, rnsb1$P)
@@ -99,4 +95,3 @@
## data_to_plot <- data_to_plot[!is.na(data_to_plot$values)]
## ggplot2::ggplot(data_to_plot, ggplot2::aes(x = forcats::fct_reorder(groupnames, diff), y = values, fill = labels)) + ggplot2::geom_bar(stat = "identity", position=ggplot2::position_dodge()) + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)) + ggplot2::xlab("S") + ggplot2::ylab("P") + ggplot2::geom_hline(yintercept = equality, lty = 2, color = "darkgray") + ggplot2::coord_flip()
## }
