kwic update

SLCLADAL · May 7, 2024 · 57e4a19 · 57e4a19
1 parent 0435d76
commit 57e4a19
Show file tree

Hide file tree

Showing 7 changed files with 2,652 additions and 802 deletions.
diff --git a/cbs/kwics_cb.Rmd b/cbs/kwics_cb.Rmd
diff --git a/cbs/kwics_cb.html b/cbs/kwics_cb.html
diff --git a/content/bibliography.bib b/content/bibliography.bib
@@ -285,7 +285,7 @@ @article{benoit2021package
 @article{benoit2018quanteda,
   author       = {Benoit, Kenneth and Watanabe, Kohei and Wang, Haiyan and Nulty, Paul and Obeng, Adam and M{\"u}ller, Stefan and Matsuo, Akitaka},
   year         = 2018,
-  title        = {quanteda: An R package for the quantitative analysis of textual data},
+  title        = {quanteda: an R package for the quantitative analysis of textual data},
   journal      = {Journal of Open Source Software},
   volume       = 3,
   number       = 30,

diff --git a/content/kwics.Rmd b/content/kwics.Rmd
diff --git a/docs/bibliography.bib b/docs/bibliography.bib
@@ -285,7 +285,7 @@ @article{benoit2021package
 @article{benoit2018quanteda,
   author       = {Benoit, Kenneth and Watanabe, Kohei and Wang, Haiyan and Nulty, Paul and Obeng, Adam and M{\"u}ller, Stefan and Matsuo, Akitaka},
   year         = 2018,
-  title        = {quanteda: An R package for the quantitative analysis of textual data},
+  title        = {quanteda: an R package for the quantitative analysis of textual data},
   journal      = {Journal of Open Source Software},
   volume       = 3,
   number       = 30,

diff --git a/docs/kwics.html b/docs/kwics.html
diff --git a/rscripts/prepam_new.R b/rscripts/prepam_new.R
@@ -0,0 +1,65 @@
+# function that prepares a table with the columns w1, w2 and O11 for 
+# extracting association measures.
+
+# load required packages
+require(dplyr)
+require(stringr)
+require(tokenizers)
+require(quanteda)
+require(tidytext)
+
+
+#' Build a table of word co-occurrence counts and contingency margins.
+#'
+#' @param x Character vector of text; its elements are pasted into one string.
+#' @param left_context,right_context Not used by the body; kept so existing
+#'   calls still work. Co-occurrence is counted per sentence (document).
+#' @return A table grouped by `w2` with columns w1, w2, O11, N, R1, O12, R2,
+#'   C1, O21, C2 and O22. Call dplyr::ungroup() before ungrouped summaries.
+prepam <- function(x, left_context = 5, right_context = 5) {
+  # the argument 'x' (not a global object) is the input of the pipeline
+  sentences <- x %>%
+    # concatenate the elements of the input text
+    paste0(collapse = " ") %>%
+    # separate possessives and contractions
+    stringr::str_replace_all(stringr::fixed("'"), " '") %>%
+    stringr::str_replace_all(stringr::fixed("’"), " '") %>%
+    # split text into sentences and flatten the resulting list
+    tokenizers::tokenize_sentences() %>%
+    unlist() %>%
+    # remove non-word characters
+    stringr::str_replace_all("\\W", " ") %>%
+    stringr::str_replace_all("[^[:alnum:] ]", " ") %>%
+    # remove superfluous white spaces
+    stringr::str_squish() %>%
+    # convert to lower case
+    tolower()
+
+  cooc <- sentences %>%
+    # tokenize the sentences using the quanteda package
+    quanteda::tokens() %>%
+    # create a document-feature matrix (dfm), one document per sentence
+    quanteda::dfm() %>%
+    # create a feature co-occurrence matrix (fcm); tri = FALSE keeps both
+    # triangles of the matrix
+    quanteda::fcm(tri = FALSE) %>%
+    # convert to a long table using the tidytext package
+    tidytext::tidy() %>%
+    dplyr::relocate(term, document, count) %>%
+    # rename columns by position
+    dplyr::rename(w1 = 1, w2 = 2, O11 = 3) %>%
+    # N: total of all co-occurrence counts
+    dplyr::mutate(N = sum(O11))
+
+  cooc %>%
+    # calculate R1, O12, and R2 per w1
+    dplyr::group_by(w1) %>%
+    dplyr::mutate(R1 = sum(O11), O12 = R1 - O11, R2 = N - R1) %>%
+    dplyr::ungroup(w1) %>%
+    # calculate C1, O21, C2, and O22 per w2
+    dplyr::group_by(w2) %>%
+    dplyr::mutate(C1 = sum(O11),
+                  O21 = C1 - O11,
+                  C2 = N - C1,
+                  O22 = R2 - O21)
+}