From 182718b1e38fe1333f116f10bbe01b1625487fcc Mon Sep 17 00:00:00 2001 From: Johannes Gruber Date: Mon, 21 May 2018 12:59:18 +0200 Subject: [PATCH] Added factor analysis to dna_cluster and renamed dna_plotMDS to dna_plotCoordinates --- Update rDNA package.R | 115 ++++++ manual/dna-manual.Rnw | 120 +++--- rDNA/DESCRIPTION | 4 +- rDNA/NAMESPACE | 3 +- rDNA/R/rDNA.R | 365 ++++++++++-------- rDNA/man/dna_cluster.Rd | 70 ++-- ...{dna_plotMDS.Rd => dna_plotCoordinates.Rd} | 23 +- rDNA/man/dna_toIgraph.Rd | 3 +- 8 files changed, 438 insertions(+), 265 deletions(-) create mode 100644 Update rDNA package.R rename rDNA/man/{dna_plotMDS.Rd => dna_plotCoordinates.Rd} (81%) diff --git a/Update rDNA package.R b/Update rDNA package.R new file mode 100644 index 00000000..5a22b1af --- /dev/null +++ b/Update rDNA package.R @@ -0,0 +1,115 @@ +# how to write an R package https://hilaryparker.com/2014/04/29/writing-an-r-package-from-scratch/ + +library("devtools") +library("roxygen2") + +#Update Documentation +#setwd("C:/Users/binis/Documents/GitHub/dna/rDNA") +#setwd("F:/Dropbox/Github/dna/rDNA") +# setwd("/home/johannes/Documents/Github 4/dna/rDNA/") +setwd("/home/johannes/Documents/Github/dna/rDNA/") +desc <- readLines("DESCRIPTION") +date <- desc[grepl("^Date:", desc)] +date2 <- gsub("[^[:digit:]-]", "", date) +desc[grepl("^Date:", desc)] <- gsub(date2, Sys.Date(), desc[grepl("^Date:", desc)]) +vers <- desc[grepl("^Version:", desc)] +vers2 <- gsub("[^[:digit:].]", "", vers) +vers3 <- readline(prompt = paste("New Version? 
Old:", vers2)) +desc[grepl("^Version:", desc)] <- gsub(vers2, vers3, desc[grepl("^Version:", desc)]) +writeLines(desc, "DESCRIPTION") + + +roxygen2::roxygenise(clean = TRUE) +setwd("..") +devtools::check("rDNA") +devtools::spell_check("rDNA", dict = "en_GB", ignore = c( + "CLASSPATH", + "dd", + "dna", + "DNA's", + "docTitle", + "excludeTypes", + "Gruber", + "Leifeld", + "igraph", + "ggplot", + "java", + "Jaccard", + "etc", + "mySQL", + "nw", + "org", + "pts", + "wd", + "plottable", + "linetype", + "bw", + "color", + "cutree", + "eigen", + "lineend", + "louvain", + "mcquitty", + "MDS", + "pam", + "plotDendro", + "RColorBrewer", + "walktrap", + "yyyy", + "onemode", + "twomode", + "getDocuments", + "clust", + "dh", + "drl", + "graphopt", + "kk", + "knitr", + "lables", + "lgl", + "mds", + "POSIXct", + "qualifierAggregation", + "setDocuments", + "dist", + "edgelist", + "eventSequence", + "isoMDS", + "plotMDS", + "timewindow", + "vegdist" +)) +#source("https://install-github.me/MangoTheCat/goodpractice") +#goodpractice::gp("rDNA") +#lintr::lint("rDNA.R") +#lintr::lint_package("rDNA") + +#devtools::install("rDNA") + +# update github +system("git status") + +system("git add -A") +commit_message <- readline(prompt = "Commit message") +system(paste0("git commit -m'", + commit_message, + "'")) + +#build +build(pkg = "rDNA", manual = TRUE) + +# create the package in wd +setwd("C:/Users/binis/Documents/GitHub/dna") +install("rDNA", args = c("--no-multiarch", "--no-test-load")) + +# change version number in manual + + + +### quick test +setwd("/home/johannes/Documents/Github/dna/rDNA/") +roxygen2::roxygenise(clean = TRUE) +setwd("..") +#devtools::check("rDNA") +devtools::install("rDNA") + diff --git a/manual/dna-manual.Rnw b/manual/dna-manual.Rnw index 2cdd902a..77f3fff0 100644 --- a/manual/dna-manual.Rnw +++ b/manual/dna-manual.Rnw @@ -116,13 +116,25 @@ <>= library("knitr") # get latest R version -site <- readLines("https://cran.r-project.org/bin/windows/base/", n = 10) +site 
<- tryCatch({ + readLines("https://cran.r-project.org/bin/windows/base/", n = 10) +}, +error = function(e) {warning("Internet connection neccessary to check for newest R version")}) R_vers <- site[grepl("^", site)] R_vers <- as.character(regmatches(R_vers, gregexpr("\\d+.\\d+.\\d+", R_vers))) +if (length(R_vers) == 0) { + R_vers <- "3.5.0" +} # get latest RStudio version -site <- readLines("https://www.rstudio.com/products/rstudio/download/") +site <- tryCatch({ + readLines("https://www.rstudio.com/products/rstudio/download/") +}, +error = function(e) {warning("Internet connection neccessary to check for newest R version")}) RS_vers <- site[grepl("<h4 id=\"download\"><strong>RStudio Desktop", site)] RS_vers <- as.character(regmatches(RS_vers, gregexpr("\\d+.\\d+.\\d+", RS_vers))) +if (length(RS_vers) == 0) { + RS_vers <- "1.1.447" +} # set global chunk options opts_chunk$set(fig.path = 'figure/workshop-', fig.align = 'center', fig.show = 'hold', error = FALSE) @@ -621,8 +633,8 @@ Otherwise please continue to the section for the operating system you wish to in For more experienced users, here is a short version of the steps described below: \begin{enumerate} \item (On Mac: install \href{https://support.apple.com/downloads/DL1572/en_US/javaforosx.dmg}{Apple's legacy version of \java}---even though we will never use it.) -\item Install \java\ Runtime Environment (JRE) (Version 8) on your computer. -\item (On Windows and Mac: set up the \code{JAVA\_HOME} to the installation path of your JRE.) +\item Install \java\ Development Kit (JDK) (Version 8) on your computer. +\item (On Windows and Mac: set up the \code{JAVA\_HOME} to the installation path of your JDK.) \item Download the newest executable JAR from \url{https://github.com/leifeld/dna/releases}. \item (On Linux: make the JAR file executable.) \\ (On Mac: allow executing apps from an unidentified developer.) 
@@ -643,7 +655,7 @@ During the installation, you can accept all the default options, including the i \begin{figure}[tbp] \includegraphics[frame, width=\textwidth]{03-1-downljava} - \caption{Downloading JRE from Oracle} + \caption{Downloading JDK from Oracle} \label{fig:downljava} \end{figure} @@ -861,9 +873,11 @@ $sudo gdebi libgstreamer-plugins-base0.10-0_0.10.36-2_amd64.deb # And then clean up $sudo rm libgstreamer0.10-0_0.10.36-1.5_amd64.deb libgstreamer-plugins-base0.10-0_0.10.36-2_amd64.deb @ - \item For Linux there is another system dependecy for \rdna\ which helps with handling units of physical quantities in the plot functions that come with the package. You should install this using: + \item For Linux there are a few other system dependencies for \rdna. You should install these using: <<eval=FALSE, engine = 'bash', results = 'tex'>>= $sudo apt-get install libudunits2-dev +$sudo apt-get build-dep libcurl4-gnutls-dev +$sudo apt-get install libcurl4-gnutls-dev @ \item After the installation has finished, you can use \R\ by opening \rstudio. \end{enumerate} @@ -2096,8 +2110,8 @@ kable(dt, format = "latex", booktabs = TRUE, linesep = "", row_spec(6, bold = T) @ -Each value in Table~\ref{tab:disagree} represents a speciic value in a three-dimensional array which was descibed in Chapter~\ref{chp:algorithms}. -One of the $x_{ijk}$ values is, for example, the count for $i =$ \emph{``CO2 legislation will not hurt theeconomy.''}, $j =$ \emph{``Senate''} and $k =$ \emph{disgaree}. +Each value in Table~\ref{tab:disagree} represents a specific value in a three-dimensional array which was described in Chapter~\ref{chp:algorithms}. +One of the $x_{ijk}$ values is, for example, the count for $i =$ \emph{``CO2 legislation will not hurt the economy.''}, $j =$ \emph{``Senate''} and $k =$ \emph{disagree}. Look in Table~\ref{tab:disagree} to see that this specific value is 2. 
You can also see that the Senate and Sierra Club co-reference three concepts: `\emph{`CO2 legislation will not hurt the economy.''}, \emph{``Emissions legislation should regulate CO2.''} and \emph{``There should be legislation to regulate emissions.''}. @@ -2121,7 +2135,7 @@ For the co-referenced concepts, the edge weight is number of statments from Sena \item[Congruence] This option is only available for one-mode networks. It means that only similarity or matches on the qualifier variable are counted in order to construct an edge. -In case of a binary second variable (e.\,g., (dis-)agreement) this means that only statements are counted were, for example, two organisations co-support or both co-reject a concept. +In case of a binary second variable (e.\,g., (dis-)agreement) this means that the only statements counted are those where, for example, two organisations co-support or both co-reject a concept. You can see how this changes the output when you compare Table~\ref{tab:onemode} and Table~\ref{tab:qual3}. Focusing again on the edge between the Senate and the Sierra Club, you can see now that the value dropped from seven to two as they do not support or reject all of the same concepts. We can repeat the same calculation, this time using Equation~\ref{eq:congruence_binary} from Section~\ref{subsec:congruence}: @@ -2363,9 +2377,9 @@ Only the option \code{no} is available in both types---which switches off normal \item \textbf{Two-mode network} \begin{description} \item[Activity] This divides the edge weights through the activity of the node from the first variable. -For example, in and concept $\times$ organisation network in which organisation A has made four statements in total, the edge with concept B, which A mentioned once, has the value 0.25. +For example, in a concept $\times$ organisation network in which organisation A has made four statements in total, the edge with concept B, which A mentioned once, has the value 0.25. 
\item[Prominence] This divides the edge weights through the prominence of the node from the second variable. -For example, in and concept $\times$ organisation network in which concept B was mentioned eight times by all organisations and once by organisation A, the edge of A and B has the value 0.125. +For example, in a concept $\times$ organisation network in which concept B was mentioned eight times by all organisations and once by organisation A, the edge of A and B has the value 0.125. \end{description} \item \textbf{One-mode network} \begin{description} @@ -2383,9 +2397,9 @@ Find this algorithm in Equation~\ref{eq:cosine}. \end{itemize} Normalisation can be used in the different networks described in Section~\ref{sec:qualifier} to correct potential biases introduced by very active nodes. By using a threshold value on the edge weights before visualising the network, normalisation can make it easier to remove low-intensity ties without discriminating against organisations with a low media profile. -Furthermore, the two normalisation algorithm which are based on vector similarities (Cosine and Jaccard) prepare networks to be fed into hierarchical cluster analyses, nonmetric multidimensional scaling, or other clustering techniques that are based on distance or similarity measures in order to identify coalitions in a policy debate as an alternative to community detection. +Furthermore, the two normalisation algorithms which are based on vector similarities (Cosine and Jaccard) prepare networks to be fed into hierarchical cluster analyses, nonmetric multidimensional scaling, or other clustering techniques that are based on distance or similarity measures in order to identify coalitions in a policy debate as an alternative to community detection. Details about normalisation can be found in \citet{leifeld2017discourse}. 
-In Table~\ref{tab:normal} you can see what how the two-mode network from Table~\ref{tab:twomode} looks like after applying the activity normalisation algorithm. +In Table~\ref{tab:normal} you can see how the two-mode network from Table~\ref{tab:twomode} looks like after applying the activity normalisation algorithm. <<eval=TRUE, echo=FALSE>>= conn <- dna_connection(dna_sample(verbose = FALSE)) @@ -3126,7 +3140,7 @@ The following plot shows this by employing the full potential of the function. <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= clust <- dna_cluster(conn, - variable = "person", + variable1 = "person", attribute1 = "value", attribute2 = "type", cutree.k = 2, @@ -3342,24 +3356,24 @@ One common tool to get around this problem is to reduce the number of dimensions That is excatly what (non-metric) multidimensional scaling (MDS) does.\footnote{Specifically we use Kruskal's non-metric multidimensional scaling which makes most sense for our kind of network data.} Taking agreement and disagreement information towards all concepts, MDS can reduce the differences and similarities between actors to plot them in a two-dimensional space. -In \rdna\ we can perform this with the now well-known \code{dna\_cluster} function and the \code{dna\_plotMDS} command. +In \rdna\ we can perform this with the now well-known \code{dna\_cluster} function and the \code{dna\_plotCoordinates} command. <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= clust <- dna_cluster(conn) -dna_plotMDS(clust) +dna_plotCoordinates(clust) @ Each point in this dotplot represents an organisation, in this case, and the points are highlighted with colours and shapes which represent their membership in one of the clusters. 
-Clusters are derived in this case with the \code{pam} function from the \texttt{cluster} package \citep{maechler2017cluster} and the silhouette width is used to assess the best number of clusters automatically by default, which in this case happens to be two. +Clusters are derived in this case with the \code{pam} function from the \texttt{cluster} package \citep{maechler2017cluster} and the silhouette width is used to assess the best number of clusters automatically by default, which in this case happens to be one. However, looking at the plot, you probably wonder why there only appear to be three dots. In fact, there is one point for every one of the seven organisations, but since they are so similar, they are plotted in almost exactly the same place. There are, however, tools to make them visible nevertheless. The first of these tools is called jittering: by adding random noise to the data, it is possible to prevent overplotting. -In the \code{dna\_plotMDS} function, this can be done by providing one or two numeric values to the \code{jitter} argument: +In the \code{dna\_plotCoordinates} function, this can be done by providing one or two numeric values to the \code{jitter} argument: <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= -dna_plotMDS(clust, jitter = c(0.5, 1.7)) +dna_plotCoordinates(clust, jitter = c(0.5, 1.7)) @ Now you can see points for all of the seven organisations. 
@@ -3373,7 +3387,7 @@ As long as you do not change the seed, plots will look the same every time you p Yet, if you change the seed, this will alter the position of the jittered points: <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= -dna_plotMDS(clust, jitter = c(0.5, 1.7), seed = 23455) +dna_plotCoordinates(clust, jitter = c(0.5, 1.7), seed = 23455) @ Use the jittering option cautiously though, since it distorts the appearance of the plot heavily if you choose appearance high jitter values as in the examples above. @@ -3383,45 +3397,45 @@ Use option \code{label = TRUE} to plot the actor labels closely to their respect In this case, the labels will be moved instead of the points if actors appear very closely together. <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= -dna_plotMDS(clust, label = TRUE) +dna_plotCoordinates(clust, label = TRUE) @ The downside in this case is that the polygons disappeared again, since you need at least three points in one cluster, before the become visible. -Like in the plot functions we showed before, \code{dna\_plotMDS} again comes with several arguments to style the plots. +Like in the plot functions we showed before, \code{dna\_plotCoordinates} again comes with several arguments to style the plots. Specifically, you can provide colours to \code{custom\_colours} and numeric values to \code{custom\_shape}. -Another important point to highlight about \code{dna\_plotMDS} is that there are two additional cluster algorithms to determine the groups. +Another important point to highlight about \code{dna\_plotCoordinates} is that there are two additional cluster algorithms to determine the groups. The first one, \code{pam}, was already mentioned above. 
The second one, \code{cluster\_louvain} is from the \texttt{igraph} package \citep{csardi2006igraph} and can be chosen with \code{clust\_method = "louvain"}: <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= -dna_plotMDS(clust, - label = TRUE, - custom_colours = c("red", "green", "blue"), - custom_shape = c(4,5), - clust_method = "louvain") +dna_plotCoordinates(clust, + label = TRUE, + custom_colours = c("red", "green", "blue"), + custom_shape = c(4,5), + clust_method = "louvain") @ If you would rather use one of those explained above, you can simply set the option in \code{dna\_cluster} and then choose \code{clust\_method = "inherit"}: <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= clust <- dna_cluster(conn, clust.method = "edge_betweenness", cutree.k = 2) -dna_plotMDS(clust, - draw_polygons = TRUE, - label = TRUE, - jitter = c(1.5, 1.7), - clust_method = "inherit", - seed = 12345) +dna_plotCoordinates(clust, + draw_polygons = TRUE, + label = TRUE, + jitter = c(1.5, 1.7), + clust_method = "inherit", + seed = 12345) @ Other ways to style the plot include setting the labels of the axis: <<eval=TRUE, message = FALSE, warning = FALSE, results = 'tex', fig.width = 8, fig.height = 4, crop = TRUE>>= -dna_plotMDS(clust, - axis_labels = c("Dimension A", "Dimension B"), - stress = FALSE, - title = character()) +dna_plotCoordinates(clust, + axis_labels = c("Dimension A", "Dimension B"), + stress = FALSE, + title = character()) @ Or using again \texttt{ggplot2} commands to modify the plot much further: @@ -3431,17 +3445,17 @@ Or using again \texttt{ggplot2} commands to modify the plot much further: # ggplot library("ggplot2") clust$group <- gsub("Group", "Cluster", clust$group) -dna_plotMDS(clust, - jitter = c(1.5, 1.7), - clust_method = "inherit", - axis_labels = c("Dimension A", "Dimension B"), - custom_shape = c(1, 5), - stress = FALSE, - title = 
character(), - label = TRUE, - label_size = 2, - point_size = 3, - label_background = TRUE) + +dna_plotCoordinates(clust, + jitter = c(1.5, 1.7), + clust_method = "inherit", + axis_labels = c("Dimension A", "Dimension B"), + custom_shape = c(1, 5), + stress = FALSE, + title = character(), + label = TRUE, + label_size = 2, + point_size = 3, + label_background = TRUE) + theme_bw() + labs(caption = paste0("Nonmetric MDS with STRESS = ", round(attributes(clust$mds)$stress, 3), @@ -3464,7 +3478,7 @@ dna_plotMDS(clust, % TODO \section{Network plots} -We have shown above already, how you can use the \R\ infrastructure to produce some basic network plots with the \texttt{statnet} suite of packages. +We have shown above already how you can use the \R\ infrastructure to produce some basic network plots with the \texttt{statnet} suite of packages. Besides that, there are a number of packages in the \R\ universe that are quite capable to cater to your network plotting needs. The already mentioned \texttt{igraph} package \citep{csardi2006igraph}, for example, comes with very powerful functions for plotting networks. Other packages, such as \texttt{networkD3} \citep{allaire2017networkD3} are capable of creating interactive network plots, which are great to share, for example, on a website. @@ -3506,7 +3520,7 @@ In \code{dna\_plotNetwork}, more than a dozen options from the \texttt{igraph} p \item[fr] Spreads the nodes based on the force-directed algorithm of Fruchterman and Reingold. See \code{?igraph::with\_fr} - \item[gem] Place nodes on the plane using the GEM force-directed layout algorithm. See \code{?igraph::with\_gem}. + \item[gem] Places nodes on the plane using the GEM force-directed layout algorithm. See \code{?igraph::with\_gem}. \item[graphopt] Employs the Graphopt algorithm based on alternating attraction and repulsion to place nodes. 
@@ -3547,7 +3561,7 @@ grid <- grid.arrange(nw_fr, nw_graphopt, nw_mds, nw_randomly) @ -There are, again several options to style the plot. +There are again several options to style the plot. Some critics, however, doubt the usefulness of the common network plots---often referred to as hairballs---you see above. \citet{krzywinski2012hive}, for example, have argued that these network plots ``lack reproducibility and perceptual uniformity because they do not use a node coordinate system''. In the function \code{dna\_plotNetwork} a seed is automatically set to ensure at least reproducibility when you run the same code. You can change the argument \code{seed} to get an idea of how much chance is involved in calculating the layout. @@ -3649,7 +3663,7 @@ You should also be cautious with the option \code{removeStatements} since when s \subsection{Adding newspaper articles using LexisNexisTools} One common way of retrieving texts for analysis in \dna\ is by downloading articles from the commercial newspaper archive LexisNexis. Many university libraries have access to this database which maintains a collection of newspaper articles from many major outlets across Europe and North America. -It's powerful search engine also allows for a finely grained search string to limit the number of articles for a specific topic. +Its powerful search engine also allows for a finely grained search string to limit the number of articles for a specific topic. To convert the raw files from LexisNexis though, we need another \R\ package: \texttt{LexisNexisTools} \citep{gruber2018lexis}. You can install this package via \code{devtools::install\_github("JBGruber/LexisNexisTools")}. 
diff --git a/rDNA/DESCRIPTION b/rDNA/DESCRIPTION index 0e728acf..4dd9647d 100755 --- a/rDNA/DESCRIPTION +++ b/rDNA/DESCRIPTION @@ -1,6 +1,6 @@ Package: rDNA -Version: 2.0.82 -Date: 2018-05-20 +Version: 2.0.83 +Date: 2018-05-21 Title: Discourse Network Analysis in R Authors@R: c(person("Philip", "Leifeld", email = "Philip.Leifeld@glasgow.ac.uk", diff --git a/rDNA/NAMESPACE b/rDNA/NAMESPACE index 65223754..f401acf0 100644 --- a/rDNA/NAMESPACE +++ b/rDNA/NAMESPACE @@ -12,11 +12,11 @@ export(dna_getDocuments) export(dna_gui) export(dna_init) export(dna_network) +export(dna_plotCoordinates) export(dna_plotDendro) export(dna_plotFrequency) export(dna_plotHeatmap) export(dna_plotHive) -export(dna_plotMDS) export(dna_plotNetwork) export(dna_plotTimeWindow) export(dna_removeDocument) @@ -65,6 +65,7 @@ importFrom(stats,cor) importFrom(stats,cutree) importFrom(stats,dendrapply) importFrom(stats,dist) +importFrom(stats,factanal) importFrom(stats,hclust) importFrom(stats,is.leaf) importFrom(stats,kmeans) diff --git a/rDNA/R/rDNA.R b/rDNA/R/rDNA.R index 597c7352..d2f60889 100644 --- a/rDNA/R/rDNA.R +++ b/rDNA/R/rDNA.R @@ -714,37 +714,42 @@ dna_setDocuments <- function(connection, #' Perform a cluster analysis based on a DNA connection. Clustering is performed #' on the distance matrix of a collated two-mode network for cluster methods #' "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median" -#' and "centroid" or on a one-mode network with the cluster methods -#' "edge_betweenness", "leading_eigen" and "walktrap" from the \link{igraph} -#' package. The collated two-mode network is constructed by retrieving -#' individual networks for each of the qualifiers levels and combining the -#' results by columns. You can look at this network with -#' \code{View(clust$network)} ("clust" being the outcome of a call to -#' \code{dna_cluster()}). 
+#' and "centroid" or on a one-mode "subtract" network (with negative values +#' replaced by 0) for the cluster methods "edge_betweenness", "leading_eigen" +#' and "walktrap" from the \link{igraph} package. The collated two-mode network +#' is constructed by retrieving individual networks for each of the qualifier's +#' levels and combining the results by columns. Alternatively, you can use a +#' two-mode "subtract" network with option \code{collate = FALSE}. You can look +#' at this network with \code{View(clust$network)} ("clust" being the outcome of +#' a call to \code{dna_cluster()}). #' #' The distance matrix is calculated either by \link[vegan]{vegdist}, if the #' collated two-mode network is binary, or by \link[stats]{dist}, in all other #' cases. #' #' Besides clustering, this function also performs non-metric multidimensional -#' scaling (see \link[MASS]{isoMDS}). The results can be extracted from the -#' object using \code{clust.l$mds} or can be plotted using \link{dna_plotMDS}. +#' scaling (see \link[MASS]{isoMDS}) and factor analysis (see +#' \link[stats]{factanal}). The results can be extracted from the returned +#' object using \code{clust.l$mds} or \code{clust.l$fa} respectively. Both +#' results can also be plotted using \link{dna_plotCoordinates}. #' #' @param connection A \code{dna_connection} object created by the -#' \link{dna_connection} function. -#' @param variable The first variable for network construction(see -#' \link{dna_network}). The second one defaults to \code{"concept"} but can be -#' provided via \code{...} if necessary (see \code{variable2} in -#' \link{dna_connection}). +#' \link{dna_connection} function. +#' @param variable1 The first variable for network construction (see +#' \link{dna_network}). Defaults to "organization". +#' @param variable2 The second variable for network construction (see +#' \link{dna_network}). Defaults to \code{"concept"}. +#' @param transpose Logical. 
If \code{TRUE}, variable2 is clustered instead of +#' variable1. +#' @param collate Logical. If \code{FALSE}, clustering is performed on a +#' "subtract" network instead of the collated two-mode network (see +#' \link{dna_network} for information on "subtract" networks). #' @param duplicates Setting for excluding duplicate statements before network -#' construction (for details see \link{dna_network}. If exclusion of -#' duplicates results in a binary matrix, \link[vegan]{vegdist} will be used -#' instead of \link[stats]{dist} to calculate the dissimilarity matrix. -#' @param clust.variable Choose if you want to cluster \code{"variable1"} (e.g., -#' "organization" by default) or \code{"variable2"} (e.g., "concept" by -#' default) from the network. +#' construction (for details see \link{dna_network}). If exclusion of +#' duplicates results in a binary matrix, \link[vegan]{vegdist} will be used +#' instead of \link[stats]{dist} to calculate the dissimilarity matrix. #' @param clust.method The agglomeration method to be used. When set to -#' \code{"ward.D"}, \code{"ward.D2"}, \code{"single"}, \code{"complete"}, +#' \code{"ward.D"}, \code{"ward.D2"}, \code{"single"}, \code{"complete"}, #' \code{"average"}, \code{"mcquitty"}, \code{"median"} or \code{"centroid"} #' the respective methods from \link[stats]{hclust} will be used. When set to #' \code{"edge_betweenness"}, \code{"leading_eigen"} or \code{"walktrap"} @@ -760,17 +765,15 @@ dna_setDocuments <- function(connection, #' @param cutree.k,cutree.h If cutree.k or cutree.h are provided, the tree from #' hierarchical clustering is cut into several groups. See $k$ and $h$ in #' \link[stats]{cutree} for details. -#' @param qualifierAggregation This argument refers to the qualifier aggregation -#' in \link{dna_network}. It is ignored for constructing the network for -#' clustering but is used for MDS. The two available options are -#' \code{"combine"} and \code{"subtract"} (see \link{dna_network}). 
-#' @param dimensions The desired dimension for the solution of the MDS. Only two -#' can be plotted but you might want to calculate more and then choose which -#' ones to plot. -#' @param ... Additional arguments passed to \link{dna_network}. This is -#' especially useful to set qualifier (defaults to \code{"agreement"}) and -#' normalization (defaults to \code{"no"}) if non-default values are needed -#' for clustering. +#' @param dimensions The desired dimension for the solution of the MDS and also +#' the desired number of factors to extract from the factor analysis. Only two +#' can be plotted but you might want to calculate more and then choose which +#' ones to plot. +#' @param ... Additional arguments passed to \link{dna_network}, +#' \link[stats]{factanal} and \link[MASS]{isoMDS}. This is especially useful +#' to set qualifier (defaults to \code{"agreement"}) and normalization +#' (defaults to \code{"no"}) if non-default values are needed in the clustered +#' network. #' #' @examples #' \dontrun{ @@ -782,14 +785,14 @@ dna_setDocuments <- function(connection, #' #' dna_plotDendro(clust.l) #' dna_plotHeatmap(clust.l) -#' dna_plotMDS(clust.l, +#' dna_plotCoordinates(clust.l, #' jitter = c(0.5, 0.7)) #' #' } #' @author Johannes B. 
Gruber #' @export #' @importFrom vegan vegdist -#' @importFrom stats setNames dist hclust cutree as.hclust +#' @importFrom stats setNames dist hclust cutree as.hclust factanal #' @importFrom igraph graph_from_adjacency_matrix cluster_leading_eigen #' cluster_walktrap E #' @importFrom dplyr summarise group_by_all @@ -799,18 +802,26 @@ dna_setDocuments <- function(connection, #' @importFrom grDevices chull #' @importFrom utils packageVersion capture.output dna_cluster <- function(connection, - variable = "organization", + variable1 = "organization", + variable2 = "concept", + transpose = FALSE, + collate = TRUE, duplicates = "document", - clust.variable = "variable1", clust.method = "ward.D2", attribute1 = "color", attribute2 = "value", cutree.k = NULL, cutree.h = NULL, - qualifierAggregation = "combine", dimensions = 2, ...) { dots <- list(...) + if (any(names(formals(factanal)) %in% names(dots))) { + dots_fa <- dots[names(dots) %in% names(formals(factanal))] + dots[names(dots) %in% names(formals(factanal))] <- NULL + } else { + dots_fa <- list() + } + if ("normalization" %in% names(dots)) { normalization_onemode <- ifelse(dots[["normalization"]] %in% c("no", "average", "Jaccard", "cosine"), @@ -860,55 +871,66 @@ dna_cluster <- function(connection, )) } } - dta <- lapply(lvls, function(l) { - excludeVals <- c(stats::setNames(list(lvls[!lvls == l]), - nm = qualifier), - excludeValues) - nw <- do.call(dna_network, - c(list(connection = connection, - networkType = "twomode", - variable1 = variable, - qualifier = qualifier, - qualifierAggregation = "ignore", - normalization = normalization_twomode, - isolates = TRUE, - duplicates = duplicates, - excludeValues = excludeVals, - invertValues = FALSE, - verbose = FALSE) - , dots) - ) - colnames(nw) <- paste(colnames(nw), "-", l) - return(nw) - }) - dta <- rapply(dta, f = function(x) ifelse(is.nan(x), 0, x), how = "replace" ) - dta <- do.call("cbind", dta) - dta <- dta[rowSums(dta) > 0, ] - dta <- dta[, colSums(dta) > 0] - 
if (clust.variable == "variable2") { + if (collate) { + dta <- lapply(lvls, function(l) { + excludeVals <- c(stats::setNames(list(lvls[!lvls == l]), + nm = qualifier), + excludeValues) + nw <- do.call(dna_network, + c(list(connection = connection, + networkType = "twomode", + variable1 = variable1, + variable2 = variable2, + qualifier = qualifier, + qualifierAggregation = "ignore", + normalization = normalization_twomode, + isolates = TRUE, + duplicates = duplicates, + excludeValues = excludeVals, + invertValues = FALSE, + verbose = FALSE) + , dots) + ) + colnames(nw) <- paste(colnames(nw), "-", l) + return(nw) + }) + dta <- rapply(dta, f = function(x) ifelse(is.nan(x), 0, x), how = "replace" ) + dta <- do.call("cbind", dta) + dta.fa <- dta + dta <- dta[rowSums(dta) > 0, ] + dta <- dta[, colSums(dta) > 0] + } else { + dta <- do.call(dna_network, + c(list(connection = connection, + networkType = "twomode", + variable1 = variable1, + variable2 = variable2, + qualifier = qualifier, + qualifierAggregation = "subtract", + normalization = normalization_twomode, + isolates = TRUE, + duplicates = duplicates, + excludeValues = excludeValues, + invertValues = FALSE, + verbose = FALSE) + , dots)) + dta.fa <- dta + } + if (transpose) { dta <- t(dta) - if ("variable2" %in% names(dots)) { - variable1 <- dots[["variable2"]] - variable2 <- variable - } else { - variable1 <- formals("dna_network")[["variable2"]] - variable2 <- variable - } - } else if (clust.variable == "variable1") { - variable1 <- variable - if ("variable2" %in% names(dots)) { - variable2 <- dots[["variable2"]] - } else { - variable2 <- formals("dna_network")[["variable2"]] - } + dta.fa <- t(dta.fa) + . <- variable1 + variable1 <- variable2 + variable2 <- . 
} - dots[["variable2"]] <- NULL + # create onemode for igraph and MDS louvain cluster nw <- do.call(dna_network, c(list(connection = connection, networkType = "onemode", qualifierAggregation = "subtract", normalization = normalization_onemode, variable1 = variable1, + variable2 = variable2, isolates = FALSE, duplicates = duplicates, qualifier = qualifier, @@ -977,6 +999,7 @@ dna_cluster <- function(connection, if (!is.null(c(cutree.k, cutree.h))) { hc$group <- cutree(hc, k = cutree.k, h = cutree.h) } + # Retrieve colours for variable1. Even if transpose, this is correct col <- dna_getAttributes(connection = connection, statementType = dots$statementType, variable = variable1, @@ -993,47 +1016,38 @@ dna_cluster <- function(connection, } else { attr(hc, "cut") <- NA } - if (!qualifierAggregation %in% c("subtract", "combine")) { - qualifierAggregation <- "combine" - warning("Only the qualifier aggregations 'subtract' and 'combine' can be honoured. The default 'combine' was used instead.") - } - if (clust.variable == "variable1") { - nw <- do.call(dna_network, - c(list(connection = connection, - networkType = "twomode", - variable1 = variable1, - variable2 = variable2, - normalization = "no", - isolates = TRUE, - duplicates = duplicates, - qualifier = qualifier, - verbose = FALSE, - qualifierAggregation = qualifierAggregation) - , dots)) - } else { - nw <- dta - warning("When variable2 is clustered qualifier aggregations is turned off and instead the transposed collated two-mode matrix is used.") - } - if (any(duplicated(nw))) { - . <- data.frame(nw, check.names = FALSE) + # FA + fa <- tryCatch(do.call(factanal, + c(list(x = t(dta.fa), + factors = dimensions) + , dots_fa)), + error = function(e) { + e <- paste("In factor analysis: ", e) + warning(e, call. = FALSE) + e <- NULL + } + ) + # MDS + if (any(duplicated(dta))) { + . <- data.frame(dta, check.names = FALSE) . <- dplyr::group_by_all(.) .$rn <- row.names(.) . 
<- dplyr::summarise(., rowname = paste(rn, collapse = "|")) . <- data.frame(., stringsAsFactors = FALSE) row.names(.) <- .$rowname - nw <- .[, !colnames(.) == "rowname"] + dta <- .[, !colnames(.) == "rowname"] } - if (all(nw %in% c(0, 1))) { - d <- vegan::vegdist(nw, method = "jaccard") + if (all(dta %in% c(0, 1))) { + d <- vegan::vegdist(dta, method = "jaccard") } else { - d <- dist(nw, method = "euclidean") + d <- dist(dta, method = "euclidean") } if (length(d) < 2) { stop("Clustering cannot be performed on less than three actors.") } mds <- MASS::isoMDS(d, trace = FALSE, k = dimensions) - k.best <- which.max(sapply(seq(from = 2, to = nrow(nw) - 1, by = 1), function(i) { - cluster::pam(nw, diss = FALSE, k = i)$silinfo$avg.width + k.best <- which.max(sapply(seq(from = 2, to = nrow(dta) - 1, by = 1), function(i) { + cluster::pam(dta, diss = FALSE, k = i)$silinfo$avg.width })) stress <- mds$stress mat <- data.frame(mds$points) @@ -1047,6 +1061,7 @@ dna_cluster <- function(connection, mds <- splitstackshape::cSplit(mds, "variable", "|", "long") } hc$mds <- data.frame(mds[!duplicated(mds$variable, fromLast = TRUE), ]) + hc$fa <- fa attributes(hc$mds)$stress <- stress hc$call <- match.call() attr(hc, "colours") <- c("attribute1" = attribute1, "attribute2" = attribute2) @@ -2093,20 +2108,19 @@ dna_timeWindow <- function(connection, #' dna_downloadJar() #' dna_init("dna-2.0-beta21.jar") #' conn <- dna_connection(dna_sample()) -#' nw <- dna_network(conn, -#' networkType = "onemode") +#' nw <- dna_network(conn, networkType = "onemode") #' graph <- dna_toIgraph(nw) #' } dna_toIgraph <- function(x, weighted = TRUE) { - if (any(grepl("dna_network_onemode", class(x)))) { + if (any(class(x) %in% "dna_network_onemode")) { graph <- graph_from_adjacency_matrix(x, mode = "undirected", weighted = weighted, diag = FALSE, add.colnames = NULL, add.rownames = NA) - } else if (any(grepl("dna_network_twomode", class(x)))) { + } else if (any(class(x) %in% "dna_network_twomode")) { graph <- 
graph_from_incidence_matrix(x, directed = FALSE, weighted = weighted, @@ -2160,7 +2174,7 @@ dna_toREM <- function(x, variable = "organization", ...) { dots <- list(...) - if (any(grepl("dna_connection", class(x)))) { + if (any(class(x) %in% "dna_connection")) { dots_network <- dots[names(dots) %in% names(formals("dna_network"))] dta <- do.call("dna_network", c(list(x, @@ -2168,7 +2182,7 @@ dna_toREM <- function(x, variable1 = variable, verbose = FALSE ), dots_network)) - } else if (any(grepl("dna_eventlist", class(x)))) { + } else if (any(class(x) %in% "dna_eventlist")) { if (any(names(dots) %in% names(formals("dna_network")))) { message("Since x is already a network object, arguments for dna_network() provided through '...' are ignored") } @@ -2229,13 +2243,13 @@ dna_toREM <- function(x, #' } dna_toNetwork <- function(x, ...) { - if (any(grepl("dna_network_onemode", class(x)))) { - nw <- as.network.matrix(x, + if (any(class(x) %in% "dna_network_onemode")) { + nw <- as.network.matrix(x, matrix.type = "adjacency", directed = FALSE, bipartite = FALSE, ...) - } else if (any(grepl("dna_network_twomode", class(x)))) { + } else if (any(class(x) %in% "dna_network_twomode")) { nw <- as.network.matrix(x, matrix.type = "incidence", directed = FALSE, @@ -3017,7 +3031,7 @@ dna_plotHive <- function(x, ...) 
{ layout <- "hive" # Make igraph - if (any(grepl("dna_network_twomode", class(x)))) { + if (any(class(x) %in% "dna_network_twomode")) { stop("Twomode networks are currently not allowed.") } graph <- dna_toIgraph(x) @@ -3045,7 +3059,7 @@ dna_plotHive <- function(x, names(groups) <- V(graph)$name } else if (any(grepl("list|character", class(groups)))) { groups <- groups[match(V(graph)$name, names(groups))] - } else if (any(grepl("dna_cluster", class(groups)))) { + } else if (any(class(groups) %in% "dna_cluster")) { groups <- groups$group[match(V(graph)$name, groups$labels)] } node_attribute <- "Membership" @@ -3180,47 +3194,49 @@ dna_plotHive <- function(x, #' highlight clusters. #' #' @param clust A \code{dna_cluster} object created by the \link{dna_cluster} -#' function. +#' function. +#' @param what Choose either "MDS" to plot the results of multidimensional +#' scaling or "FA" to plot two factors of the factor analysis. #' @param dimensions Provide two numeric values to determine which dimensions to -#' plot. The default, c(1, 2), will plot dimension 1 and dimension 2. +#' plot. The default, c(1, 2), will plot dimension 1 and dimension 2. #' @param draw_polygons Logical. Should clusters be highlighted with coloured -#' polygons? +#' polygons? #' @param custom_colours Manually provide colours for the points and polygons. #' @param custom_shape Manually provide shapes to use for the scatterplot. #' @param alpha The alpha level of the polygons drawn when \code{draw.clusters = -#' "polygon"}. +#' "polygon"}. #' @param jitter Takes either one value, to control the width of the jittering -#' of points, two values to control width and height of the jittering of -#' points (e.g., c(.l, .2)) or \code{character()} to turn off the jittering of -#' points. +#' of points, two values to control width and height of the jittering of +#' points (e.g., c(.l, .2)) or \code{character()} to turn off the jittering of +#' points. #' @param seed Seed for jittering. 
#' @param label Logical. Should labels be plotted? #' @param label_size,font_colour,label_background Control the label size, font -#' colour of the labels and if a background should be displayed when -#' \code{label = TRUE}. label_size takes numeric values, font_colour takes a -#' character string with a valid colour value and label_background can be -#' either TRUE or FALSE. +#' colour of the labels and if a background should be displayed when +#' \code{label = TRUE}. label_size takes numeric values, font_colour takes a +#' character string with a valid colour value and label_background can be +#' either TRUE or FALSE. #' @param point_size Size of the points in the scatterplot. #' @param expand Expand x- and y-axis (e.g., to make room for labels). The first -#' value is the units by which the x-axis is expanded in both directions, the -#' second controls expansion of the y axis. +#' value is the units by which the x-axis is expanded in both directions, the +#' second controls expansion of the y axis. #' @param stress Should stress from the MDS be displayed on the plot. #' @param axis_labels Provide custom axis labels. #' @param clust_method Can be either \code{pam} for \link[cluster]{pam}, -#' \code{"louvain"} for \link[igraph]{cluster_louvain} or \code{"inherit"} to -#' use the method provided by the call to \link{dna_cluster}. +#' \code{"louvain"} for \link[igraph]{cluster_louvain} or \code{"inherit"} to +#' use the method provided by the call to \link{dna_cluster}. #' @param truncate Sets the number of characters to which labels should be -#' truncated. Value \code{Inf} turns off truncation. +#' truncated. Value \code{Inf} turns off truncation. #' @param title Title of the MDS plot. #' @param ... Not used. If you want to add more plot options use \code{+} and -#' the ggplot2 logic (see example). +#' the ggplot2 logic (see example). 
#' @examples #' \dontrun{ #' dna_downloadJar() #' dna_init("dna-2.0-beta21.jar") #' conn <- dna_connection(dna_sample()) #' clust <- dna_cluster(conn) -#' mds <- dna_plotMDS(clust) +#' mds <- dna_plotCoordinates(clust) #' # Flip plot with ggplot2 command #' library("ggplot2") #' mds + @@ -3230,29 +3246,43 @@ dna_plotHive <- function(x, #' @export #' @import ggplot2 #' @importFrom ggrepel geom_label_repel -dna_plotMDS <- function(clust, - dimensions = c(1, 2), - draw_polygons = TRUE, - alpha = .25, - jitter = NULL, - seed = 12345, - label = FALSE, - label_size = 3.5, - point_size = 1, - label_background = FALSE, - font_colour = "black", - expand = 0, - stress = TRUE, - truncate = 40, - custom_colours = character(), - custom_shape = character(), - axis_labels = character(), - clust_method = "pam", - title = "Nonmetric Multidimensional Scaling", - ...) { - df <- clust[["mds"]] - dim1 <- paste0("Dimension_", dimensions[1]) - dim2 <- paste0("Dimension_", dimensions[2]) +dna_plotCoordinates <- function(clust, + what = "MDS", + dimensions = c(1, 2), + draw_polygons = TRUE, + alpha = .25, + jitter = NULL, + seed = 12345, + label = FALSE, + label_size = 3.5, + point_size = 1, + label_background = FALSE, + font_colour = "black", + expand = 0, + stress = TRUE, + truncate = 40, + custom_colours = character(), + custom_shape = character(), + axis_labels = character(), + clust_method = "pam", + title = "auto", + ...) { + if (what == "MDS") { + df <- clust[["mds"]] + dim1 <- paste0("Dimension_", dimensions[1]) + dim2 <- paste0("Dimension_", dimensions[2]) + } else if (what == "FA") { + df <- clust[["fa"]]$loadings[, dimensions] + dim1 <- paste0("Factor", dimensions[1]) + dim2 <- paste0("Factor", dimensions[2]) + df <- data.frame(df, + variable = row.names(df), + cluster_pam = clust[["mds"]]$cluster_pam, + cluster_louvain = clust[["mds"]]$cluster_louvain) + } else { + stop("This function can either plot MDS or factor analysis data. Please select 'MDS' or 'FA' as 'what'." 
) + } + # jitter if selected if (length(jitter) > 0) { set.seed(seed) df[[dim1]] <- jitter(df[[dim1]], amount = jitter[1]) @@ -3331,10 +3361,17 @@ dna_plotMDS <- function(clust, ylab(label = axis_labels[2]) } if (length(title) > 0) { + if (title == "auto") { + if (what == "MDS") { + title <- "Non-metric Multidimensional Scaling" + } else if (what == "FA") { + title <- "Factor analysis" + } + } g <- g + ggtitle(title) } - if (stress) { + if (stress && what == "MDS") { a <- data.frame(x = max(df[[dim1]]) + expand[1], y = max(df[[dim2]]) + expand[2], label = paste("Stress:", round(attributes(df)$stress, digits = 6))) @@ -3497,7 +3534,7 @@ dna_plotNetwork <- function(x, ...) { # Make igraph object set.seed(seed) - if (any(grepl("dna_network_twomode", class(x)))) { + if (any(class(x) %in% "dna_network_twomode")) { if (layout == "auto") { layout <- "bipartite" message("Using `bipartite` as default layout") @@ -3510,7 +3547,7 @@ dna_plotNetwork <- function(x, names(groups) <- V(graph)$name } else if (any(grepl("list|character", class(groups)))) { V(graph)$group <- groups[match(V(graph)$name, names(groups))] - } else if (any(grepl("dna_cluster", class(groups)))) { + } else if (any(class(groups) %in% "dna_cluster")) { V(graph)$group <- groups$group[match(V(graph)$name, groups$labels)] } # colour and attribute @@ -3551,7 +3588,7 @@ dna_plotNetwork <- function(x, substr(node_attribute, 2, nchar(node_attribute))) } lyt$name_short <- trim(as.character(lyt$name), n = truncate) - if (any(grepl("dna_network_twomode", class(x)))) { + if (any(class(x) %in% "dna_network_twomode")) { lyt$attribute <- as.character(lyt$attribute) if (node_colours == "auto" & node_attribute == "Color") { att <- dna_getAttributes(eval(args[["connection"]]), @@ -3754,7 +3791,7 @@ dna_plotTimeWindow <- function(x, diagnostics = FALSE, ...) 
{ method <- colnames(x)[3] - if (!any(grepl("dna_timeWindow", class(x)))) { + if (!any(class(x) %in% "dna_timeWindow")) { warning("x is not an object of class \"dna_timeWindow\".") } if (identical(facetValues, "all")) { diff --git a/rDNA/man/dna_cluster.Rd b/rDNA/man/dna_cluster.Rd index e5baf366..10aded04 100644 --- a/rDNA/man/dna_cluster.Rd +++ b/rDNA/man/dna_cluster.Rd @@ -4,31 +4,35 @@ \alias{dna_cluster} \title{Cluster network from a DNA connection} \usage{ -dna_cluster(connection, variable = "organization", duplicates = "document", - clust.variable = "variable1", clust.method = "ward.D2", - attribute1 = "color", attribute2 = "value", cutree.k = NULL, - cutree.h = NULL, qualifierAggregation = "combine", dimensions = 2, ...) +dna_cluster(connection, variable1 = "organization", variable2 = "concept", + transpose = FALSE, collate = TRUE, duplicates = "document", + clust.method = "ward.D2", attribute1 = "color", attribute2 = "value", + cutree.k = NULL, cutree.h = NULL, dimensions = 2, ...) } \arguments{ \item{connection}{A \code{dna_connection} object created by the \link{dna_connection} function.} -\item{variable}{The first variable for network construction(see -\link{dna_network}). The second one defaults to \code{"concept"} but can be -provided via \code{...} if necessary (see \code{variable2} in -\link{dna_connection}).} +\item{variable1}{The first variable for network construction (see +\link{dna_network}). Defaults to "organization".} + +\item{variable2}{The second variable for network construction (see +\link{dna_network}). Defaults to \code{"concept"}.} + +\item{transpose}{Logical. If \code{TRUE}, variable2 is clustered instead of +variable1.} + +\item{collate}{Logical. 
If \code{FALSE}, clustering is performed on a +"subtract" network instead of the collated twomode network (see +\link{dna_network} for information on "subtract" networks).} \item{duplicates}{Setting for excluding duplicate statements before network construction (for details see \link{dna_network}. If exclusion of duplicates results in a binary matrix, \link[vegan]{vegdist} will be used instead of \link[stats]{dist} to calculate the dissimilarity matrix.} -\item{clust.variable}{Choose if you want to cluster \code{"variable1"} (e.g., -"organization" by default) or \code{"variable2"} (e.g., "concept" by -default) from the network.} - \item{clust.method}{The agglomeration method to be used. When set to -\code{"ward.D"}, \code{"ward.D2"}, \code{"single"}, \code{"complete"}, + \code{"ward.D"}, \code{"ward.D2"}, \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"median"} or \code{"centroid"} the respective methods from \link[stats]{hclust} will be used. When set to \code{"edge_betweenness"}, \code{"leading_eigen"} or \code{"walktrap"} @@ -47,19 +51,16 @@ and \code{"note"}.} hierarchical clustering is cut into several groups. See $k$ and $h$ in \link[stats]{cutree} for details.} -\item{qualifierAggregation}{This argument refers to the qualifier aggregation -in \link{dna_network}. It is ignored for constructing the network for -clustering but is used for MDS. The two available options are -\code{"combine"} and \code{"subtract"} (see \link{dna_network}).} - -\item{dimensions}{The desired dimension for the solution of the MDS. Only two +\item{dimensions}{The desired dimension for the solution of the MDS and also +the desired number of factors to extract from the factor analysis. Only two can be plotted but you might want to calculate more and then choose which ones to plot.} -\item{...}{Additional arguments passed to \link{dna_network}. 
This is -especially useful to set qualifier (defaults to \code{"agreement"}) and -normalization (defaults to \code{"no"}) if non-default values are needed -for clustering.} +\item{...}{Additional arguments passed to \link{dna_network}, +\link[stats]{factanal} and \link[MASS]{isoMDS}. This is especially useful +to set qualifier (defaults to \code{"agreement"}) and normalization +(defaults to \code{"no"}) if non-default values are needed in the clustered +network.} } \description{ Clustering methods for DNA connections. @@ -68,21 +69,24 @@ Clustering methods for DNA connections. Perform a cluster analysis based on a DNA connection. Clustering is performed on the distance matrix of a collated two-mode network for cluster methods "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median" -and "centroid" or on a one-mode network with the cluster methods -"edge_betweenness", "leading_eigen" and "walktrap" from the \link{igraph} -package. The collated two-mode network is constructed by retrieving -individual networks for each of the qualifiers levels and combining the -results by columns. You can look at this network with -\code{View(clust$network)} ("clust" being the outcome of a call to -\code{dna_cluster()}). +and "centroid" or on a one-mode "subtract" network (with negative values +replaced by 0) for the cluster methods "edge_betweenness", "leading_eigen" +and "walktrap" from the \link{igraph} package. The collated two-mode network +is constructed by retrieving individual networks for each of the qualifiers +levels and combining the results by columns. Alternatively, you can use a +two-mode "subtract" network with option \code{collate = FALSE}. You can look +at this network with \code{View(clust$network)} ("clust" being the outcome of +a call to \code{dna_cluster()}). The distance matrix is calculated either by \link[vegan]{vegdist}, if the collated two-mode network is binary, or by \link[stats]{dist}, in all other cases. 
Besides clustering, this function also performs non-metric multidimensional -scaling (see \link[MASS]{isoMDS}). The results can be extracted from the -object using \code{clust.l$mds} or can be plotted using \link{dna_plotMDS}. +scaling (see \link[MASS]{isoMDS}) and factor analysis (see +\link[stats]{factanal}). The results can be extracted from the returned +object using \code{clust.l$mds} or \code{clust.l$fa} respectively. Both +results can also be plotted using \link{dna_plotCoordinates}. } \examples{ \dontrun{ @@ -94,7 +98,7 @@ clust.l <- dna_cluster(conn) dna_plotDendro(clust.l) dna_plotHeatmap(clust.l) -dna_plotMDS(clust.l, +dna_plotCoordinates(clust.l, jitter = c(0.5, 0.7)) } diff --git a/rDNA/man/dna_plotMDS.Rd b/rDNA/man/dna_plotCoordinates.Rd similarity index 81% rename from rDNA/man/dna_plotMDS.Rd rename to rDNA/man/dna_plotCoordinates.Rd index e75b4262..8cf0f9e3 100644 --- a/rDNA/man/dna_plotMDS.Rd +++ b/rDNA/man/dna_plotCoordinates.Rd @@ -1,21 +1,24 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rDNA.R -\name{dna_plotMDS} -\alias{dna_plotMDS} +\name{dna_plotCoordinates} +\alias{dna_plotCoordinates} \title{Plots an MDS scatterplot from dna.cluster objects} \usage{ -dna_plotMDS(clust, dimensions = c(1, 2), draw_polygons = TRUE, - alpha = 0.25, jitter = NULL, seed = 12345, label = FALSE, - label_size = 3.5, point_size = 1, label_background = FALSE, - font_colour = "black", expand = 0, stress = TRUE, truncate = 40, - custom_colours = character(), custom_shape = character(), - axis_labels = character(), clust_method = "pam", - title = "Nonmetric Multidimensional Scaling", ...) 
+dna_plotCoordinates(clust, what = "MDS", dimensions = c(1, 2), + draw_polygons = TRUE, alpha = 0.25, jitter = NULL, seed = 12345, + label = FALSE, label_size = 3.5, point_size = 1, + label_background = FALSE, font_colour = "black", expand = 0, + stress = TRUE, truncate = 40, custom_colours = character(), + custom_shape = character(), axis_labels = character(), + clust_method = "pam", title = "auto", ...) } \arguments{ \item{clust}{A \code{dna_cluster} object created by the \link{dna_cluster} function.} +\item{what}{Choose either "MDS" to plot the results of multidimensional +scaling or "FA" to plot two factors of the factor analysis.} + \item{dimensions}{Provide two numeric values to determine which dimensions to plot. The default, c(1, 2), will plot dimension 1 and dimension 2.} @@ -82,7 +85,7 @@ dna_downloadJar() dna_init("dna-2.0-beta21.jar") conn <- dna_connection(dna_sample()) clust <- dna_cluster(conn) -mds <- dna_plotMDS(clust) +mds <- dna_plotCoordinates(clust) # Flip plot with ggplot2 command library("ggplot2") mds + diff --git a/rDNA/man/dna_toIgraph.Rd b/rDNA/man/dna_toIgraph.Rd index 41d6a330..16c49a6f 100644 --- a/rDNA/man/dna_toIgraph.Rd +++ b/rDNA/man/dna_toIgraph.Rd @@ -21,8 +21,7 @@ This function can convert objects of class 'dna_network_onemode' or dna_downloadJar() dna_init("dna-2.0-beta21.jar") conn <- dna_connection(dna_sample()) -nw <- dna_network(conn, -networkType = "onemode") +nw <- dna_network(conn, networkType = "onemode") graph <- dna_toIgraph(nw) } }