From 38eac001e7b4177c78bdcd002527f98c172c92cd Mon Sep 17 00:00:00 2001 From: leifeld Date: Sun, 4 Feb 2024 01:27:28 +0000 Subject: [PATCH] Kernel smoothing and better parallelization for phase transitions --- bibliography/build.gradle | 52 +-- build/bibliography.md | 2 +- dna/build.gradle | 4 + dna/src/main/java/dna/Dna.java | 2 +- dna/src/main/java/export/Exporter.java | 300 +++++++++++------ rDNA/build.gradle | 25 +- rDNA/rDNA/DESCRIPTION | 2 +- rDNA/rDNA/NAMESPACE | 2 + rDNA/rDNA/R/rDNA.R | 425 ++++++++++++++++++++++++- rDNA/rDNA/man/dna_getHeadlessDna.Rd | 38 +++ rDNA/rDNA/man/dna_init.Rd | 1 + rDNA/rDNA/man/dna_jar.Rd | 1 + rDNA/rDNA/man/dna_network.Rd | 14 + rDNA/rDNA/man/dna_phaseTransitions.Rd | 83 ++++- rDNA/rDNA/man/dna_sample.Rd | 1 + 15 files changed, 791 insertions(+), 161 deletions(-) create mode 100644 rDNA/rDNA/man/dna_getHeadlessDna.Rd diff --git a/bibliography/build.gradle b/bibliography/build.gradle index 494a3bc9..421680e1 100644 --- a/bibliography/build.gradle +++ b/bibliography/build.gradle @@ -1,9 +1,7 @@ task bibliographyMarkdown { + inputs.dir '.' doLast { - exec { - workingDir '.' - commandLine 'mkdir', "-p", "$rootDir/build" - } + mkdir "$rootDir/build" exec { workingDir '.' commandLine 'pandoc', "-t", "gfm", "-s", "--csl", "apa-numeric-superscript-brackets.csl", "--citeproc", "-o", "$rootDir/build/bibliography.md", "bibliography.tex" @@ -12,59 +10,35 @@ task bibliographyMarkdown { } task bibliographyPdflatex { + inputs.dir '.' doLast{ - exec { - workingDir '.' - commandLine 'mkdir', "-p", "temp" - } - exec { - workingDir '.' - commandLine 'mkdir', "-p", "$rootDir/build" + mkdir "./temp" + mkdir "$rootDir/build" + copy { + from 'bibliography.tex', 'bibliography.bib' + into 'temp' } - exec { - workingDir '.' - commandLine 'cp', "bibliography.tex", "temp/" - } - exec { - workingDir '.' - commandLine 'cp', "bibliography.bib", "temp/" - } - } - doLast { exec { workingDir 'temp' commandLine 'pdflatex', "bibliography.tex" - } - exec { - workingDir 'temp' commandLine 'bibtex', "bibliography" - } - exec { - workingDir 'temp' commandLine 'pdflatex', "bibliography.tex" - } - exec { - workingDir 'temp' commandLine 'pdflatex', "bibliography.tex" } - } - doLast { - exec { - workingDir '.' - commandLine 'mv', "temp/bibliography.pdf", "$rootDir/build/" - } - exec { - workingDir '.' - commandLine 'rm', "-r", "temp" + copy { + from 'temp/bibliography.pdf' + into "$rootDir/build/" } + delete 'temp' } } task build { dependsOn bibliographyMarkdown dependsOn bibliographyPdflatex + inputs.dir '.' def outputDir = file("$rootDir/build/") outputs.dir outputDir diff --git a/build/bibliography.md b/build/bibliography.md index 9c34e7be..b500ceb6 100644 --- a/build/bibliography.md +++ b/build/bibliography.md @@ -4,7 +4,7 @@ author: bibliography: - bibliography.bib csl: apa-numeric-superscript-brackets.csl -date: 2024-01-10 +date: 2024-02-04 title: "Discourse Network Analysis: Bibliography" --- diff --git a/dna/build.gradle b/dna/build.gradle index 85d425be..b4d6d65d 100644 --- a/dna/build.gradle +++ b/dna/build.gradle @@ -5,6 +5,8 @@ plugins { // create jar file, not just .class files jar { + inputs.dir '.' 
+ // point the manifest to the right class manifest { attributes 'Main-Class': 'dna.Dna' @@ -77,4 +79,6 @@ dependencies { implementation group: 'me.tongfei', name: 'progressbar', version: '0.9.4' // https://mvnrepository.com/artifact/org.ojalgo/ojalgo implementation group: 'org.ojalgo', name: 'ojalgo', version: '51.4.1' + // https://mvnrepository.com/artifact/org.apache.commons/commons-math3 + implementation group: 'org.apache.commons', name: 'commons-math3', version: '3.6.1' } diff --git a/dna/src/main/java/dna/Dna.java b/dna/src/main/java/dna/Dna.java index 5dc60244..47532b3a 100644 --- a/dna/src/main/java/dna/Dna.java +++ b/dna/src/main/java/dna/Dna.java @@ -17,7 +17,7 @@ public class Dna { public static Dna dna; public static Logger logger; public static Sql sql; - public static final String date = "2024-01-01"; + public static final String date = "2024-02-04"; public static final String version = "3.0.11"; public static final String operatingSystem = System.getProperty("os.name"); public static File workingDirectory = null; diff --git a/dna/src/main/java/export/Exporter.java b/dna/src/main/java/export/Exporter.java index fbfe4cf8..675037d9 100644 --- a/dna/src/main/java/export/Exporter.java +++ b/dna/src/main/java/export/Exporter.java @@ -10,6 +10,9 @@ import logger.Logger; import me.tongfei.progressbar.ProgressBar; import model.*; +import org.apache.commons.math3.linear.EigenDecomposition; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.jdom.Attribute; import org.jdom.Comment; import org.jdom.Element; @@ -883,7 +886,7 @@ String[] extractLabels( * {@link #filteredStatements} slot of the class. */ public void filterStatements() { - try (ProgressBar pb = new ProgressBar("Filtering statements...", this.originalStatements.size())) { + try (ProgressBar pb = new ProgressBar("Filtering statements", this.originalStatements.size())) { pb.stepTo(0); // create a deep copy of the original statements @@ -1682,7 +1685,6 @@ private Matrix computeTwoModeMatrix(ArrayList processedStatemen /** * Compute a series of network matrices using kernel smoothing. - * * This function creates a series of network matrices (one-mode or two-mode) similar to the time window approach, * but using kernel smoothing around a forward-moving mid-point on the time axis (gamma). The networks are defined * by the mid-point {@code gamma}, the window size {@code w}, and the kernel function. These parameters are saved in @@ -1699,6 +1701,15 @@ public void computeKernelSmoothedTimeSlices() { Dna.logger.log(l); } + // check and fix two-mode qualifier aggregation for unimplemented settings + if (Exporter.this.qualifierAggregation.equals("combine")) { + LogEvent l = new LogEvent(Logger.WARNING, + Exporter.this.qualifierAggregation + " qualifier aggregation not implemented.", + Exporter.this.qualifierAggregation + " qualifier aggregation has not been implemented for kernel-smoothed networks. 
Using \"subtract\" qualifier aggregation instead."); + Exporter.this.qualifierAggregation = "subtract"; + Dna.logger.log(l); + } + // initialise variables and constants Collections.sort(this.filteredStatements); // probably not necessary, but can't hurt to have it if (this.windowSize % 2 != 0) { // windowSize is the w constant in the paper; only even numbers are acceptable because adding or subtracting w / 2 to or from gamma would not yield integers @@ -1710,14 +1721,14 @@ public void computeKernelSmoothedTimeSlices() { LocalDateTime gamma = b; // current time while progressing through list of statements // save the labels of the variables and qualifier and put indices in hash maps for fast retrieval - String[] var1Values = retrieveValues(Exporter.this.filteredStatements, Exporter.this.variable1, Exporter.this.variable1Document); - String[] var2Values = retrieveValues(Exporter.this.filteredStatements, Exporter.this.variable2, Exporter.this.variable2Document); - String[] qualValues = retrieveValues(Exporter.this.filteredStatements, Exporter.this.qualifier, Exporter.this.qualifierDocument); + String[] var1Values = extractLabels(Exporter.this.filteredStatements, Exporter.this.variable1, Exporter.this.variable1Document); + String[] var2Values = extractLabels(Exporter.this.filteredStatements, Exporter.this.variable2, Exporter.this.variable2Document); + String[] qualValues = extractLabels(Exporter.this.filteredStatements, Exporter.this.qualifier, Exporter.this.qualifierDocument); if (dataTypes.get(Exporter.this.qualifier).equals("integer")) { int[] qual = Exporter.this.originalStatements.stream().mapToInt(s -> (int) s.get(Exporter.this.qualifier)).distinct().sorted().toArray(); if (qual.length < qualValues.length) { qualValues = IntStream.rangeClosed(qual[0], qual[qual.length - 1]) - .mapToObj(i -> String.valueOf(i)) + .mapToObj(String::valueOf) .toArray(String[]::new); } } @@ -1735,100 +1746,96 @@ public void computeKernelSmoothedTimeSlices() { } // create an array list of empty Matrix results, store all date-time stamps in them, and save indices in a hash map - Exporter.this.matrixResults = new ArrayList(); - HashMap timeMap = new HashMap<>(); + Exporter.this.matrixResults = new ArrayList<>(); if (Exporter.this.kernel.equals("gaussian")) { // for each mid-point gamma, create an empty Matrix and save the start, mid, and end time points in it as defined by the start and end of the whole time range; the actual matrix is injected later if (timeWindow.equals("minutes")) { gamma = gamma.minusMinutes(1); - while (gamma.isBefore(e.plusMinutes(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusMinutes(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("hours")) { gamma = gamma.minusHours(1); - while (gamma.isBefore(e.plusHours(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusHours(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("days")) { gamma = gamma.minusDays(1); - while (gamma.isBefore(e.plusDays(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusDays(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? 
var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("weeks")) { gamma = gamma.minusWeeks(1); - while (gamma.isBefore(e.plusWeeks(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusWeeks(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("months")) { gamma = gamma.minusMonths(1); - while (gamma.isBefore(e.plusMonths(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusMonths(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("years")) { gamma = gamma.minusYears(1); - while (gamma.isBefore(e.plusYears(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusYears(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, b, gamma, e)); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } } else { // for each mid-point gamma, create an empty Matrix and save the start, mid, and end time points in it as defined by width w; the actual matrix is injected later if (timeWindow.equals("minutes")) { gamma = gamma.minusMinutes(1); - while (gamma.isBefore(e.plusMinutes(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusMinutes(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusMinutes(W_HALF).isBefore(b) ? b : gamma.minusMinutes(W_HALF), gamma, gamma.plusMinutes(W_HALF).isAfter(e) ? e : gamma.plusMinutes(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("hours")) { gamma = gamma.minusHours(1); - while (gamma.isBefore(e.plusHours(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusHours(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusHours(W_HALF).isBefore(b) ? b : gamma.minusHours(W_HALF), gamma, gamma.plusHours(W_HALF).isAfter(e) ? e : gamma.plusHours(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("days")) { gamma = gamma.minusDays(1); - while (gamma.isBefore(e.plusDays(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusDays(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusDays(W_HALF).isBefore(b) ? b : gamma.minusDays(W_HALF), gamma, gamma.plusDays(W_HALF).isAfter(e) ? e : gamma.plusDays(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("weeks")) { gamma = gamma.minusWeeks(1); - while (gamma.isBefore(e.plusWeeks(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusWeeks(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusWeeks(W_HALF).isBefore(b) ? b : gamma.minusWeeks(W_HALF), gamma, gamma.plusWeeks(W_HALF).isAfter(e) ? 
e : gamma.plusWeeks(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("months")) { gamma = gamma.minusMonths(1); - while (gamma.isBefore(e.plusMonths(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusMonths(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusMonths(W_HALF).isBefore(b) ? b : gamma.minusMonths(W_HALF), gamma, gamma.plusMonths(W_HALF).isAfter(e) ? e : gamma.plusMonths(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } else if (timeWindow.equals("years")) { gamma = gamma.minusYears(1); - while (gamma.isBefore(e.plusYears(1))) { + while (gamma.isBefore(e)) { gamma = gamma.plusYears(1); Exporter.this.matrixResults.add(new Matrix(var1Values, Exporter.this.networkType.equals("onemode") ? var1Values : var2Values, false, gamma.minusYears(W_HALF).isBefore(b) ? b : gamma.minusYears(W_HALF), gamma, gamma.plusYears(W_HALF).isAfter(e) ? e : gamma.plusYears(W_HALF))); - timeMap.put(gamma, Exporter.this.matrixResults.size() - 1); } } } - // create a 4D array, go through the statements, and populate the array - int[][][][] X = new int[var1Values.length][var2Values.length][qualValues.length][Exporter.this.matrixResults.size()]; // var1 x var2 x qual x time + // create a 3D array, go through the statements, and populate the array + @SuppressWarnings("unchecked") + ArrayList[][][] X = (ArrayList[][][]) new ArrayList[var1Values.length][var2Values.length][qualValues.length]; + for (int i = 0; i < var1Values.length; i++) { + for (int j = 0; j < var2Values.length; j++) { + for (int k = 0; k < qualValues.length; k++) { + X[i][j][k] = new ArrayList(); + } + } + } + Exporter.this.filteredStatements.stream().forEach(s -> { int var1Index = -1; if (Exporter.this.variable1Document) { @@ -1846,7 +1853,7 @@ public void computeKernelSmoothedTimeSlices() { var1Index = var1Map.get(s.getTitle()); } } else { - var1Index = var1Map.get((String) s.get(Exporter.this.variable1)); + var1Index = var1Map.get(((Entity) s.get(Exporter.this.variable1)).getValue()); } int var2Index = -1; if (Exporter.this.variable2Document) { @@ -1864,7 +1871,7 @@ public void computeKernelSmoothedTimeSlices() { var2Index = var2Map.get(s.getTitle()); } } else { - var2Index = var2Map.get((String) s.get(Exporter.this.variable2)); + var2Index = var2Map.get(((Entity) s.get(Exporter.this.variable2)).getValue()); } int qualIndex = -1; if (Exporter.this.qualifierDocument) { @@ -1885,46 +1892,53 @@ public void computeKernelSmoothedTimeSlices() { if (dataTypes.get(Exporter.this.qualifier).equals("integer") || dataTypes.get(Exporter.this.qualifier).equals("boolean")) { qualIndex = qualMap.get(String.valueOf((int) s.get(Exporter.this.qualifier))); } else { - qualIndex = qualMap.get(s.get(Exporter.this.qualifier)); + qualIndex = qualMap.get(((Entity) s.get(Exporter.this.qualifier)).getValue()); } } - int timeIndex = timeMap.get(s.getDateTime()); - X[var1Index][var2Index][qualIndex][timeIndex]++; + X[var1Index][var2Index][qualIndex].add(s); }); // process each matrix result in a parallel stream instead of for-loop and add calculation results - ArrayList processedResults = Exporter.this.matrixResults.parallelStream() - .map(matrixResult -> processTimeSlice(matrixResult, X, Exporter.this.matrixResults)) + ArrayList processedResults = ProgressBar.wrap(Exporter.this.matrixResults.parallelStream(), "Kernel smoothing") + .map(matrixResult -> processTimeSlice(matrixResult, X)) 
.collect(Collectors.toCollection(ArrayList::new)); Exporter.this.matrixResults = processedResults; } /** - * Compute a one-mode or two-mode network matrix with kernel-weighting and inject it into a {@link Matrix} object. - * - * To compute the kernel-weighted network projection, the 4D array X is needed because it stores the statement data, - * the current matrix result is needed because it stores the mid-point gamma, and the array list of all matrix - * results is needed because the mid-points of the matrices contain the time stamps for the time dimension in X, - * which is required to establish the time difference between the current matrix mid-point and the respective - * statement in X. + * Compute a one-mode or two-mode network matrix with kernel-weighting and inject it into a {@link Matrix} object. + * To compute the kernel-weighted network projection, the 3D array X with statement array lists corresponding to + * each i-j-k combination is needed because it stores the statement data including their date-time stamp, and + * the current matrix result is needed because it stores the mid-point gamma. The kernel-weighted temporal distance + * between the statement time and gamma is computed and used in creating the network. * * @param matrixResult The matrix result into which the network matrix will be inserted. - * @param X A 4D array containing the data. - * @param matrixResults All matrix results, containing the temporal information for the fourth dimension of X. - * @return The matrix result after inserting the network matrix. + * @param X A 3D array containing the data. + * @return The matrix result after inserting the network matrix. */ - private Matrix processTimeSlice(Matrix matrixResult, int[][][][] X, ArrayList matrixResults) { + private Matrix processTimeSlice(Matrix matrixResult, ArrayList[][][] X) { if (this.networkType.equals("twomode")) { double[][] m = new double[X.length][X[0].length]; for (int i = 0; i < X.length; i++) { for (int j = 0; j < X[0].length; j++) { for (int k = 0; k < X[0][0].length; k++) { - for (int t = 0; t < X[0][0][0].length; t++) { - m[i][j] = k * X[i][j][k][t] * zeta(matrixResults.get(t).getDateTime(), matrixResult.getDateTime(), Exporter.this.windowSize, Exporter.this.timeWindow, Exporter.this.kernel); + for (int t = 0; t < X[i][j][k].size(); t++) { + if (Exporter.this.qualifierAggregation.equals("ignore")) { + m[i][j] = m[i][j] + zeta(X[i][j][k].get(t).getDateTime(), matrixResult.getDateTime(), Exporter.this.windowSize, Exporter.this.timeWindow, Exporter.this.kernel); + } else if (Exporter.this.qualifierAggregation.equals("subtract")) { + if (Exporter.this.dataTypes.get(Exporter.this.qualifier).equals("boolean")) { + m[i][j] = m[i][j] + (((double) k) - 0.5) * 2 * zeta(X[i][j][k].get(t).getDateTime(), matrixResult.getDateTime(), Exporter.this.windowSize, Exporter.this.timeWindow, Exporter.this.kernel); + } else if (Exporter.this.dataTypes.get(Exporter.this.qualifier).equals("integer")) { + m[i][j] = m[i][j] + k * zeta(X[i][j][k].get(t).getDateTime(), matrixResult.getDateTime(), Exporter.this.windowSize, Exporter.this.timeWindow, Exporter.this.kernel); + } else if (Exporter.this.dataTypes.get(Exporter.this.qualifier).equals("short text")) { + m[i][j] = m[i][j] + zeta(X[i][j][k].get(t).getDateTime(), matrixResult.getDateTime(), Exporter.this.windowSize, Exporter.this.timeWindow, Exporter.this.kernel); + } + } } } } } + matrixResult.setMatrix(m); } else if (this.networkType.equals("onemode")) { double[][] m = new double[X.length][X.length]; double[][] 
norm = new double[X.length][X.length]; @@ -1932,27 +1946,28 @@ private Matrix processTimeSlice(Matrix matrixResult, int[][][][] X, ArrayList { + eigenvalues[i] = computeNormalizedEigenvalues(Exporter.this.matrixResults.get(i).getMatrix(), "ojalgo"); // TODO: try out "apache", debug, and compare speed + }); + } + + ProgressBar.wrap(IntStream.range(0, Exporter.this.matrixResults.size()).parallel(), "Distance matrix").forEach(i -> { + IntStream.range(i + 1, Exporter.this.matrixResults.size()).forEach(j -> { // start from i + 1 to ensure symmetry and avoid redundant computation (= upper triangle) + double distance = 0.0; + if (distanceMethod.equals("spectral")) { + distance = spectralLoss(eigenvalues[i], eigenvalues[j]); + } else if (distanceMethod.equals("absdiff")) { // sum of element-wise absolute differences + for (int a = 0; a < Exporter.this.matrixResults.get(i).getMatrix().length; a++) { + for (int b = 0; b < Exporter.this.matrixResults.get(j).getMatrix()[0].length; b++) { + distance += Math.abs(Exporter.this.matrixResults.get(i).getMatrix()[a][b] - Exporter.this.matrixResults.get(j).getMatrix()[a][b]); + } + } + } + distanceMatrix[i][j] = distance; + distanceMatrix[j][i] = distance; // since the distance matrix is symmetric, set both [i][j] and [j][i] + }); + }); + return distanceMatrix; + } + /** * Create a series of one-mode or two-mode networks using a moving time window. */ @@ -2051,7 +2131,7 @@ public void computeTimeWindowMatrices() { ArrayList beforeStatements = new ArrayList(); // holds all statements between (and excluding) the time stamp of the first statement in the window and the focal statement ArrayList afterStatements = new ArrayList(); // holds all statements between (and excluding) the focal statement and the time stamp of the last statement in the window if (this.timeWindow.equals("events")) { - try (ProgressBar pb = new ProgressBar("Time window matrices...", this.filteredStatements.size())) { + try (ProgressBar pb = new ProgressBar("Time window matrices", this.filteredStatements.size())) { pb.stepTo(0); if (this.windowSize < 2) { LogEvent l = new LogEvent(Logger.WARNING, @@ -2155,7 +2235,7 @@ public void computeTimeWindowMatrices() { } } } else { - try (ProgressBar pb = new ProgressBar("Time window matrices...", 100)) { + try (ProgressBar pb = new ProgressBar("Time window matrices", 100)) { long percent = 0; pb.stepTo(percent); LocalDateTime startCalendar = this.startDateTime; // start of statement list @@ -2281,7 +2361,7 @@ public void exportToFile() { * can be used for estimating relational event models. */ private void eventCSV() { - try (ProgressBar pb = new ProgressBar("Exporting events...", this.filteredStatements.size())) { + try (ProgressBar pb = new ProgressBar("Exporting events", this.filteredStatements.size())) { pb.stepTo(0); String key; DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); @@ -2340,7 +2420,7 @@ private void eventCSV() { * Export {@link Matrix Matrix} to a CSV matrix file. */ private void exportCSV() { - try (ProgressBar pb = new ProgressBar("Exporting networks...", this.matrixResults.size())) { + try (ProgressBar pb = new ProgressBar("Exporting networks", this.matrixResults.size())) { pb.stepTo(0); String filename2; String filename1 = this.outfile.substring(0, this.outfile.length() - 4); @@ -2397,7 +2477,7 @@ private void exportCSV() { * Export network to a DL fullmatrix file for the software UCINET. 
*/ private void exportDL() { - try (ProgressBar pb = new ProgressBar("Exporting networks...", this.matrixResults.size())) { + try (ProgressBar pb = new ProgressBar("Exporting networks", this.matrixResults.size())) { pb.stepTo(0); String filename2; String filename1 = this.outfile.substring(0, this.outfile.length() - 4); @@ -2476,7 +2556,7 @@ private void exportDL() { * Export filter for graphML files. */ private void exportGraphml() { - try (ProgressBar pb = new ProgressBar("Exporting networks...", this.matrixResults.size())) { + try (ProgressBar pb = new ProgressBar("Exporting networks", this.matrixResults.size())) { pb.stepTo(0); // set up file name components for time window (in case this will be required later) @@ -3042,7 +3122,7 @@ public void saveSimulatedAnnealingBackboneResult(boolean penalty) { finalBackboneList.toArray(String[]::new), finalRedundantList.toArray(String[]::new), spectralLoss(eigenvaluesFull, eigenvaluesCurrent), - spectralLoss(eigenvaluesFull, computeNormalizedEigenvalues(redundantMatrix.getMatrix())), + spectralLoss(eigenvaluesFull, computeNormalizedEigenvalues(redundantMatrix.getMatrix(), "ojalgo")), p, T, temperatureLog.stream().mapToDouble(v -> v.doubleValue()).toArray(), @@ -3076,9 +3156,10 @@ public SimulatedAnnealingBackboneResult getSimulatedAnnealingBackboneResult() { * Use tools from the {@code ojalgo} library to compute eigenvalues of a symmetric matrix. * * @param matrix The matrix as a two-dimensional double array. + * @param library The linear algebra Java library to use as a back-end: {@code "ojalgo"} or {@code "apache"}. * @return One-dimensional double array of eigenvalues. */ - private double[] computeNormalizedEigenvalues(double[][] matrix) { + private double[] computeNormalizedEigenvalues(double[][] matrix, String library) { for (int i = 0; i < matrix.length; i++) { for (int j = 0; j < matrix[0].length; j++) { if (matrix[i][j] < 0) { @@ -3086,21 +3167,36 @@ private double[] computeNormalizedEigenvalues(double[][] matrix) { } } } - Primitive64Matrix matrixPrimitive = Primitive64Matrix.FACTORY.rows(matrix); // create matrix - DenseArray rowSums = Primitive64Array.FACTORY.make(matrix.length); // container for row sums - matrixPrimitive.reduceRows(Aggregator.SUM, rowSums); // populate row sums into rowSums - Primitive64Matrix.SparseReceiver sr = Primitive64Matrix.FACTORY.makeSparse(matrix.length, matrix.length); // container for degree matrix - sr.fillDiagonal(rowSums); // put row sums onto diagonal - Primitive64Matrix laplacian = sr.get(); // put row sum container into a new degree matrix (the future Laplacian matrix) - laplacian.subtract(matrixPrimitive); // subtract adjacency matrix from degree matrix to create Laplacian matrix - Eigenvalue eig = Eigenvalue.PRIMITIVE.make(laplacian); // eigenvalues - eig.decompose(laplacian); // decomposition - double[] eigenvalues = eig.getEigenvalues().toRawCopy1D(); // extract eigenvalues and convert to double[] - double eigenvaluesSum = Arrays.stream(eigenvalues).sum(); // compute sum of eigenvalues - if (eigenvaluesSum > 0.0) { - eigenvalues = DoubleStream.of(eigenvalues).map(v -> v / eigenvaluesSum).toArray(); // normalise/scale to one + double[] eigenvalues; + if (library.equals("apache")) { + RealMatrix realMatrix = new Array2DRowRealMatrix(matrix); // create a real matrix from the 2D array + EigenDecomposition decomposition = new EigenDecomposition(realMatrix); // perform eigen decomposition + eigenvalues = decomposition.getRealEigenvalues(); // get the real parts of the eigenvalues + // normalize the 
eigenvalues + double eigenvaluesSum = Arrays.stream(eigenvalues).sum(); + if (eigenvaluesSum > 0.0) { + for (int i = 0; i < eigenvalues.length; i++) { + eigenvalues[i] /= eigenvaluesSum; + } + } + } else if (library.equals("ojalgo")) { + Primitive64Matrix matrixPrimitive = Primitive64Matrix.FACTORY.rows(matrix); // create matrix + DenseArray rowSums = Primitive64Array.FACTORY.make(matrix.length); // container for row sums + matrixPrimitive.reduceRows(Aggregator.SUM, rowSums); // populate row sums into rowSums + Primitive64Matrix.SparseReceiver sr = Primitive64Matrix.FACTORY.makeSparse(matrix.length, matrix.length); // container for degree matrix + sr.fillDiagonal(rowSums); // put row sums onto diagonal + Primitive64Matrix laplacian = sr.get(); // put row sum container into a new degree matrix (the future Laplacian matrix) + laplacian.subtract(matrixPrimitive); // subtract adjacency matrix from degree matrix to create Laplacian matrix + Eigenvalue eig = Eigenvalue.PRIMITIVE.make(laplacian); // eigenvalues + eig.decompose(laplacian); // decomposition + eigenvalues = eig.getEigenvalues().toRawCopy1D(); // extract eigenvalues and convert to double[] + double eigenvaluesSum = Arrays.stream(eigenvalues).sum(); // compute sum of eigenvalues + if (eigenvaluesSum > 0.0) { + eigenvalues = DoubleStream.of(eigenvalues).map(v -> v / eigenvaluesSum).toArray(); // normalize/scale to one + } + } else { + eigenvalues = new double[matrix.length]; // return zeroes if library not recognized; don't log error because it would be very slow } - return eigenvalues; } @@ -3219,7 +3315,7 @@ public void initializeNestedBackbone() { this.isolates = true; // include isolates in the iterations but not in the full matrix; will be adjusted to smaller full matrix dimensions without isolates manually each time in the iterations; necessary because some actors may be deleted in the backbone matrix otherwise after deleting their concepts // compute normalised eigenvalues for the full matrix; no need to recompute every time as they do not change - eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix()); + eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix(), "ojalgo"); iteration = new int[fullConcepts.length]; backboneLoss = new double[fullConcepts.length]; redundantLoss = new double[fullConcepts.length]; @@ -3255,7 +3351,7 @@ public void iterateNestedBackbone() { candidateMatrix = this.computeOneModeMatrix(candidateStatementList, this.qualifierAggregation, this.startDateTime, this.stopDateTime); candidateMatrix = this.reduceCandidateMatrix(candidateMatrix, fullMatrix.getRowNames()); // ensure it has the right dimensions by purging isolates relative to the full matrix candidateMatrices.add(candidateMatrix); - eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix()); // normalised eigenvalues for the candidate matrix + eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix(), "ojalgo"); // normalised eigenvalues for the candidate matrix currentLosses[i] = spectralLoss(eigenvaluesFull, eigenvaluesCandidate); } double smallestLoss = 0.0; @@ -3280,7 +3376,7 @@ public void iterateNestedBackbone() { Matrix redundantMatrix = this.computeOneModeMatrix(candidateStatementList, this.qualifierAggregation, this.startDateTime, this.stopDateTime); redundantMatrix = this.reduceCandidateMatrix(redundantMatrix, fullMatrix.getRowNames()); redundantMatrices.add(redundantMatrix); - eigenvaluesCandidate = computeNormalizedEigenvalues(redundantMatrix.getMatrix()); + 
eigenvaluesCandidate = computeNormalizedEigenvalues(redundantMatrix.getMatrix(), "ojalgo"); redundantLoss[counter] = spectralLoss(eigenvaluesFull, eigenvaluesCandidate); numStatements[counter] = numStatementsCandidates[i]; counter++; @@ -3339,7 +3435,7 @@ public void initializeSimulatedAnnealingBackbone(boolean penalty, double p, int this.isolates = true; // include isolates in the iterations; will be adjusted to full matrix without isolates manually each time // compute normalised eigenvalues for the full matrix; no need to recompute every time as they do not change - eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix()); + eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix(), "ojalgo"); if (penalty) { // simulated annealing with penalty: initially one randomly chosen entity in the backbone set // pick a random concept c_j from C and save its index @@ -3391,7 +3487,7 @@ public void initializeSimulatedAnnealingBackbone(boolean penalty, double p, int finalMatrix = this.reduceCandidateMatrix(finalMatrix, fullMatrix.getRowNames()); // ensure it has the right dimensions by purging isolates relative to the full matrix // eigenvalues for final matrix - eigenvaluesFinal = computeNormalizedEigenvalues(finalMatrix.getMatrix()); // normalised eigenvalues for the candidate matrix + eigenvaluesFinal = computeNormalizedEigenvalues(finalMatrix.getMatrix(), "ojalgo"); // normalised eigenvalues for the candidate matrix // create an initial current backbone set B_0, also with the one c_j concept like in B: B_0 <- {c_j} currentBackboneList = new ArrayList(finalBackboneList); @@ -3504,7 +3600,7 @@ public void iterateSimulatedAnnealingBackbone(boolean penalty) { .collect(Collectors.toCollection(ArrayList::new)); candidateMatrix = this.computeOneModeMatrix(candidateStatementList, this.qualifierAggregation, this.startDateTime, this.stopDateTime); // create candidate matrix after filtering the statements based on the action that was executed candidateMatrix = this.reduceCandidateMatrix(candidateMatrix, fullMatrix.getRowNames()); // ensure it has the right dimensions by purging isolates relative to the full matrix - eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix()); // normalised eigenvalues for the candidate matrix + eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix(), "ojalgo"); // normalised eigenvalues for the candidate matrix if (penalty) { newLoss = penalizedLoss(eigenvaluesFull, eigenvaluesCandidate, p, candidateBackboneList.size(), fullConcepts.length); // spectral distance between full and candidate matrix } else { @@ -3583,7 +3679,7 @@ public double[] evaluateBackboneSolution(String[] backboneEntities, int p) { this.isolates = true; // include isolates in the iterations; will be adjusted to full matrix without isolates manually each time // compute normalised eigenvalues for the full matrix; no need to recompute every time as they do not change - eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix()); + eigenvaluesFull = computeNormalizedEigenvalues(fullMatrix.getMatrix(), "ojalgo"); // create copy of filtered statements and remove redundant entities ArrayList entityList = Stream.of(backboneEntities).collect(Collectors.toCollection(ArrayList::new)); @@ -3604,7 +3700,7 @@ public double[] evaluateBackboneSolution(String[] backboneEntities, int p) { .collect(Collectors.toCollection(ArrayList::new)); candidateMatrix = this.computeOneModeMatrix(candidateStatementList, this.qualifierAggregation, 
this.startDateTime, this.stopDateTime); // create candidate matrix after filtering the statements based on the action that was executed candidateMatrix = this.reduceCandidateMatrix(candidateMatrix, fullMatrix.getRowNames()); // ensure it has the right dimensions by purging isolates relative to the full matrix - eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix()); // normalised eigenvalues for the candidate matrix + eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix(), "ojalgo"); // normalised eigenvalues for the candidate matrix results[0] = penalizedLoss(eigenvaluesFull, eigenvaluesCandidate, p, backboneSet.size(), fullConcepts.length); // spectral distance between full and candidate matrix // spectral distance between full and redundant set @@ -3614,7 +3710,7 @@ public double[] evaluateBackboneSolution(String[] backboneEntities, int p) { .collect(Collectors.toCollection(ArrayList::new)); candidateMatrix = this.computeOneModeMatrix(candidateStatementList, this.qualifierAggregation, this.startDateTime, this.stopDateTime); // create candidate matrix after filtering the statements based on the action that was executed candidateMatrix = this.reduceCandidateMatrix(candidateMatrix, fullMatrix.getRowNames()); // ensure it has the right dimensions by purging isolates relative to the full matrix - eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix()); // normalised eigenvalues for the candidate matrix + eigenvaluesCandidate = computeNormalizedEigenvalues(candidateMatrix.getMatrix(), "ojalgo"); // normalised eigenvalues for the candidate matrix results[1] = penalizedLoss(eigenvaluesFull, eigenvaluesCandidate, p, redundantSet.size(), fullConcepts.length); // spectral distance between full and candidate matrix return results; diff --git a/rDNA/build.gradle b/rDNA/build.gradle index 3c29bc93..194ccbdf 100644 --- a/rDNA/build.gradle +++ b/rDNA/build.gradle @@ -1,4 +1,5 @@ task rDNADocument { + inputs.dir 'rDNA' doLast { exec { workingDir 'rDNA' @@ -9,26 +10,25 @@ task rDNADocument { task rDNABuild { dependsOn rDNADocument + inputs.dir 'rDNA' + def outputDir = file("$rootDir/build/") + doFirst { + delete fileTree(dir: outputDir, includes: ['*.tar.gz']) + } doLast { - exec { - workingDir '.' - commandLine 'mkdir', "-p", "$rootDir/build" - } + mkdir "$rootDir/build" exec { workingDir "$rootDir/build/" commandLine 'R', "CMD", "build", "../rDNA/rDNA" } } - def outputDir = file("$rootDir/build/") outputs.dir outputDir } task rDNACheck { + inputs.dir 'rDNA' doLast { - exec { - workingDir '.' - commandLine 'mkdir', "-p", "$rootDir/build" - } + mkdir "$rootDir/build" exec { workingDir "rDNA" commandLine 'R', "-e", "devtools::check(manual = TRUE, check_dir = '../../build/')" @@ -39,11 +39,9 @@ task rDNACheck { } task rDNATest { + inputs.dir 'rDNA' doLast { - exec { - workingDir '.' 
- commandLine 'mkdir', "-p", "$rootDir/build" - } + mkdir "$rootDir/build" exec { workingDir "rDNA" commandLine 'R', "-e", "devtools::test()" @@ -57,6 +55,7 @@ task rDNATest { task build { dependsOn rDNABuild + inputs.dir 'rDNA' def outputDir = file("$rootDir/build/") outputs.dir outputDir doLast {} diff --git a/rDNA/rDNA/DESCRIPTION b/rDNA/rDNA/DESCRIPTION index b5a8fe9d..504ace0b 100755 --- a/rDNA/rDNA/DESCRIPTION +++ b/rDNA/rDNA/DESCRIPTION @@ -1,6 +1,6 @@ Package: rDNA Version: 3.0.11 -Date: 2023-10-15 +Date: 2024-02-04 Title: Discourse Network Analysis in R Authors@R: c(person(given = "Philip", diff --git a/rDNA/rDNA/NAMESPACE b/rDNA/rDNA/NAMESPACE index 750c465f..a449fcf4 100644 --- a/rDNA/rDNA/NAMESPACE +++ b/rDNA/rDNA/NAMESPACE @@ -20,6 +20,7 @@ export(dna_barplot) export(dna_closeDatabase) export(dna_evaluateBackboneSolution) export(dna_getAttributes) +export(dna_getHeadlessDna) export(dna_getVariables) export(dna_init) export(dna_jar) @@ -28,6 +29,7 @@ export(dna_network) export(dna_openConnectionProfile) export(dna_openDatabase) export(dna_phaseTransitions) +export(dna_phaseTransitions2) export(dna_printDetails) export(dna_queryCoders) export(dna_sample) diff --git a/rDNA/rDNA/R/rDNA.R b/rDNA/rDNA/R/rDNA.R index 167626f7..8703bb2e 100644 --- a/rDNA/rDNA/R/rDNA.R +++ b/rDNA/rDNA/R/rDNA.R @@ -73,6 +73,34 @@ dna_init <- function(jarfile = dna_jar(), memory = 1024, returnString = FALSE) { } } +#' Get headless DNA +#' +#' Get headless DNA. +#' +#' This function returns a reference to the Java class that represents the +#' R interface of DNA, i.e., the headless API. Headless means that no graphical +#' user interface is necessary to use the functions in DNA. This is the raw API +#' interface and should only be used by expert users or to debug other rDNA +#' functions. You can save the headless DNA object to an object in R and access +#' its Java functions directly as slots using the "$" operator, for example +#' \code{dna_getHeadlessDna()$getExport()} will return a reference to the +#' \code{Exporter} class, which does all the network computations, and +#' \code{dna_getHeadlessDna()$getAttributes()} will return attributes etc. You +#' can view the different functions that are available by using the tab key to +#' auto-complete function access after typing the "$" if the headless reference +#' has been saved to an object in the workspace. +#' +#' @return Headless DNA reference. +#' +#' @author Philip Leifeld +#' +#' @family {rDNA package startup} +#' +#' @export +dna_getHeadlessDna <- function() { + dnaEnvironment[["dna"]]$headlessDna +} + #' Identify and/or download and install the correct DNA jar file #' #' Identify and/or download and install the correct DNA jar file. @@ -987,6 +1015,18 @@ dna_getAttributes <- function(statementType = NULL, #' @param windowSize The number of time units of which a moving time window is #' comprised. This can be the number of statement events, the number of days #' etc., as defined in the \code{"timeWindow"} argument. +#' @param kernel Use kernel smoothing for computing time windows? This option +#' only matters if the \code{timeWindow} argument has a value other than +#' \code{"no"} or \code{"event"}. The default value \code{kernel = "no"} +#' switches off kernel smoothing, which means all statements within a time +#' window are weighted equally. Other values down-weight statements the +#' farther they are temporally away from the mid-point of the time window. 
+#' Several kernel smoothing functions are available, similar to kernel density +#' estimation: \code{"uniform"} is similar to \code{"no"} and weights all +#' statements with a value of \code{0.5}. \code{"gaussian"} uses a standard +#' normal distribution as a kernel smoother. \code{"epanechnikov"} uses an +#' Epanechnikov kernel smoother. \code{"triangular"} uses a triangular kernel +#' function. If in doubt, do not use kernel smoothing. #' @param excludeValues A list of named character vectors that contains entries #' which should be excluded during network construction. For example, #' \code{list(concept = c("A", "B"), organization = c("org A", "org B"))} @@ -1083,6 +1123,7 @@ dna_network <- function(networkType = "twomode", stop.time = "23:59:59", timeWindow = "no", windowSize = 100, + kernel = "no", excludeValues = list(), excludeAuthors = character(), excludeSources = character(), @@ -1136,7 +1177,7 @@ dna_network <- function(networkType = "twomode", } # call rNetwork function to compute results - .jcall(dnaEnvironment[["dna"]]$headlessDna, + .jcall(dna_getHeadlessDna(), "V", "rNetwork", networkType, @@ -1157,6 +1198,7 @@ dna_network <- function(networkType = "twomode", stop.time, timeWindow, as.integer(windowSize), + kernel, var, val, excludeAuthors, @@ -1172,7 +1214,7 @@ dna_network <- function(networkType = "twomode", fileFormat ) - exporter <- .jcall(dnaEnvironment[["dna"]]$headlessDna, "Lexport/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored + exporter <- .jcall(dna_getHeadlessDna(), "Lexport/Exporter;", "getExporter") # get a reference to the Exporter object, in which results are stored if (networkType == "eventlist") { # assemble an event list in the form of a data frame of filtered statements f <- J(exporter, "getFilteredStatements", simplify = TRUE) # array list of filtered export statements; use J because array list return type not recognized using .jcall @@ -4300,6 +4342,14 @@ print.dna_multiclust <- function(x, ...) { #' be applied to identify the different stages and phases from the resulting #' distance matrix. #' +#' In contrast to the \code{\link{dna_phaseTransitions}} function, the +#' \code{\link{dna_phaseTransitions2}} function does not offer split nodes. +#' However, it offers two advantages. The first one is faster computation and +#' better parallelization in Java. The second one is kernel smoothing, which +#' means the farther away from a time point a statement is, the less important +#' it becomes for the network that is created around the time point. Several +#' kernel smoothing functions are available; see the \code{kernel} argument. +#' #' @param distanceMethod The distance measure that expresses the dissimilarity #' between any two network matrices. The following choices are available: #' \itemize{ @@ -4388,7 +4438,21 @@ print.dna_multiclust <- function(x, ...) { #' the \pkg{pbmcapply} package is used to parallelize the computation of the #' distance matrix and the clustering. Note that this method is based on #' forking and is only available on Unix operating systems, including MacOS -#' and Linux. +#' and Linux. In the \code{dna_phaseTransitions2} function, only the +#' clustering is done using this parallelization in R while the remaining +#' computations are done in parallel using threads in Java, which is more +#' efficient. +#' @param kernel Use kernel smoothing for computing network time slices? 
The +#' default value \code{kernel = "no"} switches off kernel smoothing, which +#' means all statements within a time window are weighted equally. Other +#' values down-weight statements the farther they are temporally away from the +#' temporal mid-point of the respective time slice. Several kernel smoothing +#' functions are available, similar to kernel density estimation: +#' \code{"uniform"} is similar to \code{"no"} and weights all statements with +#' a value of \code{0.5}. \code{"gaussian"} uses a standard normal +#' distribution as a kernel smoother. \code{"epanechnikov"} uses an +#' Epanechnikov kernel smoother. \code{"triangular"} uses a triangular kernel +#' function. #' @inheritParams dna_network #' #' @examples @@ -4412,6 +4476,25 @@ print.dna_multiclust <- function(x, ...) { #' timeWindow = "events", #' windowSize = 15) #' results +#' +#' # kernel smoothing and spectral distances using dna_phaseTransitions2 +#' results2 <- dna_phaseTransitions2(distanceMethod = "spectral", +#' clusterMethods = c("ward", +#' "pam", +#' "concor", +#' "walktrap"), +#' k.min = 2, +#' k.max = 6, +#' networkType = "onemode", +#' variable1 = "organization", +#' variable2 = "concept", +#' timeWindow = "days", +#' windowSize = 15, +#' kernel = "gaussian") +#' results2 +#' +#' library("ggplot2") +#' autoplot(results2) #' } #' #' @author Philip Leifeld, Kristijan Garic @@ -4519,6 +4602,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", stop.time = stop.time, timeWindow = timeWindow, windowSize = windowSize, + kernel = "no", excludeValues = excludeValues, excludeAuthors = excludeAuthors, excludeSources = excludeSources, @@ -4599,6 +4683,7 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", stop.time = stop.time, timeWindow = timeWindow, windowSize = windowSize, + kernel = "no", excludeValues = excludeValues, excludeAuthors = excludeAuthors, excludeSources = excludeSources, @@ -4823,6 +4908,340 @@ dna_phaseTransitions <- function(distanceMethod = "absdiff", return(results) } +#' @rdname dna_phaseTransitions +#' @author Philip Leifeld +#' @importFrom stats dist +#' @importFrom utils combn +#' @importFrom rJava .jarray .jcall .jnull J +#' @export +dna_phaseTransitions2 <- function(distanceMethod = "absdiff", + normalizeNetwork = FALSE, + clusterMethods = c("single", + "average", + "complete", + "ward", + "kmeans", + "pam", + "spectral", + "fastgreedy", + "walktrap"), + k.min = 2, + k.max = 6, + cores = 1, + networkType = "twomode", + statementType = "DNA Statement", + variable1 = "organization", + variable1Document = FALSE, + variable2 = "concept", + variable2Document = FALSE, + qualifier = "agreement", + qualifierDocument = FALSE, + qualifierAggregation = "subtract", + normalization = "no", + duplicates = "document", + start.date = "01.01.1900", + stop.date = "31.12.2099", + start.time = "00:00:00", + stop.time = "23:59:59", + timeWindow = "days", + windowSize = 150, + kernel = "no", + excludeValues = list(), + excludeAuthors = character(), + excludeSources = character(), + excludeSections = character(), + excludeTypes = character(), + invertValues = FALSE, + invertAuthors = FALSE, + invertSources = FALSE, + invertSections = FALSE, + invertTypes = FALSE) { + + # check arguments and packages + if (distanceMethod == "spectral" && networkType == "twomode") { + distanceMethod <- "absdiff" + warning("Spectral distances only work with one-mode networks. 
Using 'distanceMethod = \"absdiff\"' instead.") + } + if (cores > 1 && !requireNamespace("pbmcapply", quietly = TRUE)) { + pbmclapply <- FALSE + warning("Argument 'cores' requires the 'pbmcapply' package, which is not installed.\nSetting 'cores = 1'. Consider installing the 'pbmcapply' package if you use Linux or MacOS.") + } + igraphMethods <- c("louvain", "fastgreedy", "walktrap", "leading_eigen", "edge_betweenness", "infomap", "label_prop", "spinglass") + if (any(igraphMethods %in% clusterMethods) && !requireNamespace("igraph", quietly = TRUE)) { + clusterMethods <- clusterMethods[-igraphMethods] + warning("'igraph' package not installed. Dropping clustering methods from the 'igraph' package. Consider installing 'igraph'.") + } + if ("pam" %in% clusterMethods && !requireNamespace("cluster", quietly = TRUE)) { + clusterMethods <- clusterMethods[which(clusterMethods != "pam")] + warning("'cluster' package not installed. Dropping clustering methods from the 'cluster' package. Consider installing 'cluster'.") + } + if ("concor" %in% clusterMethods && k.min > 2) { + clusterMethods <- clusterMethods[which(clusterMethods != "concor")] + warning("Dropping 'concor' from clustering methods because the CONCOR implementation in rDNA can only find exactly two clusters, but the 'k.min' argument was larger than 2.") + } + clusterMethods <- rev(clusterMethods) # reverse order to save time during parallel computation by starting the computationally intensive methods first + mcall <- match.call() # save the arguments for storing them in the results later + + # generate the time window networks + if (is.null(timeWindow) || is.na(timeWindow) || !is.character(timeWindow) || length(timeWindow) != 1 || !timeWindow %in% c("events", "seconds", "minutes", "hours", "days", "weeks", "months", "years")) { + timeWindow <- "events" + warning("The 'timeWindow' argument was invalid. 
Proceeding with 'timeWindow = \"events\" instead.") + } + + # wrap the vectors of exclude values for document variables into Java arrays + excludeAuthors <- .jarray(excludeAuthors) + excludeSources <- .jarray(excludeSources) + excludeSections <- .jarray(excludeSections) + excludeTypes <- .jarray(excludeTypes) + + # compile exclude variables and values vectors + dat <- matrix("", nrow = length(unlist(excludeValues)), ncol = 2) + count <- 0 + if (length(excludeValues) > 0) { + for (i in 1:length(excludeValues)) { + if (length(excludeValues[[i]]) > 0) { + for (j in 1:length(excludeValues[[i]])) { + count <- count + 1 + dat[count, 1] <- names(excludeValues)[i] + dat[count, 2] <- excludeValues[[i]][j] + } + } + } + var <- dat[, 1] + val <- dat[, 2] + } else { + var <- character() + val <- character() + } + var <- .jarray(var) # array of variable names of each excluded value + val <- .jarray(val) # array of values to be excluded + + # encode R NULL as Java null value if necessary + if (is.null(qualifier) || is.na(qualifier)) { + qualifier <- .jnull(class = "java/lang/String") + } + fileFormat <- .jnull(class = "java/lang/String") + outfile <- .jnull(class = "java/lang/String") + + # call rNetwork function to compute results + .jcall(dna_getHeadlessDna(), + "V", + "rNetwork", + networkType, + statementType, + variable1, + variable1Document, + variable2, + variable2Document, + qualifier, + qualifierDocument, + qualifierAggregation, + normalization, + TRUE, + duplicates, + start.date, + stop.date, + start.time, + stop.time, + timeWindow, + as.integer(windowSize), + kernel, + var, + val, + excludeAuthors, + excludeSources, + excludeSections, + excludeTypes, + invertValues, + invertAuthors, + invertSources, + invertSections, + invertTypes, + outfile, + fileFormat + ) + exporter <- dna_getHeadlessDna()$getExporter() # save Java object reference to exporter class + + # matrix normalization + if (normalizeNetwork) { + .jcall(exporter, + "V", + "normalizeMatrixResults") + } + + # compute distance matrix + if (distanceMethod == "modularity") { + stop("Differences in modularity have not been implemented yet. Please use absolute differences or spectral Euclidean distance as a distance method.") + } else if (!distanceMethod %in% c("absdiff", "spectral")) { + stop("Distance method not recognized. 
Try \"absdiff\" or \"spectral\".") + } + distance_mat <- .jcall(exporter, + "[[D", + "computeDistanceMatrix", + distanceMethod, + simplify = TRUE) + distance_mat <- distance_mat / max(distance_mat) # rescale between 0 and 1 + + # retrieve mid-point dates (gamma) + m <- .jcall(exporter, "[Lexport/Matrix;", "getMatrixResultsArray") # get list of Matrix objects from Exporter object + dates <- sapply(m, function(x) as.POSIXct(.jcall(x, "J", "getDateTimeLong"), origin = "1970-01-01")) + + # define clustering function + hclustMethods <- c("single", "average", "complete", "ward") + cl <- function(method, distmat) { + tryCatch({ + similarity_mat <- 1 - distmat + g <- igraph::graph.adjacency(similarity_mat, mode = "undirected", weighted = TRUE, diag = FALSE) # graph needs to be based on similarity, not distance + if (method %in% hclustMethods) { + if (method == "single") { + suppressWarnings(cl <- stats::hclust(as.dist(distmat), method = "single")) + } else if (method == "average") { + suppressWarnings(cl <- stats::hclust(as.dist(distmat), method = "average")) + } else if (method == "complete") { + suppressWarnings(cl <- stats::hclust(as.dist(distmat), method = "complete")) + } else if (method == "ward") { + suppressWarnings(cl <- stats::hclust(as.dist(distmat), method = "ward.D2")) + } + opt_k <- lapply(k.min:k.max, function(x) { + mem <- stats::cutree(cl, k = x) + mod <- igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem) + return(list(mem = mem, mod = mod)) + }) + mod <- sapply(opt_k, function(x) x$mod) + kk <- which.max(mod) + mem <- opt_k[[kk]]$mem + } else if (method == "kmeans") { + opt_k <- lapply(k.min:k.max, function(x) { + suppressWarnings(cl <- stats::kmeans(distmat, centers = x)) + mem <- cl$cluster + mod <- igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem) + return(list(cl = cl, mem = mem, mod = mod)) + }) + mod <- sapply(opt_k, function(x) x$mod) + kk <- which.max(mod) + mem <- opt_k[[kk]]$mem + } else if (method == "pam") { + opt_k <- lapply(k.min:k.max, function(x) { + suppressWarnings(cl <- cluster::pam(distmat, k = x)) + mem <- cl$cluster + mod <- igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem) + return(list(cl = cl, mem = mem, mod = mod)) + }) + mod <- sapply(opt_k, function(x) x$mod) + kk <- which.max(mod) + mem <- opt_k[[kk]]$mem + } else if (method == "spectral") { + sigma <- 1.0 + affinity_matrix <- exp(-distmat^2 / (2 * sigma^2)) + L <- diag(rowSums(affinity_matrix)) - affinity_matrix + D.sqrt.inv <- diag(1 / sqrt(rowSums(affinity_matrix))) + L.norm <- D.sqrt.inv %*% L %*% D.sqrt.inv + eigenvalues <- eigen(L.norm) # eigenvalue decomposition + opt_k <- lapply(k.min:k.max, function(x) { + U <- eigenvalues$vectors[, 1:x] + mem <- kmeans(U, centers = x)$cluster # cluster the eigenvectors + mod <- igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem) + return(list(mem = mem, mod = mod)) + }) + mod <- sapply(opt_k, function(x) x$mod) + kk <- which.max(mod) + mem <- opt_k[[kk]]$mem + } else if (method == "concor") { + suppressWarnings(mi <- stats::cor(similarity_mat)) + iter <- 1 + while (any(abs(mi) <= 0.999) & iter <= 50) { + mi[is.na(mi)] <- 0 + mi <- stats::cor(mi) + iter <- iter + 1 + } + mem <- ((mi[, 1] > 0) * 1) + 1 + } else if (method %in% igraphMethods) { + if (method == "fastgreedy") { + suppressWarnings(cl <- igraph::cluster_fast_greedy(g)) + } else if (method == "walktrap") { + suppressWarnings(cl <- igraph::cluster_walktrap(g)) + } else if (method == "leading_eigen") { + 
suppressWarnings(cl <- igraph::cluster_leading_eigen(g)) + } else if (method == "edge_betweenness") { + suppressWarnings(cl <- igraph::cluster_edge_betweenness(g)) + } else if (method == "spinglass") { + suppressWarnings(cl <- igraph::cluster_spinglass(g)) + } + opt_k <- lapply(k.min:k.max, function(x) { + mem <- igraph::cut_at(communities = cl, no = x) + mod <- igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem) + return(list(mem = mem, mod = mod)) + }) + mod <- sapply(opt_k, function(x) x$mod) + kk <- which.max(mod) + mem <- opt_k[[kk]]$mem + } + list(method = method, + modularity = igraph::modularity(x = g, weights = igraph::E(g)$weight, membership = mem), + memberships = mem) + }, + error = function(e) { + warning("Cluster method '", method, "' could not be computed due to an error: ", e) + }, + warning = function(w) { + warning("Cluster method '", method, "' threw a warning: ", w) + }) + } + + # apply all clustering methods to distance matrix + if (cores > 1) { + cat(paste("Clustering distance matrix on", cores, "cores.\n")) + a <- Sys.time() + l <- pbmcapply::pbmclapply(clusterMethods, cl, distmat = distance_mat, mc.cores = cores) + b <- Sys.time() + } else { + cat("Clustering distance matrix... ") + a <- Sys.time() + l <- lapply(clusterMethods, cl, distmat = distance_mat) + b <- Sys.time() + cat(intToUtf8(0x2714), "\n") + } + print(b - a) + for (i in length(l):1) { + if (length(l[[i]]) == 1) { + l <- l[-i] + clusterMethods <- clusterMethods[-i] + } + } + results <- list() + mod <- sapply(l, function(x) x$modularity) + best <- which(mod == max(mod))[1] + results$modularity <- mod[best] + results$clusterMethod <- clusterMethods[best] + + # temporal embedding via MDS + if (!requireNamespace("MASS", quietly = TRUE)) { + mem <- data.frame("date" = as.POSIXct(dates, format = "%d-%m-%Y", tz = "UTC"), + "state" = l[[best]]$memberships) + results$states <- mem + warning("Skipping temporal embedding because the 'MASS' package is not installed. Consider installing it.") + } else { + cat("Temporal embedding...\n") + a <- Sys.time() + distmat <- distance_mat + 1e-12 + mds <- MASS::isoMDS(distmat) # MDS of distance matrix + points <- mds$points + mem <- data.frame("date" = as.POSIXct(dates, format = "%d-%m-%Y", tz = "UTC"), + "state" = l[[best]]$memberships, + "X1" = points[, 1], + "X2" = points[, 2]) + results$states <- mem + b <- Sys.time() + print(b - a) + } + + results$distmat <- distance_mat + class(results) <- "dna_phaseTransitions" + attributes(results)$stress <- ifelse(ncol(results$states) == 2, NA, mds$stress) + attributes(results)$call <- mcall + return(results) +} + #' Print the summary of a \code{dna_phaseTransitions} object #' #' Show details of a \code{dna_phaseTransitions} object. diff --git a/rDNA/rDNA/man/dna_getHeadlessDna.Rd b/rDNA/rDNA/man/dna_getHeadlessDna.Rd new file mode 100644 index 00000000..dce52f07 --- /dev/null +++ b/rDNA/rDNA/man/dna_getHeadlessDna.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rDNA.R +\name{dna_getHeadlessDna} +\alias{dna_getHeadlessDna} +\title{Get headless DNA} +\usage{ +dna_getHeadlessDna() +} +\value{ +Headless DNA reference. +} +\description{ +Get headless DNA. +} +\details{ +This function returns a reference to the Java class that represents the +R interface of DNA, i.e., the headless API. Headless means that no graphical +user interface is necessary to use the functions in DNA. 
diff --git a/rDNA/rDNA/man/dna_getHeadlessDna.Rd b/rDNA/rDNA/man/dna_getHeadlessDna.Rd
new file mode 100644
index 00000000..dce52f07
--- /dev/null
+++ b/rDNA/rDNA/man/dna_getHeadlessDna.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rDNA.R
+\name{dna_getHeadlessDna}
+\alias{dna_getHeadlessDna}
+\title{Get headless DNA}
+\usage{
+dna_getHeadlessDna()
+}
+\value{
+Headless DNA reference.
+}
+\description{
+Get headless DNA.
+}
+\details{
+This function returns a reference to the Java class that represents the
+R interface of DNA, i.e., the headless API. Headless means that no graphical
+user interface is necessary to use the functions in DNA. This is the raw API
+interface and should only be used by expert users or to debug other rDNA
+functions. You can save the headless DNA object to an object in R and access
+its Java functions directly as slots using the "$" operator. For example,
+\code{dna_getHeadlessDna()$getExport()} returns a reference to the
+\code{Exporter} class, which does all the network computations, and
+\code{dna_getHeadlessDna()$getAttributes()} returns a reference to the
+attributes. If the headless reference has been saved to an object in the
+workspace, you can view the available functions by pressing the tab key to
+auto-complete after typing the "$" operator.
+}
+\seealso{
+Other {rDNA package startup}:
+\code{\link{dna_init}()},
+\code{\link{dna_jar}()},
+\code{\link{dna_sample}()}
+}
+\author{
+Philip Leifeld
+}
+\concept{{rDNA package startup}}
diff --git a/rDNA/rDNA/man/dna_init.Rd b/rDNA/rDNA/man/dna_init.Rd
index 04574283..c72d8c6f 100644
--- a/rDNA/rDNA/man/dna_init.Rd
+++ b/rDNA/rDNA/man/dna_init.Rd
@@ -36,6 +36,7 @@ dna_init()
 }
 \seealso{
 Other {rDNA package startup}:
+\code{\link{dna_getHeadlessDna}()},
 \code{\link{dna_jar}()},
 \code{\link{dna_sample}()}
 }
diff --git a/rDNA/rDNA/man/dna_jar.Rd b/rDNA/rDNA/man/dna_jar.Rd
index 127c046f..82e4af72 100644
--- a/rDNA/rDNA/man/dna_jar.Rd
+++ b/rDNA/rDNA/man/dna_jar.Rd
@@ -33,6 +33,7 @@ file. If all of this fails, an error message is thrown.
 }
 \seealso{
 Other {rDNA package startup}:
+\code{\link{dna_getHeadlessDna}()},
 \code{\link{dna_init}()},
 \code{\link{dna_sample}()}
 }
diff --git a/rDNA/rDNA/man/dna_network.Rd b/rDNA/rDNA/man/dna_network.Rd
index 4799220e..f6dbba3c 100644
--- a/rDNA/rDNA/man/dna_network.Rd
+++ b/rDNA/rDNA/man/dna_network.Rd
@@ -23,6 +23,7 @@ dna_network(
   stop.time = "23:59:59",
   timeWindow = "no",
   windowSize = 100,
+  kernel = "no",
   excludeValues = list(),
   excludeAuthors = character(),
   excludeSources = character(),
@@ -181,6 +182,19 @@ event time.}
 comprised. This can be the number of statement events, the number of days
 etc., as defined in the \code{"timeWindow"} argument.}
 
+\item{kernel}{Use kernel smoothing for computing time windows? This option
+only matters if the \code{timeWindow} argument has a value other than
+\code{"no"} or \code{"event"}. The default value \code{kernel = "no"}
+switches off kernel smoothing, which means all statements within a time
+window are weighted equally. Other values down-weight statements the
+farther they are temporally away from the mid-point of the time window.
+Several kernel smoothing functions are available, similar to kernel density
+estimation: \code{"uniform"} is similar to \code{"no"} and weights all
+statements with a value of \code{0.5}. \code{"gaussian"} uses a standard
+normal distribution as a kernel smoother. \code{"epanechnikov"} uses an
+Epanechnikov kernel smoother. \code{"triangular"} uses a triangular kernel
+function. If in doubt, do not use kernel smoothing.}
+
 \item{excludeValues}{A list of named character vectors that contains entries
 which should be excluded during network construction.
 For example, \code{list(concept = c("A", "B"), organization = c("org A", "org B"))}
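The kernel argument documented above weights statements by their temporal distance from the mid-point of a time window. The following sketch illustrates the four weighting schemes in R; kernel_weight is a hypothetical helper for illustration only, and the exact scaling used by the Java implementation in Exporter.java may differ:

    # t is the scaled temporal distance from the window mid-point, in [-1, 1]
    kernel_weight <- function(t, kernel = "gaussian") {
      switch(kernel,
             "uniform"      = ifelse(abs(t) <= 1, 0.5, 0),
             "gaussian"     = stats::dnorm(t),
             "epanechnikov" = ifelse(abs(t) <= 1, 0.75 * (1 - t^2), 0),
             "triangular"   = ifelse(abs(t) <= 1, 1 - abs(t), 0))
    }
    curve(kernel_weight(x, "triangular"), from = -1, to = 1,
          xlab = "scaled distance from mid-point", ylab = "statement weight")
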
diff --git a/rDNA/rDNA/man/dna_phaseTransitions.Rd b/rDNA/rDNA/man/dna_phaseTransitions.Rd
index d1a8343f..2fcd8b96 100644
--- a/rDNA/rDNA/man/dna_phaseTransitions.Rd
+++ b/rDNA/rDNA/man/dna_phaseTransitions.Rd
@@ -2,6 +2,7 @@
 % Please edit documentation in R/rDNA.R
 \name{dna_phaseTransitions}
 \alias{dna_phaseTransitions}
+\alias{dna_phaseTransitions2}
 \alias{print.dna_phaseTransitions}
 \alias{autoplot.dna_phaseTransitions}
 \title{Detect phase transitions and states in a discourse network}
@@ -44,6 +45,44 @@ dna_phaseTransitions(
   invertTypes = FALSE
 )
 
+dna_phaseTransitions2(
+  distanceMethod = "absdiff",
+  normalizeNetwork = FALSE,
+  clusterMethods = c("single", "average", "complete", "ward", "kmeans", "pam",
+    "spectral", "fastgreedy", "walktrap"),
+  k.min = 2,
+  k.max = 6,
+  cores = 1,
+  networkType = "twomode",
+  statementType = "DNA Statement",
+  variable1 = "organization",
+  variable1Document = FALSE,
+  variable2 = "concept",
+  variable2Document = FALSE,
+  qualifier = "agreement",
+  qualifierDocument = FALSE,
+  qualifierAggregation = "subtract",
+  normalization = "no",
+  duplicates = "document",
+  start.date = "01.01.1900",
+  stop.date = "31.12.2099",
+  start.time = "00:00:00",
+  stop.time = "23:59:59",
+  timeWindow = "days",
+  windowSize = 150,
+  kernel = "no",
+  excludeValues = list(),
+  excludeAuthors = character(),
+  excludeSources = character(),
+  excludeSections = character(),
+  excludeTypes = character(),
+  invertValues = FALSE,
+  invertAuthors = FALSE,
+  invertSources = FALSE,
+  invertSections = FALSE,
+  invertTypes = FALSE
+)
+
 \method{print}{dna_phaseTransitions}(x, ...)
 
 \method{autoplot}{dna_phaseTransitions}(object, ..., plots = c("heatmap", "silhouette", "mds", "states"))
@@ -143,7 +182,10 @@ the data in this case. Do not use with actor networks!}
 the \pkg{pbmcapply} package is used to parallelize the computation of the
 distance matrix and the clustering. Note that this method is based on
 forking and is only available on Unix operating systems, including MacOS
-and Linux.}
+and Linux. In the \code{dna_phaseTransitions2} function, only the
+clustering uses this R-level parallelization, while the remaining
+computations are parallelized using threads in Java, which is more
+efficient.}
 
 \item{networkType}{The kind of network to be computed. Can be
 \code{"twomode"}, \code{"onemode"}, or \code{"eventlist"}.}
@@ -339,6 +381,18 @@ construction (\code{invertTypes = FALSE}) or if they should be the only
 values that should be included during network construction
 (\code{invertTypes = TRUE}).}
 
+\item{kernel}{Use kernel smoothing for computing network time slices? The
+default value \code{kernel = "no"} switches off kernel smoothing, which
+means all statements within a time window are weighted equally. Other
+values down-weight statements the farther they are temporally away from the
+temporal mid-point of the respective time slice. Several kernel smoothing
+functions are available, similar to kernel density estimation:
+\code{"uniform"} is similar to \code{"no"} and weights all statements with
+a value of \code{0.5}. \code{"gaussian"} uses a standard normal
+distribution as a kernel smoother. \code{"epanechnikov"} uses an
+Epanechnikov kernel smoother. \code{"triangular"} uses a triangular kernel
+function.}
+
 \item{x}{A \code{dna_phaseTransitions} object.}
 
 \item{...}{Additional arguments. Currently not in use.}
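The details section in the next hunk mentions that dissimilarities between the smoothed networks can be measured either as absolute differences ("absdiff") or as spectral distances. As a rough illustration of the latter, a spectral distance between two small adjacency matrices can be computed from the eigenvalues of their graph Laplacians; this is a sketch only, and the normalization used by the Java method computeDistanceMatrix() may differ:

    spectral_distance <- function(A, B) {
      laplacian <- function(M) diag(rowSums(M)) - M        # degree matrix minus adjacency
      ev_a <- sort(eigen(laplacian(A), only.values = TRUE)$values)
      ev_b <- sort(eigen(laplacian(B), only.values = TRUE)$values)
      sqrt(sum((ev_a - ev_b)^2))                           # Euclidean distance of spectra
    }
    A <- matrix(c(0, 1, 1,
                  1, 0, 0,
                  1, 0, 0), nrow = 3, byrow = TRUE)        # path graph
    B <- matrix(c(0, 1, 1,
                  1, 0, 1,
                  1, 1, 0), nrow = 3, byrow = TRUE)        # triangle graph
    spectral_distance(A, B)
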
@@ -363,6 +417,14 @@ Euclidean spectral distance are available.
 Several clustering techniques can be applied to identify the different stages
 and phases from the resulting distance matrix.
 
+In contrast to the \code{\link{dna_phaseTransitions}} function, the
+\code{\link{dna_phaseTransitions2}} function does not offer split nodes.
+However, it offers two advantages. The first is faster computation and
+better parallelization in Java. The second is kernel smoothing, which means
+that the farther away a statement is from a given time point, the less it
+contributes to the network that is created around that time point. Several
+kernel smoothing functions are available; see the \code{kernel} argument.
+
 Print a summary of a \code{dna_phaseTransitions} object, which can be
 created using the \link{dna_phaseTransitions} function.
 }
@@ -387,6 +449,25 @@ results <- dna_phaseTransitions(distanceMethod = "absdiff",
                                 timeWindow = "events",
                                 windowSize = 15)
 results
 
+
+# kernel smoothing and spectral distances using dna_phaseTransitions2
+results2 <- dna_phaseTransitions2(distanceMethod = "spectral",
+                                  clusterMethods = c("ward",
+                                                     "pam",
+                                                     "concor",
+                                                     "walktrap"),
+                                  k.min = 2,
+                                  k.max = 6,
+                                  networkType = "onemode",
+                                  variable1 = "organization",
+                                  variable2 = "concept",
+                                  timeWindow = "days",
+                                  windowSize = 15,
+                                  kernel = "gaussian")
+results2
+
+library("ggplot2")
+autoplot(results2)
 }
 }
diff --git a/rDNA/rDNA/man/dna_sample.Rd b/rDNA/rDNA/man/dna_sample.Rd
index 1cb11311..20e3a0e3 100644
--- a/rDNA/rDNA/man/dna_sample.Rd
+++ b/rDNA/rDNA/man/dna_sample.Rd
@@ -27,6 +27,7 @@ dna_openDatabase(s)
 }
 \seealso{
 Other {rDNA package startup}:
+\code{\link{dna_getHeadlessDna}()},
 \code{\link{dna_init}()},
 \code{\link{dna_jar}()}
 }
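The dna_getHeadlessDna manual page added above describes direct access to the Java API that the new exporter code builds on. A minimal sketch of how an expert user might inspect it, assuming the package startup functions cross-referenced in these manual pages; the methods exposed via "$" depend on the DNA jar that is loaded:

    library("rDNA")
    dna_init()                   # attach the DNA jar, as documented above
    api <- dna_getHeadlessDna()  # reference to the headless Java API
    exporter <- api$getExport()  # Exporter reference, which does the network computations
    # In an interactive session, typing "api$" and pressing the tab key lists
    # the available Java methods.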