/
Exporter.java
3752 lines (3527 loc) · 173 KB
/
Exporter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package export;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.PrettyPrintWriter;
import com.thoughtworks.xstream.io.xml.StaxDriver;
import dna.Dna;
import logger.LogEvent;
import logger.Logger;
import me.tongfei.progressbar.ProgressBar;
import model.*;
import org.apache.commons.math3.linear.EigenDecomposition;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.jdom.Attribute;
import org.jdom.Comment;
import org.jdom.Element;
import org.jdom.Namespace;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.ojalgo.array.DenseArray;
import org.ojalgo.array.Primitive64Array;
import org.ojalgo.function.aggregator.Aggregator;
import org.ojalgo.matrix.Primitive64Matrix;
import org.ojalgo.matrix.decomposition.Eigenvalue;
import java.io.*;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.Period;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.temporal.WeekFields;
import java.util.*;
import java.util.concurrent.ThreadLocalRandom;
import java.util.stream.Collectors;
import java.util.stream.DoubleStream;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Exporter class. This class contains functions for filtering statement array
* lists, creating network matrices, and writing networks to files.
*/
public class Exporter {
private StatementType statementType;
private String networkType, variable1, variable2, qualifier, qualifierAggregation;
private String normalization, duplicates, timeWindow;
private boolean variable1Document, variable2Document, qualifierDocument, isolates;
private LocalDateTime startDateTime, stopDateTime;
private int windowSize;
private HashMap<String, ArrayList<String>> excludeValues;
private ArrayList<String> excludeAuthors, excludeSources, excludeSections, excludeTypes;
private boolean invertValues, invertAuthors, invertSources, invertSections, invertTypes;
private String fileFormat, outfile;
/**
* Holds all documents.
*/
private ArrayList<TableDocument> documents;
/**
* Holds a mapping of document IDs to indices in the {@link #documents} array list.
*/
private HashMap<Integer, Integer> docMap;
/**
* Holds a mapping of statement type variable names to data types for quick lookup.
*/
private HashMap<String, String> dataTypes;
/**
* Holds all statements.
*/
private ArrayList<ExportStatement> originalStatements;
/**
* Holds the statements that remain after filtering by date, exclude filter, duplicates etc.
*/
private ArrayList<ExportStatement> filteredStatements;
/**
* Holds the resulting matrices. Can have size 1.
*/
private ArrayList<Matrix> matrixResults;
// common backbone algorithm objects
private String[] fullConcepts;
private Matrix fullMatrix;
private ArrayList<String> currentBackboneList, currentRedundantList;
private double[] eigenvaluesFull;
// objects for nested backbone algorithm
private int counter;
private int[] iteration, numStatements;
private String[] entity;
private double[] backboneLoss, redundantLoss;
ArrayList<Matrix> backboneMatrices = new ArrayList<>();
ArrayList<Matrix> redundantMatrices = new ArrayList<>();
private NestedBackboneResult nestedBackboneResult = null;
// objects for simulated annealing backbone algorithm
private ArrayList<Double> temperatureLog, acceptanceProbabilityLog, penalizedBackboneLossLog, acceptanceRatioLastHundredIterationsLog;
private ArrayList<Integer> acceptedLog, proposedBackboneSizeLog, acceptedBackboneSizeLog, finalBackboneSizeLog;
private String selectedAction;
private ArrayList<String> actionList, candidateBackboneList, candidateRedundantList, finalBackboneList, finalRedundantList;
private ArrayList<ExportStatement> currentStatementList, candidateStatementList, finalStatementList; // declare candidate statement list at t
private Matrix currentMatrix, candidateMatrix, finalMatrix; // candidate matrix at the respective t, Y^{B^*_t}
private boolean accept;
private double p, temperature, acceptance, r, oldLoss, newLoss, finalLoss, log;
private double[] eigenvaluesCurrent, eigenvaluesCandidate, eigenvaluesFinal;
private int T, t, backboneSize;
private SimulatedAnnealingBackboneResult simulatedAnnealingBackboneResult = null;
// time smoothing
/**
* Kernel function used for time slice network smoothing. Can be {@code "no"} (for no kernel function; uses legacy
* code instead); {@code "uniform"} (for uniform kernel, which is similar to "no"); {@code "epanechnikov"} (for the
* Epanechnikov kernel function); {@code "triangular"} (for the triangular kernel function); or {@code "gaussian"}
* (for the Gaussian, or standard normal, kernel function).
*/
private String kernel = "no";
/**
* For kernel-smoothed time slices, should the first mid-point be half a bandwidth or time window duration after the
* start date and the last mid-point be half a bandwidth or duration before the last date to allow sufficient data
* around the end points of the timeline?
*/
private boolean indentBandwidth = true;
/**
* Set the kernel function used for time slice network smoothing. The value is stored in the {@code kernel} field
* exactly as provided; this setter performs no validation or case normalization.
*
* @param kernel The name of the kernel function. The documentation of the {@code kernel} field lists
*   {@code "no"}, {@code "uniform"}, {@code "epanechnikov"}, {@code "triangular"}, and {@code "gaussian"} as
*   possible values.
*/
public void setKernelFunction(String kernel) {
this.kernel = kernel;
}
/**
* Set whether the mid-points of kernel-smoothed time slices are indented at both ends of the timeline. The value
* is stored in the {@code indentBandwidth} field, which defaults to {@code true}.
*
* @param indentBandwidth Should the first mid-point be half a bandwidth or time window duration after the start
*   date and the last mid-point be half a bandwidth or duration before the last date?
*/
public void setIndentBandwidth(boolean indentBandwidth) {
this.indentBandwidth = indentBandwidth;
}
/**
* Create a new Exporter class instance, holding an array list of export
* statements (i.e., statements with added document information and a hash
* map for easier access to variable values).
*
* @param networkType The type of network to be exported. Valid values are:
* <ul>
* <li>{@code "twomode"} (to create a two-mode network)</li>
* <li>{@code "onemode"} (to create a one-mode network)</li>
* <li>{@code "eventlist"} (to create an event list)</li>
* </ul>
* @param statementType The statement type.
* @param variable1 The name of the first variable, for example {@code
* "organization"}. In addition to the variables defined in the statement
* type, the document variables {@code author}, {@code source}, {@code
* section}, {@code type}, {@code id}, and {@code title} are valid. If
* document-level variables are used, this must be declared using the
* {@code variable1Document} argument.
* @param variable1Document Is the first variable defined at the document
* level, for instance the author or document ID?
* @param variable2 The name of the second variable, for example {@code
* "concept"}. In addition to the variables defined in the statement type,
* the document variables {@code author}, {@code source}, {@code section},
* {@code type}, {@code id}, and {@code title} are valid. If
* document-level variables are used, this must be declared using the
* {@code variable2Document} argument.
* @param variable2Document Is the second variable defined at the document
* level, for instance the author or document ID?
* @param qualifier The qualifier variable, for example {@code
* "agreement"}.
* @param qualifierDocument Is the qualifier variable defined at the
* document level, for instance the author or document ID?
* @param qualifierAggregation The way in which the qualifier variable is
* used to aggregate ties in the network.<br/>
* Valid values if the {@code networkType} argument equals {@code
* "onemode"} are:
* <ul>
* <li>{@code "ignore"} (for ignoring the qualifier variable)</li>
* <li>{@code "congruence"} (for recording a network tie only if both
* nodes have the same qualifier value in the binary case or for
* recording the similarity between the two nodes on the qualifier
* variable in the integer case)</li>
* <li>{@code "conflict"} (for recording a network tie only if both
* nodes have a different qualifier value in the binary case or for
* recording the distance between the two nodes on the qualifier
* variable in the integer case)</li>
* <li>{@code "subtract"} (for subtracting the conflict tie value from
* the congruence tie value in each dyad)</li>
* <li>{@code "congruence & conflict"} (only applicable to time window
* networks: add both a congruence and a conflict network to the time
* window list of networks at each time step)</li>
* </ul>
* Valid values if the {@code networkType} argument equals {@code
* "twomode"} are:
* <ul>
* <li>{@code "ignore"} (for ignoring the qualifier variable)</li>
* <li>{@code "combine"} (for creating multiplex combinations, e.g.,
* {@code 1} for positive, {@code 2} for negative, and {@code 3} for
* mixed)</li>
* <li>{@code "subtract"} (for subtracting negative from positive
* ties)</li>
* </ul>
* The argument is ignored if {@code networkType} equals {@code
* "eventlist"}.
* @param normalization Normalization of edge weights. Valid settings for
* <em>one-mode</em> networks are:
* <ul>
* <li>{@code "no"} (for switching off normalization)</li>
* <li>{@code "average"} (for average activity normalization)</li>
* <li>{@code "jaccard"} (for Jaccard coefficient normalization)</li>
* <li>{@code "cosine"} (for cosine similarity normalization)</li>
* </ul>
* Valid settings for <em>two-mode</em> networks are:
* <ul>
* <li>{@code "no"} (for switching off normalization)</li>
* <li>{@code "activity"} (for activity normalization)</li>
* <li>{@code "prominence"} (for prominence normalization)</li>
* </ul>
* @param isolates Should all nodes of the respective variable be included
* in the network matrix ({@code true}), or should only those nodes be
* included that are active in the current time period and are not
* excluded ({@code false})?
* @param duplicates Setting for excluding duplicate statements before
* network construction. Valid values are:
* <ul>
* <li>{@code "include"} (for including all statements in network
* construction)</li>
* <li>{@code "document"} (for counting only one identical statement per
* document)</li>
* <li>{@code "week"} (for counting only one identical statement per
* calendar week as defined in the UK locale, i.e., Monday to Sunday)
* </li>
* <li>{@code "month"} (for counting only one identical statement per
* calendar month)</li>
* <li>{@code "year"} (for counting only one identical statement per
* calendar year)</li>
* <li>{@code "acrossrange"} (for counting only one identical statement
* across the whole time range)</li>
* </ul>
* @param startDateTime The start date and time for network construction.
* All statements before this specified date/time will be excluded.
* @param stopDateTime The stop date and time for network construction.
* All statements after this specified date/time will be excluded.
* @param timeWindow If any of the time units is selected, a moving time
* window will be imposed, and only the statements falling within the
* time period defined by the window will be used to create the network.
* The time window will then be moved forward by one time unit at a time,
* and a new network with the new time boundaries will be created. This
* is repeated until the end of the overall time span is reached. All
* time windows will be saved as separate network matrices in a list. The
* duration of each time window is defined by the {@code windowSize}
* argument. For example, this could be used to create a time window of
* six months that moves forward by one month each time, thus creating
* time windows that overlap by five months. If {@code "events"} is used
* instead of a natural time unit, the time window will comprise exactly
* as many statements as defined in the {@code windowSize} argument.
* However, if the start or end statement falls on a date and time where
* multiple events happen, those additional events that occur
* simultaneously are included because there is no other way to decide
* which of the statements should be selected. Therefore the window size
* is sometimes extended when the start or end point of a time window is
* ambiguous in event time. Valid argument values are:
* <ul>
* <li>{@code "no"} (no time window will be used)</li>
* <li>{@code "events"} (time window length = number of statements)</li>
* <li>{@code "seconds"} (number of seconds)</li>
* <li>{@code "minutes"} (number of minutes)</li>
* <li>{@code "hours"} (number of hours)</li>
* <li>{@code "days"} (number of days)</li>
* <li>{@code "weeks"} (number of calendar weeks)</li>
* <li>{@code "months"} (number of calendar months)</li>
* <li>{@code "years"} (number of calendar years)</li>
* </ul>
* @param windowSize The number of time units of which a moving time
* window is comprised. This can be the number of statement events, the
* number of days etc., as defined in the {@code timeWindow} argument.
* @param excludeValues A hash map that contains values which should be
* excluded during network construction. The hash map is indexed by
* variable name (for example, {@code "organization"}) as the key, and
* the corresponding value is an array list of values to exclude, for
* example {@code "org A"} or {@code "org B"}. This is irrespective of
* whether these values appear in {@code variable1}, {@code variable2},
* or the {@code qualifier} variable. Note that only variables at the
* statement level can be used here. There are separate arguments for
* excluding statements nested in documents with certain meta-data.
* @param excludeAuthors An array of authors. If a statement is nested in
* a document where one of these authors is set in the {@code author}
* meta-data field, the statement is excluded from network construction.
* @param excludeSources An array of sources. If a statement is nested in
* a document where one of these sources is set in the {@code source}
* meta-data field, the statement is excluded from network construction.
* @param excludeSections An array of sections. If a statement is nested
* in a document where one of these sections is set in the {@code
* section} meta-data field, the statement is excluded from network
* construction.
* @param excludeTypes An array of types. If a statement is nested in a
* document where one of these types is set in the {@code type}
* meta-data field, the statement is excluded from network construction.
* @param invertValues Indicates whether the entries provided by the
* {@code excludeValues} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertAuthors Indicates whether the values provided by the
* {@code excludeAuthors} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertSources Indicates whether the values provided by the
* {@code excludeSources} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertSections Indicates whether the values provided by the
* {@code excludeSections} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertTypes Indicates whether the values provided by the
* {@code excludeTypes} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param fileFormat The file format specification for saving the
* resulting network(s) to a file instead of returning an object. Valid
* values are:
* <ul>
* <li>{@code "csv"} (for network matrices or event lists)</li>
* <li>{@code "dl"} (for UCINET DL full-matrix files)</li>
* <li>{@code "graphml"} (for visone {@code .graphml} files; this
* specification is also compatible with time windows)</li>
* </ul>
* @param outfile The file name for saving the network.
*/
public Exporter(
String networkType,
StatementType statementType,
String variable1,
boolean variable1Document,
String variable2,
boolean variable2Document,
String qualifier,
boolean qualifierDocument,
String qualifierAggregation,
String normalization,
boolean isolates,
String duplicates,
LocalDateTime startDateTime,
LocalDateTime stopDateTime,
String timeWindow,
int windowSize,
HashMap<String, ArrayList<String>> excludeValues,
ArrayList<String> excludeAuthors,
ArrayList<String> excludeSources,
ArrayList<String> excludeSections,
ArrayList<String> excludeTypes,
boolean invertValues,
boolean invertAuthors,
boolean invertSources,
boolean invertSections,
boolean invertTypes,
String fileFormat,
String outfile) {
// create a list of document variables for easier if-condition checking below
ArrayList<String> documentVariables = new ArrayList<String>();
documentVariables.add("author");
documentVariables.add("source");
documentVariables.add("section");
documentVariables.add("type");
documentVariables.add("id");
documentVariables.add("title");
// check network type
// valid input: 'eventlist', 'twomode', or 'onemode'
this.networkType = networkType;
this.networkType = this.networkType.toLowerCase();
if (!this.networkType.equals("eventlist") && !this.networkType.equals("twomode") && !this.networkType.equals("onemode")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Network type setting invalid.",
"When exporting a network, the network type was set to be \"" + networkType + "\", but the only valid options are \"onemode\", \"twomode\", and \"eventlist\". Using the default value \"twomode\" in this case.");
Dna.logger.log(le);
this.networkType = "twomode";
}
// check statement type
this.statementType = statementType;
ArrayList<String> shortTextVariables = Stream.of(this.statementType.getVariablesList(false, true, false, false)).collect(Collectors.toCollection(ArrayList::new));
if (shortTextVariables.size() < 2) {
LogEvent le = new LogEvent(Logger.ERROR,
"Exporter: Statement type contains fewer than two short text variables.",
"When exporting a network, the statement type \"" + this.statementType.getLabel() + "\" (ID: " + this.statementType.getId() + ") was selected, but this statement type contains fewer than two short text variables. At least two short text variables are required for network construction.");
Dna.logger.log(le);
}
// check variable1, variable1Document, variable2, and variable2Document
this.variable1Document = variable1Document;
if (this.variable1Document && !documentVariables.contains(variable1)) {
this.variable1Document = false;
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Variable 1 is not a document-level variable.",
"When exporting a network, Variable 1 was set to be a document-level variable, but \"" + variable1 + "\" does not exist as a document-level variable. Trying to interpret it as a statement-level variable instead.");
Dna.logger.log(le);
}
this.variable2Document = variable2Document;
if (this.variable2Document && !documentVariables.contains(variable2)) {
this.variable2Document = false;
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Variable 2 is not a document-level variable.",
"When exporting a network, Variable 2 was set to be a document-level variable, but \"" + variable2 + "\" does not exist as a document-level variable. Trying to interpret it as a statement-level variable instead.");
Dna.logger.log(le);
}
this.variable1 = variable1;
this.variable2 = variable2;
if (!variable1Document && !shortTextVariables.contains(this.variable1)) {
String var1 = this.variable1;
int counter = 0;
while (var1.equals(this.variable1) || var1.equals(this.variable2)) {
var1 = shortTextVariables.get(counter);
counter++;
}
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Variable 1 does not exist in statement type.",
"When exporting a network, Variable 1 was set to be \"" + this.variable1 + "\", but this variable is undefined in the statement type \"" + this.statementType + "\" or is not a short text variable. Using variable \"" + var1 + "\" instead.");
Dna.logger.log(le);
this.variable1 = var1;
}
if (!variable2Document && !shortTextVariables.contains(this.variable2)) {
String var2 = this.variable2;
int counter = 0;
while (var2.equals(this.variable1) || var2.equals(this.variable2)) {
var2 = shortTextVariables.get(counter);
counter++;
}
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Variable 2 does not exist in statement type.",
"When exporting a network, Variable 2 was set to be \"" + this.variable2 + "\", but this variable is undefined in the statement type \"" + this.statementType + "\" or is not a short text variable. Using variable \"" + var2 + "\" instead.");
Dna.logger.log(le);
this.variable2 = var2;
}
if (this.variable1.equals(this.variable2)) {
String var2 = this.variable2;
int counter = 0;
while (var2.equals(this.variable1)) {
var2 = shortTextVariables.get(counter);
counter++;
}
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Variables 1 and 2 are identical.",
"When exporting a network, Variable 1 and Variable 2 were identical (\"" + this.variable1 + "\"). Changing Variable 2 to \"" + var2 + "\" instead.");
Dna.logger.log(le);
this.variable2 = var2;
}
// check qualifier, qualifierDocument, and qualifierAggregation
this.qualifierDocument = qualifierDocument;
if (qualifier == null && this.qualifierDocument) {
this.qualifierDocument = false;
} else if (qualifier != null && this.qualifierDocument && !documentVariables.contains(qualifier)) {
this.qualifierDocument = false;
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier variable is not a document-level variable.",
"When exporting a network, the qualifier variable was set to be a document-level variable, but \"" + qualifier + "\" does not exist as a document-level variable. Trying to interpret it as a statement-level variable instead.");
Dna.logger.log(le);
}
this.qualifierAggregation = qualifierAggregation.toLowerCase();
this.qualifier = qualifier;
ArrayList<String> variables = Stream.of(this.statementType.getVariablesList(false, true, true, true)).collect(Collectors.toCollection(ArrayList::new));
if (this.qualifier != null && !this.qualifierDocument && (!variables.contains(this.qualifier) || this.qualifier.equals(this.variable1) || this.qualifier.equals(this.variable2))) {
this.qualifier = null;
if (!this.qualifierAggregation.equals("ignore")) {
this.qualifierAggregation = "ignore";
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier variable undefined or invalid.",
"When exporting a network, the qualifier variable was either not defined as a variable in the statement type \"" + this.statementType.getLabel() + "\" or was set to be identical to Variable 1 or Variable 2. Hence, no qualifier is used.");
Dna.logger.log(le);
}
}
if (!this.qualifierAggregation.equals("ignore") &&
!this.qualifierAggregation.equals("subtract") &&
!this.qualifierAggregation.equals("combine") &&
!this.qualifierAggregation.equals("congruence") &&
!this.qualifierAggregation.equals("conflict") &&
!(this.qualifierAggregation.equals("congruence & conflict") && timeWindow.equals("events"))) {
this.qualifierAggregation = "ignore";
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier aggregation setting invalid.",
"When exporting a network, the qualifier aggregation setting was \"" + qualifierAggregation + "\". The only valid settings are \"ignore\", \"combine\", \"congruence\", and \"conflict\", depending on other settings. Using \"ignore\" now.");
Dna.logger.log(le);
}
if (this.qualifierAggregation.equals("combine") && !this.networkType.equals("twomode")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier aggregation incompatible with network type.",
"When exporting a network, the qualifier aggregation setting was \"combine\", but this setting is only compatible with two-mode networks. Using \"ignore\" now.");
Dna.logger.log(le);
this.qualifierAggregation = "ignore";
}
if ((this.qualifierAggregation.equals("congruence") || this.qualifierAggregation.equals("conflict")) && !this.networkType.equals("onemode")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier aggregation incompatible with network type.",
"When exporting a network, the qualifier aggregation setting was \"" + this.qualifierAggregation + "\", but this setting is only compatible with one-mode networks. Using \"ignore\" now.");
Dna.logger.log(le);
this.qualifierAggregation = "ignore";
}
if (this.qualifier == null && !this.qualifierAggregation.equals("ignore")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Qualifier aggregation incompatible with qualifier variable.",
"When exporting a network, the qualifier aggregation setting was \"" + this.qualifierAggregation + "\", but no qualifier variable was selected. Using \"ignore\" now.");
Dna.logger.log(le);
this.qualifierAggregation = "ignore";
}
// check normalization (valid values: 'no', 'activity', 'prominence', 'average', 'jaccard', or 'cosine')
this.normalization = normalization.toLowerCase();
if (!this.normalization.equals("no") &&
!this.normalization.equals("activity") &&
!this.normalization.equals("prominence") &&
!this.normalization.equals("average") &&
!this.normalization.equals("jaccard") &&
!this.normalization.equals("cosine")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Normalization setting invalid.",
"When exporting a network, normalization was set to \"" + normalization + "\", which is invalid. The only valid values are \"no\", \"activity\", \"prominence\", \"average\", \"jaccard\", and \"cosine\". Using the default value \"no\" in this case.");
Dna.logger.log(le);
this.normalization = "no";
}
if ((this.normalization.equals("activity") || this.normalization.equals("prominence")) && !this.networkType.equals("twomode")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Normalization setting invalid.",
"When exporting a network, normalization was set to \"" + normalization + "\", which is only possible with two-mode networks. Using the default value \"no\" in this case.");
Dna.logger.log(le);
this.normalization = "no";
}
if ((this.normalization.equals("average") || this.normalization.equals("jaccard") || this.normalization.equals("cosine")) && !this.networkType.equals("onemode")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Normalization setting invalid.",
"When exporting a network, normalization was set to \"" + normalization + "\", which is only possible with one-mode networks. Using the default value \"no\" in this case.");
Dna.logger.log(le);
this.normalization = "no";
}
// isolates setting
this.isolates = isolates;
// check duplicates setting (valid settings: 'include', 'document', 'week', 'month', 'year', or 'acrossrange')
this.duplicates = duplicates.toLowerCase();
if (!this.duplicates.equals("include") &&
!this.duplicates.equals("document") &&
!this.duplicates.equals("week") &&
!this.duplicates.equals("month") &&
!this.duplicates.equals("year") &&
!this.duplicates.equals("acrossrange")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Duplicates setting invalid.",
"When exporting a network, the duplicates setting was \"" + duplicates + "\", which is invalid. The only valid values are \"include\", \"document\", \"week\", \"month\", \"year\", and \"acrossrange\". Using the default value \"include\" in this case.");
Dna.logger.log(le);
this.duplicates = "include";
}
// check time window arguments
this.timeWindow = timeWindow;
if (this.timeWindow == null) {
this.timeWindow = "no";
} else if (!this.timeWindow.equals("no") &&
!this.timeWindow.equals("seconds") &&
!this.timeWindow.equals("minutes") &&
!this.timeWindow.equals("hours") &&
!this.timeWindow.equals("days") &&
!this.timeWindow.equals("weeks") &&
!this.timeWindow.equals("months") &&
!this.timeWindow.equals("years") &&
!this.timeWindow.equals("events")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Time window setting invalid.",
"When exporting a network, the time window setting was \"" + this.timeWindow + "\", which is invalid. The only valid values are \"no\", \"seconds\", \"minutes\", \"hours\", \"days\", \"weeks\", \"months\", \"years\", and \"events\". Using the default value \"no\" in this case.");
Dna.logger.log(le);
this.timeWindow = "no";
}
this.windowSize = windowSize;
if (this.windowSize < 1 && !this.timeWindow.equals("no")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: Time window size invalid.",
"When exporting a network, the time window size was " + this.windowSize + ", which is invalid in combination with a time window setting other than \"no\". Using the minimum value of 1 in this case.");
Dna.logger.log(le);
this.windowSize = 1;
}
// check file export format and file name arguments
if (fileFormat != null) {
this.fileFormat = fileFormat.toLowerCase();
if (!this.fileFormat.equals("csv") && !this.fileFormat.equals("dl") && !this.fileFormat.equals("graphml")) {
LogEvent le = new LogEvent(Logger.WARNING,
"Exporter: File format invalid.",
"When exporting a network, the file format setting was " + this.fileFormat + ", but \"csv\", \"dl\", and \"graphml\" are the only valid settings. Using \"graphml\" in this case.");
Dna.logger.log(le);
this.fileFormat = "graphml";
}
} else {
this.fileFormat = null;
}
this.outfile = outfile;
if (this.outfile != null) {
if (this.fileFormat.equals("graphml") && !this.outfile.toLowerCase().endsWith(".graphml")) {
this.outfile = this.outfile + ".graphml";
} else if (this.fileFormat.equals("csv") && !this.outfile.toLowerCase().endsWith(".csv")) {
this.outfile = this.outfile + ".csv";
} else if (this.fileFormat.equals("dl") && !this.outfile.toLowerCase().endsWith(".dl")) {
this.outfile = this.outfile + ".dl";
}
}
// remaining arguments
this.startDateTime = startDateTime;
this.stopDateTime = stopDateTime;
this.excludeValues = excludeValues;
this.invertValues = invertValues;
this.excludeAuthors = excludeAuthors;
this.invertAuthors = invertAuthors;
this.excludeSources = excludeSources;
this.invertSources = invertSources;
this.excludeSections = excludeSections;
this.invertSections = invertSections;
this.excludeTypes = excludeTypes;
this.invertTypes = invertTypes;
}
/**
* Constructor with reduced information for generating barplot data. A variable is specified for which frequency
* counts by qualifier level are computed.
*
* @param statementType The statement type.
* @param variable1 The name of the variable for which frequencies are
*   computed, for example {@code "organization"}. It must be a short text
*   variable defined in the statement type; document-level variables are
*   not supported by this constructor. If the variable is not found, a
*   warning is logged and the first short text variable of the statement
*   type is used instead.
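* @param qualifier The qualifier variable, for example {@code
*   "agreement"}. May be {@code null}, in which case no qualifier is used.
*   If the qualifier is not defined in the statement type or is identical
*   to {@code variable1}, a warning is logged and no qualifier is used.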
* @param duplicates Setting for excluding duplicate statements before
* network construction. Valid values are:
* <ul>
* <li>{@code "include"} (for including all statements in network
* construction)</li>
* <li>{@code "document"} (for counting only one identical statement per
* document)</li>
* <li>{@code "week"} (for counting only one identical statement per
* calendar week as defined in the UK locale, i.e., Monday to Sunday)
* </li>
* <li>{@code "month"} (for counting only one identical statement per
* calendar month)</li>
* <li>{@code "year"} (for counting only one identical statement per
* calendar year)</li>
* <li>{@code "acrossrange"} (for counting only one identical statement
* across the whole time range)</li>
* </ul>
* @param startDateTime The start date and time for network construction.
* All statements before this specified date/time will be excluded.
* @param stopDateTime The stop date and time for network construction.
* All statements after this specified date/time will be excluded.
* @param excludeValues A hash map that contains values which should be
* excluded during network construction. The hash map is indexed by
* variable name (for example, {@code "organization"} as the key, and
* the corresponding value is an array list of values to exclude, for
* example {@code "org A"} or {@code "org B"}. This is irrespective of
* whether these values appear in {@code variable1}, {@code variable2},
* or the {@code qualifier} variable. Note that only variables at the
* statement level can be used here. There are separate arguments for
* excluding statements nested in documents with certain meta-data.
* @param excludeAuthors An array of authors. If a statement is nested in
* a document where one of these authors is set in the {@code author}
* meta-data field, the statement is excluded from network construction.
* @param excludeSources An array of sources. If a statement is nested in
* a document where one of these sources is set in the {@code source}
* meta-data field, the statement is excluded from network construction.
* @param excludeSections An array of sections. If a statement is nested
* in a document where one of these sections is set in the {@code
* section} meta-data field, the statement is excluded from network
* construction.
* @param excludeTypes An array of types. If a statement is nested in a
* document where one of these types is set in the {@code type}
* meta-data field, the statement is excluded from network construction.
* @param invertValues Indicates whether the entries provided by the
* {@code excludeValues} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertAuthors Indicates whether the values provided by the
* {@code excludeAuthors} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertSources Indicates whether the values provided by the
* {@code excludeSources} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertSections Indicates whether the values provided by the
* {@code excludeSections} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
* @param invertTypes Indicates whether the values provided by the
* {@code excludeTypes} argument should be excluded from network
* construction ({@code false}) or if they should be the only values
* that should be included during network construction ({@code true}).
*/
public Exporter(
		StatementType statementType,
		String variable1,
		String qualifier,
		String duplicates,
		LocalDateTime startDateTime,
		LocalDateTime stopDateTime,
		HashMap<String, ArrayList<String>> excludeValues,
		ArrayList<String> excludeAuthors,
		ArrayList<String> excludeSources,
		ArrayList<String> excludeSections,
		ArrayList<String> excludeTypes,
		boolean invertValues,
		boolean invertAuthors,
		boolean invertSources,
		boolean invertSections,
		boolean invertTypes) {
	this.statementType = statementType;
	ArrayList<String> shortTextVariables = Stream.of(this.statementType.getVariablesList(false, true, false, false)).collect(Collectors.toCollection(ArrayList::new));

	// check variable1; barplot data are always based on a statement-level variable, hence variable1Document is false
	this.variable1 = variable1;
	this.variable1Document = false;
	if (!shortTextVariables.contains(this.variable1)) {
		if (shortTextVariables.isEmpty()) {
			// there is nothing to fall back on; fail with an explicit message instead of an index error
			throw new IllegalArgumentException("The statement type \"" + this.statementType.getLabel() + "\" does not contain any short text variable that could be used for generating barplot data.");
		}
		// the requested variable is not in the list (it may even be null), so the first entry is a valid replacement
		String var1 = shortTextVariables.get(0);
		LogEvent le = new LogEvent(Logger.WARNING,
				"Exporter: Variable does not exist in statement type.",
				"When generating barplot data, the variable was set to be \"" + this.variable1 + "\", but this variable is undefined in the statement type \"" + this.statementType + "\" or is not a short text variable. Using variable \"" + var1 + "\" instead.");
		Dna.logger.log(le);
		this.variable1 = var1;
	}

	// check qualifier, qualifierDocument, and qualifierAggregation
	this.qualifierDocument = false;
	this.qualifierAggregation = "ignore";
	if (qualifier != null) {
		this.qualifier = qualifier;
		this.qualifierAggregation = "combine";
	}
	ArrayList<String> variables = Stream.of(this.statementType.getVariablesList(false, true, true, true)).collect(Collectors.toCollection(ArrayList::new));
	if (this.qualifier != null && (!variables.contains(this.qualifier) || this.qualifier.equals(this.variable1))) {
		this.qualifier = null;
		this.qualifierAggregation = "ignore";
		LogEvent le = new LogEvent(Logger.WARNING,
				"Exporter: Qualifier variable undefined or invalid.",
				"When generating barplot data, the qualifier variable was either not defined as a variable in the statement type \"" + this.statementType.getLabel() + "\" or was set to be identical to the barplot variable. Hence, no qualifier is used.");
		Dna.logger.log(le);
	}

	// check duplicates setting (valid settings: 'include', 'document', 'week', 'month', 'year', or 'acrossrange');
	// a null value is treated like any other invalid value, and Locale.ROOT keeps the lower-casing independent of
	// the default locale (e.g., the Turkish dotless i would otherwise break "INCLUDE")
	String normalizedDuplicates = duplicates == null ? "" : duplicates.toLowerCase(Locale.ROOT);
	boolean validDuplicates = normalizedDuplicates.equals("include")
			|| normalizedDuplicates.equals("document")
			|| normalizedDuplicates.equals("week")
			|| normalizedDuplicates.equals("month")
			|| normalizedDuplicates.equals("year")
			|| normalizedDuplicates.equals("acrossrange");
	if (validDuplicates) {
		this.duplicates = normalizedDuplicates;
	} else {
		LogEvent le = new LogEvent(Logger.WARNING,
				"Exporter: Duplicates setting invalid.",
				"When generating barplot data, the duplicates setting was \"" + duplicates + "\", which is invalid. The only valid values are \"include\", \"document\", \"week\", \"month\", \"year\", and \"acrossrange\". Using the default value \"include\" in this case.");
		Dna.logger.log(le);
		this.duplicates = "include";
	}

	// remaining arguments are stored as provided
	this.startDateTime = startDateTime;
	this.stopDateTime = stopDateTime;
	this.excludeValues = excludeValues;
	this.invertValues = invertValues;
	this.excludeAuthors = excludeAuthors;
	this.invertAuthors = invertAuthors;
	this.excludeSources = excludeSources;
	this.invertSources = invertSources;
	this.excludeSections = excludeSections;
	this.invertSections = invertSections;
	this.excludeTypes = excludeTypes;
	this.invertTypes = invertTypes;
}
/**
* Load statements and documents from the database and pre-process them.
*/
public void loadData() {
	// build a lookup table from variable name to data type
	this.dataTypes = new HashMap<String, String>();
	int variableIndex = 0;
	while (variableIndex < this.statementType.getVariables().size()) {
		String variableName = this.statementType.getVariables().get(variableIndex).getKey();
		String variableDataType = this.statementType.getVariables().get(variableIndex).getDataType();
		this.dataTypes.put(variableName, variableDataType);
		variableIndex++;
	}

	// fetch all documents, sort them, and remember the list position of each document ID
	this.documents = Dna.sql.getTableDocuments(new int[0]);
	Collections.sort(this.documents);
	this.docMap = new HashMap<Integer, Integer>();
	for (int position = 0; position < this.documents.size(); position++) {
		this.docMap.put(this.documents.get(position).getId(), position);
	}

	// fetch the statements matching the date range and document-level filters and wrap each of them in an
	// ExportStatement that also carries the meta-data of the document it belongs to
	this.originalStatements = Dna.sql.getStatements(new int[0],
			this.statementType.getId(),
			this.startDateTime,
			this.stopDateTime,
			this.excludeAuthors,
			this.invertAuthors,
			this.excludeSources,
			this.invertSources,
			this.excludeSections,
			this.invertSections,
			this.excludeTypes,
			this.invertTypes)
			.stream()
			.map(statement -> {
				int documentPosition = this.docMap.get(statement.getDocumentId());
				return new ExportStatement(statement,
						this.documents.get(documentPosition).getTitle(),
						this.documents.get(documentPosition).getAuthor(),
						this.documents.get(documentPosition).getSource(),
						this.documents.get(documentPosition).getSection(),
						this.documents.get(documentPosition).getType());
			})
			.collect(Collectors.toCollection(ArrayList::new));

	// warn if the query did not return anything
	if (this.originalStatements.isEmpty()) {
		LogEvent noStatementsEvent = new LogEvent(Logger.WARNING,
				"No statements found.",
				"When processing data for export, no statements were found in the database in the time period under scrutiny and given any document-level exclusion filters.");
		Dna.logger.log(noStatementsEvent);
	}
}
/**
* Extract the labels for all nodes for a variable from the statements,
* conditional on isolates settings.
*
* @param processedStatements These are usually filtered statements, but
* could be more processed than just filtered, for example for
* constructing time window sequences of network matrices.
* @param variable String indicating the variable for which labels should be
* extracted, for example {@code "organization"}.
* @param variableDocument Is the variable a document-level variable?
* @return String array containing all sorted node names.
*/
String[] extractLabels(
		ArrayList<ExportStatement> processedStatements,
		String variable,
		boolean variableDocument) {
	// decide whether to use the original statements or the processed statements, depending on the isolates setting
	ArrayList<ExportStatement> finalStatements = this.isolates ? this.originalStatements : processedStatements;

	// Extract one label per statement, drop missing/empty labels, de-duplicate, and sort in natural string order.
	// distinct() avoids the quadratic cost of checking list membership for every statement.
	return finalStatements.stream()
			.map(es -> {
				if (!variableDocument) {
					return es.get(variable).toString();
				}
				switch (variable) {
				case "author":
					return es.getAuthor();
				case "source":
					return es.getSource();
				case "section":
					return es.getSection();
				case "type":
					return es.getType();
				case "id":
					return es.getDocumentIdAsString();
				case "title":
					return es.getTitle();
				default:
					// an unknown document-level variable would otherwise surface as an unexplained NullPointerException
					throw new IllegalArgumentException("'" + variable + "' is not a valid document-level variable.");
				}
			})
			.filter(label -> label != null && !label.isEmpty()) // remove empty field
			.distinct()
			.sorted()
			.toArray(String[]::new);
}
/**
* Filter the statements based on the {@link #originalStatements} slot of
* the class and create a filtered statement list, which is saved in the
* {@link #filteredStatements} slot of the class.
*/
public void filterStatements() {
try (ProgressBar pb = new ProgressBar("Filtering statements", this.originalStatements.size())) {
pb.stepTo(0);
// create a deep copy of the original statements
this.filteredStatements = new ArrayList<ExportStatement>();
for (int i = 0; i < this.originalStatements.size(); i++) {
this.filteredStatements.add(new ExportStatement(this.originalStatements.get(i)));
}
// sort statements by date and time
Collections.sort(this.filteredStatements);
// Create arrays with variable values
String[] values1 = retrieveValues(this.filteredStatements, this.variable1, this.variable1Document);
String[] values2 = new String[0];
if (this.variable2 != null) {
values2 = retrieveValues(this.filteredStatements, this.variable2, this.variable2Document);
}
String[] qualifierValues = new String[0];
if (this.qualifierDocument || (!this.qualifierAggregation.equals("ignore") && dataTypes.get(this.qualifier).equals("short text"))) {
qualifierValues = retrieveValues(this.filteredStatements, this.qualifier, this.qualifierDocument);
}
// process and exclude statements
ExportStatement s;
ArrayList<ExportStatement> al = new ArrayList<ExportStatement>();
String previousVar1 = null;
String previousVar2 = null;
String previousQualifier = null;
LocalDateTime cal, calPrevious;
int year, month, week, yearPrevious, monthPrevious, weekPrevious;
for (int i = 0; i < this.filteredStatements.size(); i++) {
boolean select = true;
s = this.filteredStatements.get(i);
// check against excluded values
Iterator<String> keyIterator = this.excludeValues.keySet().iterator();
while (keyIterator.hasNext()) {
String key = keyIterator.next();
String string = "";
if (dataTypes.get(key) == null) {
throw new NullPointerException("'" + key + "' is not a statement-level variable and cannot be excluded.");
} else if (dataTypes.get(key).equals("boolean") || dataTypes.get(key).equals("integer")) {
string = String.valueOf(s.get(key));
} else if (dataTypes.get(key).equals("short text")) {
string = ((Entity) s.get(key)).getValue();
} else if (dataTypes.get(key).equals("long text")) {
string = (String) s.get(key);
}
if ((this.excludeValues.get(key).contains(string) && !this.invertValues) ||
(!this.excludeValues.get(key).contains(string) && this.invertValues)) {
select = false;
}
}
// check against empty fields
if (select &&
this.networkType != null &&
!this.networkType.equals("eventlist") &&
(values1[i].equals("") || values2[i].equals("") || (!this.qualifierAggregation.equals("ignore") && (qualifierDocument || dataTypes.get(qualifier).equals("short text")) && qualifierValues[i].equals("")))) {
select = false;
} else if (select && this.networkType == null && values1[i].equals("")) { // barplot data because no network type defined
select = false;
}
// check for duplicates
cal = s.getDateTime();
year = cal.getYear();
month = cal.getMonthValue();
@SuppressWarnings("static-access")
WeekFields weekFields = WeekFields.of(Locale.UK.getDefault()); // use UK definition of calendar weeks
week = cal.get(weekFields.weekOfWeekBasedYear());
if (!this.duplicates.equals("include")) {
for (int j = al.size() - 1; j >= 0; j--) {
if (!this.variable1Document) {
previousVar1 = ((Entity) al.get(j).get(this.variable1)).getValue();
} else if (this.variable1.equals("author")) {
previousVar1 = al.get(j).getAuthor();
} else if (this.variable1.equals("source")) {
previousVar1 = al.get(j).getSource();
} else if (this.variable1.equals("section")) {
previousVar1 = al.get(j).getSection();
} else if (this.variable1.equals("type")) {
previousVar1 = al.get(j).getType();
} else if (this.variable1.equals("id")) {
previousVar1 = al.get(j).getDocumentIdAsString();
} else if (this.variable1.equals("title")) {
previousVar1 = al.get(j).getTitle();
}
if (this.variable2 != null) {
if (!this.variable2Document) {
previousVar2 = ((Entity) al.get(j).get(this.variable2)).getValue();
} else if (this.variable2.equals("author")) {
previousVar2 = al.get(j).getAuthor();
} else if (this.variable2.equals("source")) {
previousVar2 = al.get(j).getSource();