Updated browse tag export to properly quote tsv/csv.

thehyve · May 25, 2016 · 28fe984 · 28fe984
1 parent c6908f9
commit 28fe984
Show file tree

Hide file tree

Showing 8 changed files with 149 additions and 9 deletions.
diff --git a/build.gradle b/build.gradle
@@ -47,6 +47,7 @@ dependencies {
     compile group: 'com.google.guava', name: 'guava', version: GUAVA_VERSION
     compile group: 'commons-cli', name: 'commons-cli', version: '1.2'
     compile group: 'org.hibernate', name: 'hibernate-validator', version: HIBERNATE_VALIDATOR_VERSION
+    compile group: 'com.opencsv', name: 'opencsv', version: '3.7'
 
     compile group: 'ch.qos.logback', name: 'logback-classic', version: LOGBACK_VERSION
     compile group: 'org.slf4j', name: 'jcl-over-slf4j', version: SLF4J_VERSION

diff --git a/src/main/groovy/org/transmartproject/batch/batchartifacts/CsvLineAggregator.groovy b/src/main/groovy/org/transmartproject/batch/batchartifacts/CsvLineAggregator.groovy
@@ -0,0 +1,34 @@
+package org.transmartproject.batch.batchartifacts
+
+import com.opencsv.CSVWriter
+import groovy.transform.CompileStatic
+import org.springframework.batch.item.file.transform.ExtractorLineAggregator
+
+/**
+ * Writes lines in CSV format using the opencsv library; a field is quoted
+ * only when needed. Separator, quote, escape and line end are configurable.
+ */
+@CompileStatic
+class CsvLineAggregator<T> extends ExtractorLineAggregator<T> {
+
+    char separator = CSVWriter.DEFAULT_SEPARATOR
+    char quotechar = CSVWriter.DEFAULT_QUOTE_CHARACTER
+    char escapechar = CSVWriter.DEFAULT_ESCAPE_CHARACTER
+    String lineEnd = CSVWriter.DEFAULT_LINE_END
+
+    /** Formats the fields as a single CSV record; a null field becomes an empty value. */
+    @Override
+    protected String doAggregate(Object[] fields) {
+        // A StringWriter avoids the byte round trip through the platform default charset.
+        StringWriter buffer = new StringWriter()
+        CSVWriter csvWriter = new CSVWriter(buffer, separator, quotechar, escapechar, lineEnd)
+        try {
+            String[] row = fields.collect { it == null ? '' : it.toString() } as String[]
+            csvWriter.writeNext(row, false) // false: do not force quotes on every field
+        } finally {
+            csvWriter.close()
+        }
+        buffer.toString()
+    }
+
+}
diff --git a/...ain/groovy/org/transmartproject/batch/browsetag/BrowseTagAssociationDatabaseReader.groovy b/...ain/groovy/org/transmartproject/batch/browsetag/BrowseTagAssociationDatabaseReader.groovy
@@ -157,7 +157,7 @@ class BrowseTagAssociationDatabaseReader implements ItemStreamReader<BrowseTagAs
                 -1 as tag_item_id,
                 'study_description' as code,
                 'Study description' as display_name,
-                0 as display_order,
+                1 as display_order,
                 cast (exp.description as varchar(4000)) as value,
                 cast (exp.description as varchar(4000)) as description
             FROM $Tables.FM_FOLDER f

diff --git a/src/main/groovy/org/transmartproject/batch/browsetag/BrowseTagTypeFlatFileWriter.groovy b/src/main/groovy/org/transmartproject/batch/browsetag/BrowseTagTypeFlatFileWriter.groovy
@@ -4,9 +4,9 @@ import groovy.util.logging.Slf4j
 import org.springframework.batch.item.ItemWriter
 import org.springframework.batch.item.file.FlatFileHeaderCallback
 import org.springframework.batch.item.file.FlatFileItemWriter
-import org.springframework.batch.item.file.transform.DelimitedLineAggregator
 import org.springframework.batch.item.file.transform.FieldExtractor
 import org.springframework.core.io.Resource
+import org.transmartproject.batch.batchartifacts.CsvLineAggregator
 
 import javax.annotation.PostConstruct
 
@@ -19,8 +19,9 @@ class BrowseTagTypeFlatFileWriter implements ItemWriter<BrowseTagValue> {
     @Delegate
     FlatFileItemWriter<BrowseTagType> delegate
 
-    DelimitedLineAggregator<Collection<String>> valuesAggregator = new DelimitedLineAggregator<>(
-        fieldExtractor: { Collection<String> s -> s as Object[] } as FieldExtractor
+    CsvLineAggregator<Collection<String>> valuesAggregator = new CsvLineAggregator<>(
+            lineEnd: '',
+            fieldExtractor: { Collection<String> s -> s as Object[] } as FieldExtractor
     )
 
     private final Resource resource
@@ -33,8 +34,9 @@ class BrowseTagTypeFlatFileWriter implements ItemWriter<BrowseTagValue> {
     void init() {
         delegate = new FlatFileItemWriter(
                 resource: resource,
-                lineAggregator: new DelimitedLineAggregator<BrowseTagType>(
-                    delimiter: '\t',
+                lineAggregator: new CsvLineAggregator<BrowseTagType>(
+                    separator: '\t',
+                    lineEnd: '',
                     fieldExtractor: { BrowseTagType type ->
                         [   type.folderType.type,
                             type.displayName,

diff --git a/src/main/groovy/org/transmartproject/batch/browsetag/BrowseTagsExportJobConfiguration.groovy b/src/main/groovy/org/transmartproject/batch/browsetag/BrowseTagsExportJobConfiguration.groovy
@@ -13,14 +13,14 @@ import org.springframework.batch.core.job.flow.JobExecutionDecider
 import org.springframework.batch.core.step.tasklet.TaskletStep
 import org.springframework.batch.item.file.FlatFileHeaderCallback
 import org.springframework.batch.item.file.FlatFileItemWriter
-import org.springframework.batch.item.file.transform.DelimitedLineAggregator
 import org.springframework.batch.item.file.transform.FieldExtractor
 import org.springframework.beans.factory.annotation.Value
 import org.springframework.context.annotation.Bean
 import org.springframework.context.annotation.ComponentScan
 import org.springframework.context.annotation.Configuration
 import org.springframework.core.io.FileSystemResource
 import org.springframework.core.io.Resource
+import org.transmartproject.batch.batchartifacts.CsvLineAggregator
 import org.transmartproject.batch.beans.AbstractJobConfiguration
 
 /**
@@ -129,8 +129,9 @@ class BrowseTagsExportJobConfiguration extends AbstractJobConfiguration {
         Resource resource = browseTagsFileResource(null)
         new FlatFileItemWriter(
                 resource: resource,
-                lineAggregator: new DelimitedLineAggregator<BrowseTagAssociation>(
-                        delimiter: '\t',
+                lineAggregator: new CsvLineAggregator<BrowseTagAssociation>(
+                        separator: '\t',
+                        lineEnd: '',
                         fieldExtractor: { BrowseTagAssociation item ->
                             ['\\',
                              item.value.type.displayName,

diff --git a/src/test-func/groovy/org/transmartproject/batch/tag/TagsLoadJobTests.groovy b/src/test-func/groovy/org/transmartproject/batch/tag/TagsLoadJobTests.groovy
@@ -75,6 +75,12 @@ class TagsLoadJobTests implements JobRunningTestTrait {
                         hasEntry('tag_type', 'SYNONYMS'),
                         hasEntry(is('tags_idx'), isIntegerNumber(5)),
                 ),
+                allOf(
+                        hasEntry('path', '\\Public Studies\\GSE8581\\'),
+                        hasEntry('tag', 'Text with tab characters (\t), and\nnew lines.'),
+                        hasEntry('tag_type', 'FREE TEXT'),
+                        hasEntry(is('tags_idx'), isIntegerNumber(6)),
+                ),
         )
 
         def runJob = RunJob.createInstance(
@@ -112,6 +118,12 @@ class TagsLoadJobTests implements JobRunningTestTrait {
                         hasEntry('tag_type', 'SYNONYMS'),
                         hasEntry(is('tags_idx'), isIntegerNumber(5)),
                 ),
+                allOf(
+                        hasEntry('path', '\\Public Studies\\GSE8581\\'),
+                        hasEntry('tag', 'Text with tab characters (\t), and\nnew lines.'),
+                        hasEntry('tag_type', 'FREE TEXT'),
+                        hasEntry(is('tags_idx'), isIntegerNumber(6)),
+                ),
         )
     }
 }
diff --git a/src/test/groovy/org/transmartproject/batch/batchartifacts/CsvLineAggregatorTests.groovy b/src/test/groovy/org/transmartproject/batch/batchartifacts/CsvLineAggregatorTests.groovy
@@ -0,0 +1,88 @@
+package org.transmartproject.batch.batchartifacts
+
+import org.junit.Test
+import org.springframework.batch.item.ExecutionContext
+import org.springframework.batch.item.ItemStreamReader
+import org.springframework.batch.item.file.FlatFileItemReader
+import org.springframework.batch.item.file.mapping.DefaultLineMapper
+import org.springframework.batch.item.file.mapping.FieldSetMapper
+import org.springframework.batch.item.file.separator.DefaultRecordSeparatorPolicy
+import org.springframework.batch.item.file.transform.DelimitedLineTokenizer
+import org.springframework.batch.item.file.transform.FieldSet
+import org.springframework.batch.item.file.transform.LineAggregator
+import org.springframework.batch.item.file.transform.PassThroughFieldExtractor
+import org.springframework.core.io.ByteArrayResource
+import org.springframework.core.io.Resource
+
+import static org.hamcrest.MatcherAssert.assertThat
+import static org.hamcrest.Matchers.*
+
+/**
+ * Tests the {@link CsvLineAggregator}, to see if it writes lines such that
+ * they can be read correctly by the {@link DelimitedLineTokenizer}.
+ */
+class CsvLineAggregatorTests {
+
+    /** Aggregates each item into a line and writes all lines to the writer in one call. */
+    static <T> void write(List<? extends T> items, LineAggregator<T> lineAggregator, Writer writer) {
+        writer.write(items.collect { lineAggregator.aggregate(it) }.join(''))
+    }
+
+    /** Drains the reader into a list; the reader is closed even when reading fails. */
+    static <T> List<T> read(ItemStreamReader<T> reader) {
+        List<T> result = []
+        reader.open(new ExecutionContext())
+        try {
+            T item // typed as T rather than List<String>, so any item type can be read
+            while ((item = reader.read()) != null) {
+                result << item
+            }
+        } finally {
+            reader.close()
+        }
+        result
+    }
+
+    /** Header row plus rows with a tab, a comma and a newline inside a field. */
+    private static final List<List<String>> TEST_DATA = [
+            [ 'Column 1', 'Column 2', 'Column 3', 'Column 4' ],
+            [ 'A', 'Test', 'Test with tab (\t).', 'Test should pass.' ],
+            [ 'B', 'Test', 'Test with comma (,) in the text.', 'Test should pass.' ],
+            [ 'C', 'Test', 'Test with newline (\n) in it.', 'Test should pass.' ],
+    ]
+
+    @Test
+    void testWriteCsvData() {
+        // Writing test data to byte array
+        ByteArrayOutputStream out = new ByteArrayOutputStream()
+        out.withWriter { writer ->
+            def aggregator = new CsvLineAggregator<List<String>>(
+                    separator: '\t',
+                    fieldExtractor: new PassThroughFieldExtractor<List<String>>(),
+            )
+            write(TEST_DATA, aggregator, writer)
+            writer.flush()
+        }
+
+        // Reading data from byte array
+        def tokenizer = new DelimitedLineTokenizer(delimiter: '\t')
+        def lineMapper = new DefaultLineMapper(
+                lineTokenizer: tokenizer,
+                fieldSetMapper: { FieldSet fs ->
+                    fs.values as List<String>
+                } as FieldSetMapper<List<String>>,
+        )
+
+        Resource resource = new ByteArrayResource(out.toByteArray())
+        def reader = new FlatFileItemReader<List<String>>(
+                lineMapper: lineMapper,
+                recordSeparatorPolicy: new DefaultRecordSeparatorPolicy(),
+                resource: resource,
+        )
+        reader.afterPropertiesSet()
+        List<List<String>> result = read(reader)
+
+        assertThat result, equalTo(TEST_DATA)
+    }
+
+}
diff --git a/studies/GSE8581/tags/tags_1.txt b/studies/GSE8581/tags/tags_1.txt
@@ -1,5 +1,7 @@
 concept_key	tag_title	tag_description	index
 \	TITLE	Human Chronic Obstructive Pulmonary Disorder Biomarker	2
 \	ORGANISM	Homo sapiens	3
+\	FREE TEXT	"Text with tab characters (	), and
+new lines."	6
 \Endpoints\FEV1	NAME	FEV1/FVC ratio	4
 \Endpoints\FEV1	SYNONYMS	Tiffeneau-Pinelli	5