Merge pull request #63 from mvlakh/IssueWithTextQualifiers

Fixed the unit test that replicates the text-qualifier issue (#60) and added a suggested fix for it and similar issues
Appendium · Jun 1, 2021 · 2360919 · 2360919
2 parents 9d35430 + 18431a2
commit 2360919
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 80 deletions.
diff --git a/flatpack/src/main/java/net/sf/flatpack/AbstractDelimiterParser.java b/flatpack/src/main/java/net/sf/flatpack/AbstractDelimiterParser.java
@@ -236,101 +236,147 @@ private boolean oddNumberOfQualifier(final String line, final char q) {
      * could span multiple lines.
      * NULL will be returned when the end of the file is reached
      *
-     * @param br
+     * @param aContentReader
      *          Open reader being used to read through the file
-     * @param qual
+     * @param aQualifier
      *          Qualifier being used for parse
-     * @param delim
+     * @param aDelimiter
      *          Delimiter being used for parse
      * @return String
      *          Record from delimited file
      * @throws IOException if any problem with the stream of data (e.g. file reader)
+     *
+     * Improved version of line fetching that solves some of the issues of flatpack parser.
      */
-    protected String fetchNextRecord(final BufferedReader br, final char qual, final char delim) throws IOException {
+    protected String fetchNextRecord(BufferedReader aContentReader, char aQualifier, char aDelimiter) throws IOException
+    {
+        if (aQualifier == FPConstants.NO_QUALIFIER)
+        {
+            // no qualifier defined, then there can't be line breaks in the line
+            return aContentReader.readLine();
+        }
+
+        StringBuilder lineData = null;
         String line = null;
-        final StringBuilder lineData = new StringBuilder();
-        boolean processingMultiLine = false;
+        boolean multiline = false;
+
+        // consuming lines until we find end of the data row
+        while ((line = aContentReader.readLine()) != null)
+        {
+            if(lineData == null)
+            {
+                lineData = new StringBuilder(line);
+            }
+            else
+            {
+                lineData.append(LINE_BREAK).append(line);
+            }
+
+            multiline = isMultiline(line.toCharArray(), multiline, aQualifier, aDelimiter);
+            if(! multiline)
+            {
+                // data row ended
+                break;
+            }
+        }
 
-        while ((line = br.readLine()) != null) {
+        if(lineData != null)
+        {
             lineCount++;
-            final String trimmed = line.trim();
-            final int trimmedLen = trimmed.length();
-            if (!processingMultiLine && trimmed.length() == 0) {
-                // empty line skip past it, as long as it
-                // is not part of the multiline
-                continue;
+
+            String result = lineData.toString();
+            // no line break character at the end of data row
+            return result.endsWith(LINE_BREAK) ? result.substring(0, result.length() - LINE_BREAK.length()) : result;
+        }
+
+        return null;
+    }
+
+    /**
+     * Checks if we need to consume one more line because the data row was split across multiple lines.
+     * @param aСhrArray
+     * @param aMultiline
+     * @param aQualifier
+     * @param aDelimiter
+     * @return
+     */
+    protected boolean isMultiline(char[] aСhrArray, boolean aMultiline, char aQualifier, char aDelimiter)
+    {
+        // do not trim the line, according to rfc4180:
+        // Spaces are considered part of a field and should not be ignored
+        int position = 0;
+
+        do
+        {
+            // field processing here
+            if (! aMultiline && aСhrArray[position] == aDelimiter)
+            {
+                // empty field
+                position++;
             }
+            else if (!aMultiline && aСhrArray[position] != aQualifier)
+            {
+                // if the first char of the field is NOT a qualifier, then the field should not
+                // contain CRLF, double quotes, and commas
+                // therefore find the end of the field by looking for the first delimiter
+
+                while (++position < aСhrArray.length)
+                {
+                    if (aСhrArray[position] == aDelimiter)
+                    {
+                        position++;
+                        break;
+                    }
+                }
 
-            // ********************************************************
-            // new functionality as of 2.1.0 check to see if we have
-            // any line breaks in the middle of the record, this will only
-            // be checked if we have specified a delimiter
-            // ********************************************************
-            final char[] chrArry = trimmed.toCharArray();
-            if (!processingMultiLine && delim > 0 && qual != FPConstants.NO_QUALIFIER) {
-                processingMultiLine = ParserUtils.isMultiLine(chrArry, delim, qual);
+                if (position >= aСhrArray.length)
+                {
+                    // end of the line without any delimiters so it's safe to say it's the end of the line
+                    // and not multiline
+                    return false;
+                }
             }
+            else
+            {
+                // the first char is a qualifier, the field may contain CRLF, double quotes, and commas
+                // double quotes must be escaped with a double quote (i.e. "some ""data"" here").
+                // newline won't be present in the line because it's removed by the reader during
+                // readLine() call. so look for dangling "
+
+                aMultiline = true;
+                if(aСhrArray[position] == aQualifier)
+                {
+                    // if we have just now found a qualifier we need to move the cursor to the next char
+                    position++;
+                }
 
-            // check to see if we have reached the end of the linebreak in
-            // the record
-
-            final String trimmedLineData = lineData.toString().trim();
-            if (processingMultiLine && trimmedLineData.length() > 0 && trimmedLen > 0) {
-                // need to do one last check here. it is possible that the "
-                // could be part of the data
-                // excel will escape these with another quote; here is some
-                // data "" This would indicate
-                // there is more to the multiline
-                if (trimmed.charAt(trimmed.length() - 1) == qual && !trimmed.endsWith("" + qual + qual)) {
-                    // it is safe to assume we have reached the end of the
-                    // line break
-                    processingMultiLine = false;
-                    lineData.append(LINE_BREAK).append(line);
-                } else {
-                    // check to see if this is the last line of the record
-                    // looking for a qualifier followed by a delimiter
-                    lineData.append(LINE_BREAK).append(line);
-                    boolean qualiFound = false;
-                    for (final char element : chrArry) {
-                        if (qualiFound) {
-                            if (element == ' ') {
-                                continue;
-                            } else if (element == delim) {
-                                processingMultiLine = ParserUtils.isMultiLine(chrArry, delim, qual);
-                                break;
-                            }
-                            qualiFound = false;
-                        } else if (element == qual) {
-                            qualiFound = true;
+                // looking for the end of the text field
+                while(position < aСhrArray.length)
+                {
+                    if(aСhrArray[position] == aQualifier)
+                    {
+                        if(position == (aСhrArray.length - 1) || aСhrArray[position + 1] != aQualifier)
+                        {
+                            // end of text found
+                            position++;
+                            aMultiline = false;
+                            break;
+                        }
+                        else
+                        {
+                            // skipping an escaped qualifier like ""
+                            position += 2;
                         }
                     }
-
-                    // check to see if we are still in multi line mode, if
-                    // so grab the next line
-                    if (processingMultiLine) {
-                        continue;
+                    else
+                    {
+                        position++;
                     }
                 }
-            } else {
-                // throw the line into lineData var.
-                // need to check to see if we need to insert a line break.
-                // The buffered reader excludes the breaks
-                lineData.append(trimmedLen == 0 ? LINE_BREAK : line);
-                if (processingMultiLine) {
-                    continue; // if we are working on a multiline rec, get
-                    // the data on the next line
-                }
             }
-
-            break;
         }
+        while( position < aСhrArray.length - 1 );
 
-        if (line == null && lineData.length() == 0) {
-            // eof
-            return null;
-        }
-
-        return lineData.toString();
-
+        return aMultiline;
     }
 }
diff --git a/flatpack/src/test/java/net/sf/flatpack/delim/csv/CsvParserTest.java b/flatpack/src/test/java/net/sf/flatpack/delim/csv/CsvParserTest.java
@@ -69,6 +69,7 @@ public void testCsvWithWrongPzMap() {
      */
     public void testCsvDocumentWithMultilineString() {
         final String testCsv =
+            "col1,col2,col3,col4,col5,col6,col7" + System.lineSeparator() +
             "Bob,Smith,bsmiht@test.com,\"This is a long fragment of text" + System.lineSeparator() +
             "that should be processed as a single field\", 1988, 111-222-33,\"another field with new line character" + System.lineSeparator() +
             "that should be considered as a field of the same data row\"";
@@ -87,17 +88,16 @@ public void testCsvDocumentWithMultilineString() {
         final DelimiterParser parser = new DelimiterParser(bis, ',', '"', false);
         final DataSet result =  parser.parse();
 
+        // no errors should be in result, we should have 1 row with 7 columns
         assertThat(result.getErrorCount()).isEqualTo(0);
         assertThat(result.getColumns().length).isEqualTo(expectedResult.length);
         assertThat(result.getRowCount()).isEqualTo(1);
 
-        result.next();
-        String[] row = result.getColumns();
+        String[] columns = result.getColumns();
 
+        result.next();
         for (int i = 0; i < expectedResult.length; ++i) {
-            assertThat(expectedResult[i]).isEqualTo(row[i]);
+            assertThat(expectedResult[i]).isEqualTo(result.getString(columns[i]));
         }
-
     }
-
 }