Skip to content

Commit

Permalink
Handle fields missing in the source in ParquetNativeRecordReader (#7742)
Browse files Browse the repository at this point in the history
* Fix ParquetNativeRecordExtractor for fields missing in the source

* nit

* Same bug in proto
  • Loading branch information
npawar authored and xiangfu0 committed Nov 11, 2021
1 parent 0fa1580 commit 8ef2ed6
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ public GenericRow extract(Group from, GenericRow to) {
}
} else {
for (String fieldName : _fields) {
Object value = extractValue(from, fromType.getFieldIndex(fieldName));
Object value = fromType.containsField(fieldName) ? extractValue(from, fromType.getFieldIndex(fieldName)) : null;
if (value != null) {
value = convert(value);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.plugin.inputformat.parquet;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.pinot.plugin.inputformat.avro.AvroUtils;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.readers.AbstractRecordReaderTest;
import org.apache.pinot.spi.data.readers.RecordReader;


/**
 * Record-reader contract tests for {@link ParquetNativeRecordReader}: writes the
 * generated test rows out as a parquet file (via Avro) and reads them back.
 */
public class ParquetNativeRecordReaderTest extends AbstractRecordReaderTest {
  private final File _dataFile = new File(_tempDir, "data.parquet");

  /** Builds the reader under test, pointed at the parquet file produced by writeRecordsToFile. */
  @Override
  protected RecordReader createRecordReader()
      throws Exception {
    ParquetNativeRecordReader reader = new ParquetNativeRecordReader();
    reader.init(_dataFile, _sourceFields, null);
    return reader;
  }

  /**
   * Converts the given row maps into Avro records (using the Avro schema derived from the
   * Pinot schema) and writes them to {@code _dataFile} as parquet.
   */
  @Override
  protected void writeRecordsToFile(List<Map<String, Object>> recordsToWrite)
      throws Exception {
    Schema avroSchema = AvroUtils.getAvroSchemaFromPinotSchema(getPinotSchema());
    List<GenericRecord> avroRecords = new ArrayList<>(recordsToWrite.size());
    for (Map<String, Object> row : recordsToWrite) {
      GenericRecord avroRecord = new GenericData.Record(avroSchema);
      for (FieldSpec fieldSpec : getPinotSchema().getAllFieldSpecs()) {
        String fieldName = fieldSpec.getName();
        avroRecord.put(fieldName, row.get(fieldName));
      }
      avroRecords.add(avroRecord);
    }
    Path outputPath = new Path(_dataFile.getAbsolutePath());
    try (ParquetWriter<GenericRecord> writer = ParquetUtils.getParquetAvroWriter(outputPath, avroSchema)) {
      for (GenericRecord avroRecord : avroRecords) {
        writer.write(avroRecord);
      }
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public GenericRow extract(Message from, GenericRow to) {
} else {
for (String fieldName : _fields) {
Descriptors.FieldDescriptor fieldDescriptor = descriptor.findFieldByName(fieldName);
Object fieldValue = from.getField(fieldDescriptor);
Object fieldValue = fieldDescriptor != null ? from.getField(fieldDescriptor) : null;
if (fieldValue != null) {
fieldValue = convert(new ProtoBufFieldInfo(fieldValue, descriptor.findFieldByName(fieldName)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ protected List<String> getPrimaryKeyColumns() {
}

protected Set<String> getSourceFields(Schema schema) {
  // NOTE: the scraped diff left the removed pre-fix line (a bare early return of
  // the schema's columns) interleaved with the added lines, making the statements
  // below it unreachable; this is the reconstructed post-commit method body.
  Set<String> sourceFields = Sets.newHashSet(schema.getColumnNames());
  // Deliberately request a field absent from the source file so the test covers
  // the extractor's handling of missing fields (it must yield null, not throw).
  sourceFields.add("column_not_in_source");
  return sourceFields;
}

@BeforeClass
Expand Down

0 comments on commit 8ef2ed6

Please sign in to comment.