Skip to content

Commit

Permalink
Handle fields missing in the source in ParquetNativeRecordReader (#7742)
Browse files Browse the repository at this point in the history
* Fix ParquetNativeRecordExtractor for fields missing in the source

* nit

* Same bug in proto
  • Loading branch information
npawar authored and xiangfu0 committed Nov 11, 2021
1 parent 0fa1580 commit 8ef2ed6
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ public GenericRow extract(Group from, GenericRow to) {
}
} else {
for (String fieldName : _fields) {
Object value = extractValue(from, fromType.getFieldIndex(fieldName));
Object value = fromType.containsField(fieldName) ? extractValue(from, fromType.getFieldIndex(fieldName)) : null;
if (value != null) {
value = convert(value);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.plugin.inputformat.parquet;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.pinot.plugin.inputformat.avro.AvroUtils;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.readers.AbstractRecordReaderTest;
import org.apache.pinot.spi.data.readers.RecordReader;


/**
 * Record-reader contract tests for {@link ParquetNativeRecordReader}: writes the
 * generated test rows out as a parquet file (via Avro) and reads them back.
 */
public class ParquetNativeRecordReaderTest extends AbstractRecordReaderTest {
  private final File _dataFile = new File(_tempDir, "data.parquet");

  /** Builds the reader under test, pointed at the parquet file produced by writeRecordsToFile. */
  @Override
  protected RecordReader createRecordReader()
      throws Exception {
    ParquetNativeRecordReader reader = new ParquetNativeRecordReader();
    reader.init(_dataFile, _sourceFields, null);
    return reader;
  }

  /**
   * Converts the given row maps into Avro records (using the Avro schema derived from the
   * Pinot schema) and writes them to {@code _dataFile} as parquet.
   */
  @Override
  protected void writeRecordsToFile(List<Map<String, Object>> recordsToWrite)
      throws Exception {
    Schema avroSchema = AvroUtils.getAvroSchemaFromPinotSchema(getPinotSchema());
    List<GenericRecord> avroRecords = new ArrayList<>(recordsToWrite.size());
    for (Map<String, Object> row : recordsToWrite) {
      GenericRecord avroRecord = new GenericData.Record(avroSchema);
      for (FieldSpec fieldSpec : getPinotSchema().getAllFieldSpecs()) {
        String fieldName = fieldSpec.getName();
        avroRecord.put(fieldName, row.get(fieldName));
      }
      avroRecords.add(avroRecord);
    }
    Path outputPath = new Path(_dataFile.getAbsolutePath());
    try (ParquetWriter<GenericRecord> writer = ParquetUtils.getParquetAvroWriter(outputPath, avroSchema)) {
      for (GenericRecord avroRecord : avroRecords) {
        writer.write(avroRecord);
      }
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ public GenericRow extract(Message from, GenericRow to) {
} else {
for (String fieldName : _fields) {
Descriptors.FieldDescriptor fieldDescriptor = descriptor.findFieldByName(fieldName);
Object fieldValue = from.getField(fieldDescriptor);
Object fieldValue = fieldDescriptor != null ? from.getField(fieldDescriptor) : null;
if (fieldValue != null) {
fieldValue = convert(new ProtoBufFieldInfo(fieldValue, descriptor.findFieldByName(fieldName)));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ protected List<String> getPrimaryKeyColumns() {
}

protected Set<String> getSourceFields(Schema schema) {
  // NOTE: the scraped diff left the removed pre-fix line (a bare early return of
  // the schema's columns) interleaved with the added lines, making the statements
  // below it unreachable; this is the reconstructed post-commit method body.
  Set<String> sourceFields = Sets.newHashSet(schema.getColumnNames());
  // Deliberately request a field absent from the source file so the test covers
  // the extractor's handling of missing fields (it must yield null, not throw).
  sourceFields.add("column_not_in_source");
  return sourceFields;
}

@BeforeClass
Expand Down

0 comments on commit 8ef2ed6

Please sign in to comment.