From bb844c66d1b935e44a008cceba6acb7fcd068ea2 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Wed, 24 Mar 2021 00:05:23 -0400 Subject: [PATCH 01/16] update --- pom.xml | 5 + .../pubsublite/spark/CachedPublishers.java | 129 ++++++++++++++++++ .../cloud/pubsublite/spark/Constants.java | 14 +- .../spark/PslCredentialsProvider.java | 10 +- .../cloud/pubsublite/spark/PslDataSource.java | 58 +++++--- .../cloud/pubsublite/spark/PslDataWriter.java | 102 ++++++++++++++ .../spark/PslDataWriterFactory.java | 40 ++++++ ...ons.java => PslReadDataSourceOptions.java} | 8 +- .../cloud/pubsublite/spark/PslSparkUtils.java | 77 ++++++++++- .../pubsublite/spark/PslStreamWriter.java | 65 +++++++++ .../spark/PslWriteDataSourceOptions.java | 66 +++++++++ .../spark/PslWriterCommitMessage.java | 30 ++++ .../spark/PslContinuousReaderTest.java | 4 +- .../pubsublite/spark/PslDataWriterTest.java | 79 +++++++++++ .../spark/PslMicroBatchReaderTest.java | 4 +- ...java => PslReadDataSourceOptionsTest.java} | 4 +- .../pubsublite/spark/PslSparkUtilsTest.java | 76 +++++++++++ .../spark/PslWriteDataSourceOptionsTest.java | 35 +++++ 18 files changed, 767 insertions(+), 39 deletions(-) create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java rename src/main/java/com/google/cloud/pubsublite/spark/{PslDataSourceOptions.java => PslReadDataSourceOptions.java} (96%) create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PslWriterCommitMessage.java create mode 100644 src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java rename src/test/java/com/google/cloud/pubsublite/spark/{PslDataSourceOptionsTest.java => PslReadDataSourceOptionsTest.java} (89%) create mode 100644 src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java diff --git a/pom.xml b/pom.xml index c377b164..74123ab4 100644 --- a/pom.xml +++ b/pom.xml @@ -113,6 +113,11 @@ ${scala.version} provided + + org.scala-lang.modules + scala-java8-compat_2.11 + 0.9.1 + diff --git a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java new file mode 100644 index 00000000..84a5480e --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java @@ -0,0 +1,129 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import static com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultMetadata; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultSettings; + +import com.google.api.core.ApiService; +import com.google.api.gax.rpc.ApiException; +import com.google.cloud.pubsublite.AdminClient; +import com.google.cloud.pubsublite.AdminClientSettings; +import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.Partition; +import com.google.cloud.pubsublite.internal.CloseableMonitor; +import com.google.cloud.pubsublite.internal.Publisher; +import com.google.cloud.pubsublite.internal.wire.PartitionCountWatchingPublisherSettings; +import com.google.cloud.pubsublite.internal.wire.PubsubContext; +import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; +import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; +import com.google.cloud.pubsublite.v1.AdminServiceClient; +import com.google.cloud.pubsublite.v1.AdminServiceSettings; +import com.google.cloud.pubsublite.v1.PublisherServiceClient; +import com.google.cloud.pubsublite.v1.PublisherServiceSettings; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; +import javax.annotation.concurrent.GuardedBy; + +/** Cached {@link Publisher}s to reuse publisher of same settings in the same task. */ +public class CachedPublishers { + + private final CloseableMonitor monitor = new CloseableMonitor(); + + private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); + + @GuardedBy("monitor.monitor") + private static final Map> publishers = + new HashMap<>(); + + public Publisher getOrCreate(PslWriteDataSourceOptions writeOptions) { + try (CloseableMonitor.Hold h = monitor.enter()) { + Publisher publisher = publishers.get(writeOptions); + if (publisher != null) { + return publisher; + } + + publisher = createPublisherInternal(writeOptions); + publishers.put(writeOptions, publisher); + publisher.addListener( + new ApiService.Listener() { + @Override + public void failed(ApiService.State s, Throwable t) { + try (CloseableMonitor.Hold h = monitor.enter()) { + publishers.remove(writeOptions); + } + } + }, + listenerExecutor); + publisher.startAsync().awaitRunning(); + return publisher; + } + } + + private PublisherServiceClient newServiceClient( + PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { + PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); + settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); + settingsBuilder = + addDefaultMetadata( + PubsubContext.of(Constants.FRAMEWORK), + RoutingMetadata.of(writeOptions.topicPath(), partition), + settingsBuilder); + try { + return PublisherServiceClient.create( + addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); + } catch (Throwable t) { + throw toCanonical(t).underlying; + } + } + + private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throws ApiException { + try { + return AdminClient.create( + AdminClientSettings.newBuilder() + .setServiceClient( + AdminServiceClient.create( + addDefaultSettings( + writeOptions.topicPath().location().region(), + AdminServiceSettings.newBuilder() + 
.setCredentialsProvider(writeOptions.getCredentialProvider())))) + .setRegion(writeOptions.topicPath().location().region()) + .build()); + } catch (Throwable t) { + throw toCanonical(t).underlying; + } + } + + private Publisher createPublisherInternal( + PslWriteDataSourceOptions writeOptions) { + return PartitionCountWatchingPublisherSettings.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPublisherFactory( + partition -> + SinglePartitionPublisherBuilder.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPartition(partition) + .setServiceClient(newServiceClient(writeOptions, partition)) + .build()) + .setAdminClient(getAdminClient(writeOptions)) + .build() + .instantiate(); + } +} diff --git a/src/main/java/com/google/cloud/pubsublite/spark/Constants.java b/src/main/java/com/google/cloud/pubsublite/spark/Constants.java index cac4337a..b8877745 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/Constants.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/Constants.java @@ -17,7 +17,9 @@ package com.google.cloud.pubsublite.spark; import com.google.cloud.pubsublite.internal.wire.PubsubContext; +import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -26,6 +28,10 @@ public class Constants { public static long DEFAULT_BYTES_OUTSTANDING = 50_000_000; public static long DEFAULT_MESSAGES_OUTSTANDING = Long.MAX_VALUE; public static long DEFAULT_MAX_MESSAGES_PER_BATCH = Long.MAX_VALUE; + public static ArrayType ATTRIBUTES_PER_KEY_DATATYPE = + DataTypes.createArrayType(DataTypes.BinaryType); + public static MapType ATTRIBUTES_DATATYPE = + DataTypes.createMapType(DataTypes.StringType, ATTRIBUTES_PER_KEY_DATATYPE); public static StructType DEFAULT_SCHEMA = new StructType( new StructField[] { @@ -36,12 +42,7 @@ public class Constants { new StructField("data", DataTypes.BinaryType, false, Metadata.empty()), new StructField("publish_timestamp", DataTypes.TimestampType, false, Metadata.empty()), new StructField("event_timestamp", DataTypes.TimestampType, true, Metadata.empty()), - new StructField( - "attributes", - DataTypes.createMapType( - DataTypes.StringType, DataTypes.createArrayType(DataTypes.BinaryType)), - true, - Metadata.empty()) + new StructField("attributes", ATTRIBUTES_DATATYPE, true, Metadata.empty()) }); public static final PubsubContext.Framework FRAMEWORK = PubsubContext.Framework.of("SPARK"); @@ -52,6 +53,7 @@ public class Constants { "pubsublite.flowcontrol.byteoutstandingperpartition"; public static String MESSAGES_OUTSTANDING_CONFIG_KEY = "pubsublite.flowcontrol.messageoutstandingperparition"; + public static String TOPIC_CONFIG_KEY = "pubsublite.topic"; public static String SUBSCRIPTION_CONFIG_KEY = "pubsublite.subscription"; public static String CREDENTIALS_KEY_CONFIG_KEY = "gcp.credentials.key"; } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java b/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java index 6dce5272..53eac0a2 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java @@ -28,7 +28,15 @@ public class PslCredentialsProvider implements CredentialsProvider { private final Credentials credentials; - public PslCredentialsProvider(PslDataSourceOptions 
options) { + public PslCredentialsProvider(PslReadDataSourceOptions options) { + if (options.credentialsKey() != null) { + this.credentials = createCredentialsFromKey(options.credentialsKey()); + } else { + this.credentials = createDefaultCredentials(); + } + } + + public PslCredentialsProvider(PslWriteDataSourceOptions options) { if (options.credentialsKey() != null) { this.credentials = createCredentialsFromKey(options.credentialsKey()); } else { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index 08a96ee8..8657e55a 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -30,13 +30,20 @@ import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.apache.spark.sql.sources.v2.DataSourceV2; import org.apache.spark.sql.sources.v2.MicroBatchReadSupport; +import org.apache.spark.sql.sources.v2.StreamWriteSupport; import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader; import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader; +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; +import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; @AutoService(DataSourceRegister.class) public final class PslDataSource - implements DataSourceV2, ContinuousReadSupport, MicroBatchReadSupport, DataSourceRegister { + implements DataSourceV2, + ContinuousReadSupport, + MicroBatchReadSupport, + StreamWriteSupport, + DataSourceRegister { @Override public String shortName() { @@ -51,23 +58,24 @@ public ContinuousReader createContinuousReader( "PubSub Lite uses fixed schema and custom schema is not allowed"); } - PslDataSourceOptions pslDataSourceOptions = - PslDataSourceOptions.fromSparkDataSourceOptions(options); - SubscriptionPath subscriptionPath = pslDataSourceOptions.subscriptionPath(); + PslReadDataSourceOptions pslReadDataSourceOptions = + PslReadDataSourceOptions.fromSparkDataSourceOptions(options); + SubscriptionPath subscriptionPath = pslReadDataSourceOptions.subscriptionPath(); TopicPath topicPath; - try (AdminClient adminClient = pslDataSourceOptions.newAdminClient()) { + try (AdminClient adminClient = pslReadDataSourceOptions.newAdminClient()) { topicPath = TopicPath.parse(adminClient.getSubscription(subscriptionPath).get().getTopic()); } catch (Throwable t) { throw toCanonical(t).underlying; } PartitionCountReader partitionCountReader = - new CachedPartitionCountReader(pslDataSourceOptions.newAdminClient(), topicPath); + new CachedPartitionCountReader(pslReadDataSourceOptions.newAdminClient(), topicPath); return new PslContinuousReader( - pslDataSourceOptions.newCursorClient(), - pslDataSourceOptions.newMultiPartitionCommitter(partitionCountReader.getPartitionCount()), - pslDataSourceOptions.getSubscriberFactory(), + pslReadDataSourceOptions.newCursorClient(), + pslReadDataSourceOptions.newMultiPartitionCommitter( + partitionCountReader.getPartitionCount()), + pslReadDataSourceOptions.getSubscriberFactory(), subscriptionPath, - Objects.requireNonNull(pslDataSourceOptions.flowControlSettings()), + Objects.requireNonNull(pslReadDataSourceOptions.flowControlSettings()), partitionCountReader); } @@ -79,28 +87,36 @@ public MicroBatchReader createMicroBatchReader( "PubSub Lite uses fixed schema and custom schema is not allowed"); } - PslDataSourceOptions pslDataSourceOptions = - 
PslDataSourceOptions.fromSparkDataSourceOptions(options); - SubscriptionPath subscriptionPath = pslDataSourceOptions.subscriptionPath(); + PslReadDataSourceOptions pslReadDataSourceOptions = + PslReadDataSourceOptions.fromSparkDataSourceOptions(options); + SubscriptionPath subscriptionPath = pslReadDataSourceOptions.subscriptionPath(); TopicPath topicPath; - try (AdminClient adminClient = pslDataSourceOptions.newAdminClient()) { + try (AdminClient adminClient = pslReadDataSourceOptions.newAdminClient()) { topicPath = TopicPath.parse(adminClient.getSubscription(subscriptionPath).get().getTopic()); } catch (Throwable t) { throw toCanonical(t).underlying; } PartitionCountReader partitionCountReader = - new CachedPartitionCountReader(pslDataSourceOptions.newAdminClient(), topicPath); + new CachedPartitionCountReader(pslReadDataSourceOptions.newAdminClient(), topicPath); return new PslMicroBatchReader( - pslDataSourceOptions.newCursorClient(), - pslDataSourceOptions.newMultiPartitionCommitter(partitionCountReader.getPartitionCount()), - pslDataSourceOptions.getSubscriberFactory(), + pslReadDataSourceOptions.newCursorClient(), + pslReadDataSourceOptions.newMultiPartitionCommitter( + partitionCountReader.getPartitionCount()), + pslReadDataSourceOptions.getSubscriberFactory(), new LimitingHeadOffsetReader( - pslDataSourceOptions.newTopicStatsClient(), + pslReadDataSourceOptions.newTopicStatsClient(), topicPath, partitionCountReader, Ticker.systemTicker()), subscriptionPath, - Objects.requireNonNull(pslDataSourceOptions.flowControlSettings()), - pslDataSourceOptions.maxMessagesPerBatch()); + Objects.requireNonNull(pslReadDataSourceOptions.flowControlSettings()), + pslReadDataSourceOptions.maxMessagesPerBatch()); + } + + @Override + public StreamWriter createStreamWriter( + String queryId, StructType schema, OutputMode mode, DataSourceOptions options) { + return new PslStreamWriter( + schema, PslWriteDataSourceOptions.fromSparkDataSourceOptions(options)); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java new file mode 100644 index 00000000..94da4b92 --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -0,0 +1,102 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.google.cloud.pubsublite.spark;
+
+import com.google.api.core.ApiFuture;
+import com.google.cloud.pubsublite.MessageMetadata;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.flogger.GoogleLogger;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.concurrent.ExecutionException;
+import javax.annotation.concurrent.GuardedBy;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.sources.v2.writer.DataWriter;
+import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage;
+import org.apache.spark.sql.types.StructType;
+
+public class PslDataWriter implements DataWriter<InternalRow> {
+
+  private static final GoogleLogger log = GoogleLogger.forEnclosingClass();
+
+  private static final CachedPublishers CACHED_PUBLISHERS = new CachedPublishers();
+
+  private final long partitionId, taskId, epochId;
+  private final StructType inputSchema;
+  private final PslWriteDataSourceOptions writeOptions;
+  private final CachedPublishers cachedPublishers; // just a reference
+
+  @GuardedBy("this")
+  private final List<ApiFuture<MessageMetadata>> futures = new ArrayList<>();
+
+  public PslDataWriter(
+      long partitionId,
+      long taskId,
+      long epochId,
+      StructType schema,
+      PslWriteDataSourceOptions writeOptions) {
+    this(partitionId, taskId, epochId, schema, writeOptions, CACHED_PUBLISHERS);
+  }
+
+  @VisibleForTesting
+  public PslDataWriter(
+      long partitionId,
+      long taskId,
+      long epochId,
+      StructType schema,
+      PslWriteDataSourceOptions writeOptions,
+      CachedPublishers cachedPublishers) {
+    this.partitionId = partitionId;
+    this.taskId = taskId;
+    this.epochId = epochId;
+    this.inputSchema = schema;
+    this.writeOptions = writeOptions;
+    this.cachedPublishers = cachedPublishers;
+  }
+
+  @Override
+  public synchronized void write(InternalRow record) {
+    futures.add(
+        cachedPublishers
+            .getOrCreate(writeOptions)
+            .publish(Objects.requireNonNull(PslSparkUtils.toPubSubMessage(inputSchema, record))));
+  }
+
+  @Override
+  public synchronized WriterCommitMessage commit() throws IOException {
+    for (ApiFuture<MessageMetadata> f : futures) {
+      try {
+        f.get();
+      } catch (InterruptedException | ExecutionException e) {
+        throw new IOException(e);
+      }
+    }
+    log.atInfo().log(
+        "All writes for partitionId:%d, taskId:%d, epochId:%d succeeded, committing...",
+        partitionId, taskId, epochId);
+    return PslWriterCommitMessage.create(futures.size());
+  }
+
+  @Override
+  public synchronized void abort() {
+    log.atWarning().log(
+        "One or more writes for partitionId:%d, taskId:%d, epochId:%d failed, aborted.",
+        partitionId, taskId, epochId);
+  }
+}
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java
new file mode 100644
index 00000000..0a1e0cb7
--- /dev/null
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.pubsublite.spark; + +import java.io.Serializable; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.sources.v2.writer.DataWriter; +import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; +import org.apache.spark.sql.types.StructType; + +public class PslDataWriterFactory implements Serializable, DataWriterFactory { + private static final long serialVersionUID = -6904546364310978844L; + + private final StructType inputSchema; + private final PslWriteDataSourceOptions writeOptions; + + public PslDataWriterFactory(StructType inputSchema, PslWriteDataSourceOptions writeOptions) { + this.inputSchema = inputSchema; + this.writeOptions = writeOptions; + } + + @Override + public DataWriter createDataWriter(int partitionId, long taskId, long epochId) { + return new PslDataWriter(partitionId, taskId, epochId, inputSchema, writeOptions); + } +} diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java similarity index 96% rename from src/main/java/com/google/cloud/pubsublite/spark/PslDataSourceOptions.java rename to src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java index 380e022a..f9d6c990 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java @@ -47,7 +47,7 @@ import org.apache.spark.sql.sources.v2.DataSourceOptions; @AutoValue -public abstract class PslDataSourceOptions implements Serializable { +public abstract class PslReadDataSourceOptions implements Serializable { private static final long serialVersionUID = 2680059304693561607L; @Nullable @@ -60,7 +60,7 @@ public abstract class PslDataSourceOptions implements Serializable { public abstract long maxMessagesPerBatch(); public static Builder builder() { - return new AutoValue_PslDataSourceOptions.Builder() + return new AutoValue_PslReadDataSourceOptions.Builder() .setCredentialsKey(null) .setMaxMessagesPerBatch(Constants.DEFAULT_MAX_MESSAGES_PER_BATCH) .setFlowControlSettings( @@ -70,7 +70,7 @@ public static Builder builder() { .build()); } - public static PslDataSourceOptions fromSparkDataSourceOptions(DataSourceOptions options) { + public static PslReadDataSourceOptions fromSparkDataSourceOptions(DataSourceOptions options) { if (!options.get(Constants.SUBSCRIPTION_CONFIG_KEY).isPresent()) { throw new IllegalArgumentException(Constants.SUBSCRIPTION_CONFIG_KEY + " is required."); } @@ -115,7 +115,7 @@ public abstract static class Builder { public abstract Builder setFlowControlSettings(FlowControlSettings flowControlSettings); - public abstract PslDataSourceOptions build(); + public abstract PslReadDataSourceOptions build(); } MultiPartitionCommitter newMultiPartitionCommitter(long topicPartitionCount) { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java index 1d54fe19..e3df2f6f 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java @@ -19,11 +19,15 @@ import static com.google.common.base.Preconditions.checkArgument; import static scala.collection.JavaConverters.asScalaBufferConverter; +import com.google.cloud.pubsublite.Message; 
import com.google.cloud.pubsublite.Offset; import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.SequencedMessage; import com.google.cloud.pubsublite.SubscriptionPath; import com.google.cloud.pubsublite.internal.CursorClient; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ImmutableListMultimap; import com.google.common.collect.ListMultimap; import com.google.common.math.LongMath; import com.google.protobuf.ByteString; @@ -34,15 +38,23 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ExecutionException; +import java.util.function.Consumer; import java.util.stream.Collectors; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.ArrayBasedMapData; +import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.GenericArrayData; +import org.apache.spark.sql.catalyst.util.MapData; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.ByteArray; import org.apache.spark.unsafe.types.UTF8String; +import scala.compat.java8.functionConverterImpls.FromJavaBiConsumer; public class PslSparkUtils { - private static ArrayBasedMapData convertAttributesToSparkMap( + @VisibleForTesting + public static ArrayBasedMapData convertAttributesToSparkMap( ListMultimap attributeMap) { List keyList = new ArrayList<>(); @@ -83,6 +95,69 @@ public static InternalRow toInternalRow( return InternalRow.apply(asScalaBufferConverter(list).asScala()); } + private static void extractVal( + StructType inputSchema, + InternalRow row, + String fieldName, + DataType expectedDataType, + Consumer consumer) { + if (!inputSchema.getFieldIndex(fieldName).isEmpty()) { + Integer idx = (Integer) inputSchema.getFieldIndex(fieldName).get(); + try { + consumer.accept(row.get(idx, expectedDataType)); + } catch (ClassCastException e) { + // This means the field has a wrong class type. 
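+        // The mismatch is swallowed deliberately: the field is left unset on
+        // the Message builder, so the message is built without it rather than
+        // failing the whole write.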
+ } + } + } + + public static Message toPubSubMessage(StructType inputSchema, InternalRow row) { + Message.Builder builder = Message.builder(); + extractVal( + inputSchema, + row, + "key", + DataTypes.BinaryType, + o -> builder.setKey(ByteString.copyFrom((byte[]) o))); + extractVal( + inputSchema, + row, + "data", + DataTypes.BinaryType, + o -> builder.setData(ByteString.copyFrom((byte[]) o))); + extractVal( + inputSchema, + row, + "event_timestamp", + DataTypes.TimestampType, + o -> builder.setEventTime(Timestamps.fromMicros((long) o))); + extractVal( + inputSchema, + row, + "attributes", + Constants.ATTRIBUTES_DATATYPE, + o -> { + MapData mapData = (MapData) o; + ListMultimap attributeMap = ArrayListMultimap.create(); + mapData.foreach( + DataTypes.StringType, + Constants.ATTRIBUTES_PER_KEY_DATATYPE, + new FromJavaBiConsumer<>( + (k, v) -> { + String key = ((UTF8String) k).toString(); + ArrayData values = (ArrayData) v; + values.foreach( + DataTypes.BinaryType, + new FromJavaBiConsumer<>( + (idx, a) -> { + attributeMap.put(key, ByteString.copyFrom((byte[]) a)); + })); + })); + builder.setAttributes(ImmutableListMultimap.copyOf(attributeMap)); + }); + return builder.build(); + } + public static SparkSourceOffset toSparkSourceOffset(PslSourceOffset pslSourceOffset) { return new SparkSourceOffset( pslSourceOffset.partitionOffsetMap().entrySet().stream() diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java new file mode 100644 index 00000000..ddd424da --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -0,0 +1,65 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import static com.google.common.base.Preconditions.checkArgument; + +import com.google.common.flogger.GoogleLogger; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; +import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; +import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter; +import org.apache.spark.sql.types.StructType; + +public class PslStreamWriter implements StreamWriter { + + private static final GoogleLogger log = GoogleLogger.forEnclosingClass(); + + private final StructType inputSchema; + private final PslWriteDataSourceOptions writeOptions; + + public PslStreamWriter(StructType schema, PslWriteDataSourceOptions writeOptions) { + this.inputSchema = schema; + this.writeOptions = writeOptions; + } + + @Override + public void commit(long epochId, WriterCommitMessage[] messages) { + log.atInfo().log("Committed %d messages for epochId:%d.", countMessages(messages), epochId); + } + + @Override + public void abort(long epochId, WriterCommitMessage[] messages) { + log.atWarning().log( + "Epoch id: %d is aborted, including %d messages.", epochId, countMessages(messages)); + } + + private long countMessages(WriterCommitMessage[] messages) { + long cnt = 0; + for (WriterCommitMessage m : messages) { + checkArgument( + m instanceof PslWriterCommitMessage, "commit message not typed PslWriterCommitMessage"); + cnt += ((PslWriterCommitMessage) m).numMessages(); + } + return cnt; + } + + @Override + public DataWriterFactory createWriterFactory() { + return new PslDataWriterFactory(inputSchema, writeOptions); + } +} diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java new file mode 100644 index 00000000..e8a467a9 --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -0,0 +1,66 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import com.google.api.gax.rpc.ApiException; +import com.google.auto.value.AutoValue; +import com.google.cloud.pubsublite.TopicPath; +import javax.annotation.Nullable; +import org.apache.spark.sql.sources.v2.DataSourceOptions; + +@AutoValue +public abstract class PslWriteDataSourceOptions { + + @Nullable + public abstract String credentialsKey(); + + public abstract TopicPath topicPath(); + + public static Builder builder() { + return new AutoValue_PslWriteDataSourceOptions.Builder().setCredentialsKey(null); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract PslWriteDataSourceOptions.Builder setCredentialsKey(String credentialsKey); + + public abstract PslWriteDataSourceOptions.Builder setTopicPath(TopicPath topicPath); + + public abstract PslWriteDataSourceOptions build(); + } + + public static PslWriteDataSourceOptions fromSparkDataSourceOptions(DataSourceOptions options) { + if (!options.get(Constants.TOPIC_CONFIG_KEY).isPresent()) { + throw new IllegalArgumentException(Constants.TOPIC_CONFIG_KEY + " is required."); + } + + Builder builder = builder(); + String topicPathVal = options.get(Constants.TOPIC_CONFIG_KEY).get(); + try { + builder.setTopicPath(TopicPath.parse(topicPathVal)); + } catch (ApiException e) { + throw new IllegalArgumentException("Unable to parse topic path " + topicPathVal, e); + } + options.get(Constants.CREDENTIALS_KEY_CONFIG_KEY).ifPresent(builder::setCredentialsKey); + return builder.build(); + } + + public PslCredentialsProvider getCredentialProvider() { + return new PslCredentialsProvider(this); + } +} diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriterCommitMessage.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriterCommitMessage.java new file mode 100644 index 00000000..9204d169 --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriterCommitMessage.java @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import com.google.auto.value.AutoValue; +import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; + +@AutoValue +public abstract class PslWriterCommitMessage implements WriterCommitMessage { + + public abstract long numMessages(); + + public static PslWriterCommitMessage create(long numMessages) { + return new AutoValue_PslWriterCommitMessage(numMessages); + } +} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java index 36bcdf91..3cfde250 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java @@ -30,8 +30,8 @@ public class PslContinuousReaderTest { - private static final PslDataSourceOptions OPTIONS = - PslDataSourceOptions.builder() + private static final PslReadDataSourceOptions OPTIONS = + PslReadDataSourceOptions.builder() .setSubscriptionPath(UnitTestExamples.exampleSubscriptionPath()) .build(); private final CursorClient cursorClient = mock(CursorClient.class); diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java new file mode 100644 index 00000000..70e359a0 --- /dev/null +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -0,0 +1,79 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import com.google.api.core.ApiFutures; +import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.Offset; +import com.google.cloud.pubsublite.Partition; +import com.google.cloud.pubsublite.internal.Publisher; +import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import java.io.IOException; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.DataType; +import org.junit.Test; + +public class PslDataWriterTest { + + private final InternalRow row = mock(InternalRow.class); + + @SuppressWarnings("unchecked") + private final Publisher publisher = mock(Publisher.class); + + private final CachedPublishers cachedPublishers = mock(CachedPublishers.class); + private final PslDataWriter writer = + new PslDataWriter( + 1L, + 2L, + 3L, + Constants.DEFAULT_SCHEMA, + PslWriteDataSourceOptions.builder() + .setTopicPath(UnitTestExamples.exampleTopicPath()) + .build(), + cachedPublishers); + + @Test + public void testAllSuccess() throws IOException { + when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(publisher.publish(any())) + .thenReturn( + ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); + when(row.get(anyInt(), any(DataType.class))).thenReturn(0); + writer.write(row); + writer.write(row); + assertThat(writer.commit()).isEqualTo(PslWriterCommitMessage.create(2)); + } + + @Test + public void testPartialFail() { + when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(publisher.publish(any())) + .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) + .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); + when(row.get(anyInt(), any(DataType.class))).thenReturn(0); + writer.write(row); + writer.write(row); + assertThrows(IOException.class, writer::commit); + } +} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java index 3692e7a5..4b1f6cac 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java @@ -33,8 +33,8 @@ import org.junit.Test; public class PslMicroBatchReaderTest { - private static final PslDataSourceOptions OPTIONS = - PslDataSourceOptions.builder() + private static final PslReadDataSourceOptions OPTIONS = + PslReadDataSourceOptions.builder() .setSubscriptionPath(UnitTestExamples.exampleSubscriptionPath()) .build(); private final CursorClient cursorClient = mock(CursorClient.class); diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptionsTest.java similarity index 89% rename from src/test/java/com/google/cloud/pubsublite/spark/PslDataSourceOptionsTest.java rename to src/test/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptionsTest.java index bc794ead..2db8f705 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataSourceOptionsTest.java +++ 
b/src/test/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptionsTest.java @@ -22,7 +22,7 @@ import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.junit.Test; -public class PslDataSourceOptionsTest { +public class PslReadDataSourceOptionsTest { @Test public void testInvalidSubPath() { @@ -30,6 +30,6 @@ public void testInvalidSubPath() { new DataSourceOptions(ImmutableMap.of(Constants.SUBSCRIPTION_CONFIG_KEY, "invalid/path")); assertThrows( IllegalArgumentException.class, - () -> PslDataSourceOptions.fromSparkDataSourceOptions(options)); + () -> PslReadDataSourceOptions.fromSparkDataSourceOptions(options)); } } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java index b3b81246..d49876e6 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java @@ -17,6 +17,7 @@ package com.google.cloud.pubsublite.spark; import static com.google.common.truth.Truth.assertThat; +import static scala.collection.JavaConverters.asScalaBufferConverter; import com.google.cloud.pubsublite.Message; import com.google.cloud.pubsublite.Offset; @@ -29,10 +30,18 @@ import com.google.protobuf.Timestamp; import com.google.protobuf.util.Timestamps; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.GenericArrayData; import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.ByteArray; import org.junit.Test; public class PslSparkUtilsTest { @@ -105,4 +114,71 @@ public void testToPslPartitionOffset() { assertThat(PslSparkUtils.toPslPartitionOffset(sparkPartitionOffset)) .isEqualTo(pslPartitionOffset); } + + @Test + public void testToPubSubMessage() { + Timestamp eventTimestamp = Timestamp.newBuilder().setSeconds(10000000L).build(); + Message message = + Message.builder() + .setKey(ByteString.copyFromUtf8("key")) + .setData(ByteString.copyFromUtf8("data")) + .setEventTime(eventTimestamp) + .setAttributes( + ImmutableListMultimap.of( + "key1", ByteString.copyFromUtf8("val1"), + "key1", ByteString.copyFromUtf8("val2"), + "key2", ByteString.copyFromUtf8("val3"))) + .build(); + List list = + new ArrayList<>( + Arrays.asList( + ByteArray.concat(message.key().toByteArray()), + ByteArray.concat(message.data().toByteArray()), + PslSparkUtils.convertAttributesToSparkMap(message.attributes()), + Timestamps.toMicros(message.eventTime().get()), + "abc".getBytes())); + InternalRow row = InternalRow.apply(asScalaBufferConverter(list).asScala()); + + StructType structType = + new StructType( + new StructField[] { + new StructField("key", DataTypes.BinaryType, false, Metadata.empty()), + new StructField("data", DataTypes.BinaryType, false, Metadata.empty()), + new StructField("attributes", Constants.ATTRIBUTES_DATATYPE, true, Metadata.empty()), + new StructField("event_timestamp", DataTypes.TimestampType, true, Metadata.empty()), + new StructField("random_extra_field", DataTypes.BinaryType, false, Metadata.empty()) + }); + + assertThat(message).isEqualTo(PslSparkUtils.toPubSubMessage(structType, row)); + } + + 
@Test + public void testToPubSubMessageTypeMismatch() { + StructType structType = + new StructType( + new StructField[] { + new StructField("key", DataTypes.TimestampType, false, Metadata.empty()) + }); + List list = Collections.singletonList(/*Timestamp=*/ 100000L); + InternalRow row = InternalRow.apply(asScalaBufferConverter(list).asScala()); + + Message message = PslSparkUtils.toPubSubMessage(structType, row); + assertThat(message).isEqualTo(Message.builder().build()); + } + + @Test + public void testToPubSubMessageLongForEventTimestamp() { + Message expectedMsg = Message.builder().setEventTime(Timestamps.fromMicros(100000L)).build(); + + StructType structType = + new StructType( + new StructField[] { + new StructField("event_timestamp", DataTypes.LongType, false, Metadata.empty()) + }); + List list = Collections.singletonList(/*Timestamp=*/ 100000L); + InternalRow row = InternalRow.apply(asScalaBufferConverter(list).asScala()); + + Message message = PslSparkUtils.toPubSubMessage(structType, row); + assertThat(message).isEqualTo(expectedMsg); + } } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java new file mode 100644 index 00000000..5cf10f50 --- /dev/null +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -0,0 +1,35 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.ImmutableMap; +import org.apache.spark.sql.sources.v2.DataSourceOptions; +import org.junit.Test; + +public class PslWriteDataSourceOptionsTest { + + @Test + public void testInvalidTopicPath() { + DataSourceOptions options = + new DataSourceOptions(ImmutableMap.of(Constants.TOPIC_CONFIG_KEY, "invalid/path")); + assertThrows( + IllegalArgumentException.class, + () -> PslWriteDataSourceOptions.fromSparkDataSourceOptions(options)); + } +} From ad964be51a7b9e19eeef06208a9f04d0a2976b1e Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Mon, 29 Mar 2021 22:34:12 -0400 Subject: [PATCH 02/16] update --- .../pubsublite/spark/CachedPublishers.java | 81 +++---------------- .../cloud/pubsublite/spark/PslDataSource.java | 5 +- .../cloud/pubsublite/spark/PslDataWriter.java | 17 ++-- .../spark/PslDataWriterFactory.java | 12 ++- .../pubsublite/spark/PslStreamWriter.java | 11 ++- .../spark/PslWriteDataSourceOptions.java | 72 +++++++++++++++++ .../pubsublite/spark/PublisherFactory.java | 30 +++++++ .../spark/PslDataWriterFactoryTest.java | 49 +++++++++++ .../pubsublite/spark/PslDataWriterTest.java | 9 +-- 9 files changed, 194 insertions(+), 92 deletions(-) create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java create mode 100644 src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java diff --git a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java index 84a5480e..0568d8e6 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java @@ -16,31 +16,17 @@ package com.google.cloud.pubsublite.spark; -import static com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical; -import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultMetadata; -import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultSettings; - import com.google.api.core.ApiService; -import com.google.api.gax.rpc.ApiException; -import com.google.cloud.pubsublite.AdminClient; -import com.google.cloud.pubsublite.AdminClientSettings; import com.google.cloud.pubsublite.MessageMetadata; -import com.google.cloud.pubsublite.Partition; +import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.CloseableMonitor; import com.google.cloud.pubsublite.internal.Publisher; -import com.google.cloud.pubsublite.internal.wire.PartitionCountWatchingPublisherSettings; -import com.google.cloud.pubsublite.internal.wire.PubsubContext; -import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; -import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; -import com.google.cloud.pubsublite.v1.AdminServiceClient; -import com.google.cloud.pubsublite.v1.AdminServiceSettings; -import com.google.cloud.pubsublite.v1.PublisherServiceClient; -import com.google.cloud.pubsublite.v1.PublisherServiceSettings; + +import javax.annotation.concurrent.GuardedBy; import java.util.HashMap; import java.util.Map; import java.util.concurrent.Executor; import java.util.concurrent.Executors; -import javax.annotation.concurrent.GuardedBy; /** Cached {@link Publisher}s to reuse publisher of same settings in the same task. 
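 * <p>Entries are keyed by {@link TopicPath}: a publisher whose underlying
 * {@link ApiService} enters a failed state is evicted by its listener, so the
 * next write for that topic constructs a fresh publisher.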
*/ public class CachedPublishers { @@ -50,24 +36,24 @@ public class CachedPublishers { private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); @GuardedBy("monitor.monitor") - private static final Map> publishers = + private static final Map> publishers = new HashMap<>(); - public Publisher getOrCreate(PslWriteDataSourceOptions writeOptions) { + public Publisher getOrCreate(TopicPath topicPath, PublisherFactory publisherFactory) { try (CloseableMonitor.Hold h = monitor.enter()) { - Publisher publisher = publishers.get(writeOptions); + Publisher publisher = publishers.get(topicPath); if (publisher != null) { return publisher; } - publisher = createPublisherInternal(writeOptions); - publishers.put(writeOptions, publisher); + publisher = publisherFactory.newPublisher(topicPath); + publishers.put(topicPath, publisher); publisher.addListener( new ApiService.Listener() { @Override public void failed(ApiService.State s, Throwable t) { try (CloseableMonitor.Hold h = monitor.enter()) { - publishers.remove(writeOptions); + publishers.remove(topicPath); } } }, @@ -77,53 +63,4 @@ public void failed(ApiService.State s, Throwable t) { } } - private PublisherServiceClient newServiceClient( - PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { - PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); - settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); - settingsBuilder = - addDefaultMetadata( - PubsubContext.of(Constants.FRAMEWORK), - RoutingMetadata.of(writeOptions.topicPath(), partition), - settingsBuilder); - try { - return PublisherServiceClient.create( - addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); - } catch (Throwable t) { - throw toCanonical(t).underlying; - } - } - - private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throws ApiException { - try { - return AdminClient.create( - AdminClientSettings.newBuilder() - .setServiceClient( - AdminServiceClient.create( - addDefaultSettings( - writeOptions.topicPath().location().region(), - AdminServiceSettings.newBuilder() - .setCredentialsProvider(writeOptions.getCredentialProvider())))) - .setRegion(writeOptions.topicPath().location().region()) - .build()); - } catch (Throwable t) { - throw toCanonical(t).underlying; - } - } - - private Publisher createPublisherInternal( - PslWriteDataSourceOptions writeOptions) { - return PartitionCountWatchingPublisherSettings.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPublisherFactory( - partition -> - SinglePartitionPublisherBuilder.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPartition(partition) - .setServiceClient(newServiceClient(writeOptions, partition)) - .build()) - .setAdminClient(getAdminClient(writeOptions)) - .build() - .instantiate(); - } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index 8657e55a..5f45266d 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -116,7 +116,10 @@ public MicroBatchReader createMicroBatchReader( @Override public StreamWriter createStreamWriter( String queryId, StructType schema, OutputMode mode, DataSourceOptions options) { + PslWriteDataSourceOptions pslWriteDataSourceOptions = + PslWriteDataSourceOptions.fromSparkDataSourceOptions(options); return new 
PslStreamWriter( - schema, PslWriteDataSourceOptions.fromSparkDataSourceOptions(options)); + schema, pslWriteDataSourceOptions.topicPath(), + pslWriteDataSourceOptions.getPublisherFactory()); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java index 94da4b92..b82eb12e 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -18,6 +18,7 @@ import com.google.api.core.ApiFuture; import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.TopicPath; import com.google.common.annotations.VisibleForTesting; import com.google.common.flogger.GoogleLogger; import java.io.IOException; @@ -39,7 +40,8 @@ public class PslDataWriter implements DataWriter { private final long partitionId, taskId, epochId; private final StructType inputSchema; - private final PslWriteDataSourceOptions writeOptions; + private final TopicPath topicPath; + private final PublisherFactory publisherFactory; private final CachedPublishers cachedPublishers; // just a reference @GuardedBy("this") @@ -50,8 +52,9 @@ public PslDataWriter( long taskId, long epochId, StructType schema, - PslWriteDataSourceOptions writeOptions) { - this(partitionId, taskId, epochId, schema, writeOptions, CACHED_PUBLISHERS); + TopicPath topicPath, + PublisherFactory publisherFactory) { + this(partitionId, taskId, epochId, schema, topicPath, publisherFactory, CACHED_PUBLISHERS); } @VisibleForTesting @@ -60,13 +63,15 @@ public PslDataWriter( long taskId, long epochId, StructType schema, - PslWriteDataSourceOptions writeOptions, + TopicPath topicPath, + PublisherFactory publisherFactory, CachedPublishers cachedPublishers) { this.partitionId = partitionId; this.taskId = taskId; this.epochId = epochId; this.inputSchema = schema; - this.writeOptions = writeOptions; + this.topicPath = topicPath; + this.publisherFactory = publisherFactory; this.cachedPublishers = cachedPublishers; } @@ -74,7 +79,7 @@ public PslDataWriter( public synchronized void write(InternalRow record) { futures.add( cachedPublishers - .getOrCreate(writeOptions) + .getOrCreate(topicPath, publisherFactory) .publish(Objects.requireNonNull(PslSparkUtils.toPubSubMessage(inputSchema, record)))); } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index 0a1e0cb7..99d9c331 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -17,6 +17,8 @@ package com.google.cloud.pubsublite.spark; import java.io.Serializable; + +import com.google.cloud.pubsublite.TopicPath; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -26,15 +28,17 @@ public class PslDataWriterFactory implements Serializable, DataWriterFactory createDataWriter(int partitionId, long taskId, long epochId) { - return new PslDataWriter(partitionId, taskId, epochId, inputSchema, writeOptions); + return new PslDataWriter(partitionId, taskId, epochId, inputSchema, topicPath, publisherFactory); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index ddd424da..6e91feab 100644 
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -18,6 +18,7 @@ import static com.google.common.base.Preconditions.checkArgument; +import com.google.cloud.pubsublite.TopicPath; import com.google.common.flogger.GoogleLogger; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -30,11 +31,13 @@ public class PslStreamWriter implements StreamWriter { private static final GoogleLogger log = GoogleLogger.forEnclosingClass(); private final StructType inputSchema; - private final PslWriteDataSourceOptions writeOptions; + private final TopicPath topicPath; + private final PublisherFactory publisherFactory; - public PslStreamWriter(StructType schema, PslWriteDataSourceOptions writeOptions) { + public PslStreamWriter(StructType schema, TopicPath topicPath, PublisherFactory publisherFactory) { this.inputSchema = schema; - this.writeOptions = writeOptions; + this.topicPath = topicPath; + this.publisherFactory = publisherFactory; } @Override @@ -60,6 +63,6 @@ private long countMessages(WriterCommitMessage[] messages) { @Override public DataWriterFactory createWriterFactory() { - return new PslDataWriterFactory(inputSchema, writeOptions); + return new PslDataWriterFactory(inputSchema, topicPath, publisherFactory); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index e8a467a9..058c2721 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -18,10 +18,28 @@ import com.google.api.gax.rpc.ApiException; import com.google.auto.value.AutoValue; +import com.google.cloud.pubsublite.AdminClient; +import com.google.cloud.pubsublite.AdminClientSettings; +import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.TopicPath; import javax.annotation.Nullable; + +import com.google.cloud.pubsublite.internal.Publisher; +import com.google.cloud.pubsublite.internal.wire.PartitionCountWatchingPublisherSettings; +import com.google.cloud.pubsublite.internal.wire.PubsubContext; +import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; +import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; +import com.google.cloud.pubsublite.v1.AdminServiceClient; +import com.google.cloud.pubsublite.v1.AdminServiceSettings; +import com.google.cloud.pubsublite.v1.PublisherServiceClient; +import com.google.cloud.pubsublite.v1.PublisherServiceSettings; import org.apache.spark.sql.sources.v2.DataSourceOptions; +import static com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultMetadata; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultSettings; + @AutoValue public abstract class PslWriteDataSourceOptions { @@ -63,4 +81,58 @@ public static PslWriteDataSourceOptions fromSparkDataSourceOptions(DataSourceOpt public PslCredentialsProvider getCredentialProvider() { return new PslCredentialsProvider(this); } + + public PublisherFactory getPublisherFactory() { + return (topicPath) -> createPublisherInternal(this); + } + + private PublisherServiceClient newServiceClient( + PslWriteDataSourceOptions 
writeOptions, Partition partition) throws ApiException { + PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); + settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); + settingsBuilder = + addDefaultMetadata( + PubsubContext.of(Constants.FRAMEWORK), + RoutingMetadata.of(writeOptions.topicPath(), partition), + settingsBuilder); + try { + return PublisherServiceClient.create( + addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); + } catch (Throwable t) { + throw toCanonical(t).underlying; + } + } + + private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throws ApiException { + try { + return AdminClient.create( + AdminClientSettings.newBuilder() + .setServiceClient( + AdminServiceClient.create( + addDefaultSettings( + writeOptions.topicPath().location().region(), + AdminServiceSettings.newBuilder() + .setCredentialsProvider(writeOptions.getCredentialProvider())))) + .setRegion(writeOptions.topicPath().location().region()) + .build()); + } catch (Throwable t) { + throw toCanonical(t).underlying; + } + } + + private Publisher createPublisherInternal( + PslWriteDataSourceOptions writeOptions) { + return PartitionCountWatchingPublisherSettings.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPublisherFactory( + partition -> + SinglePartitionPublisherBuilder.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPartition(partition) + .setServiceClient(newServiceClient(writeOptions, partition)) + .build()) + .setAdminClient(getAdminClient(writeOptions)) + .build() + .instantiate(); + } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java new file mode 100644 index 00000000..706da684 --- /dev/null +++ b/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java @@ -0,0 +1,30 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.pubsublite.spark; + +import com.google.api.gax.rpc.ApiException; +import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.TopicPath; +import com.google.cloud.pubsublite.internal.Publisher; + +import java.io.Serializable; + +interface PublisherFactory extends Serializable { + + Publisher newPublisher( + TopicPath topicPath) throws ApiException; +} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java new file mode 100644 index 00000000..605870c9 --- /dev/null +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java @@ -0,0 +1,49 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.cloud.pubsublite.spark; + + +import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import org.junit.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ObjectInput; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; + +public class PslDataWriterFactoryTest { + + @Test + public void testSerializable() throws Exception { + PslDataWriterFactory obj = new PslDataWriterFactory( + Constants.DEFAULT_SCHEMA, + UnitTestExamples.exampleTopicPath(), + (t) -> null); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(bos); + oos.writeObject(obj); + oos.flush(); + byte[] data = bos.toByteArray(); + + PslDataWriterFactory obj2; + ByteArrayInputStream bis = new ByteArrayInputStream(data); + ObjectInput in = new ObjectInputStream(bis); + obj2 = (PslDataWriterFactory) in.readObject(); + obj2.createDataWriter(1, 1, 1); + } +} \ No newline at end of file diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 70e359a0..3b58a793 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -48,14 +48,13 @@ public class PslDataWriterTest { 2L, 3L, Constants.DEFAULT_SCHEMA, - PslWriteDataSourceOptions.builder() - .setTopicPath(UnitTestExamples.exampleTopicPath()) - .build(), + UnitTestExamples.exampleTopicPath(), + (t) -> null, cachedPublishers); @Test public void testAllSuccess() throws IOException { - when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(cachedPublishers.getOrCreate(any(), any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn( ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); @@ -67,7 +66,7 @@ public void testAllSuccess() throws IOException { @Test public void testPartialFail() { - when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(cachedPublishers.getOrCreate(any(), any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); From cc0fd4f39507d17f43cadc9d51f93644333c3e73 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Mon, 29 Mar 2021 22:36:16 -0400 Subject: [PATCH 03/16] lint --- .../pubsublite/spark/CachedPublishers.java | 10 ++- .../cloud/pubsublite/spark/PslDataSource.java | 7 ++- .../spark/PslDataWriterFactory.java | 9 +-- .../pubsublite/spark/PslStreamWriter.java | 3 +- .../spark/PslWriteDataSourceOptions.java | 63 +++++++++---------- .../pubsublite/spark/PublisherFactory.java | 4 +- .../spark/PslDataWriterFactoryTest.java | 39 ++++++------ 7 files changed, 65 insertions(+), 70 deletions(-) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java 
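Context for the CachedPublishers diff that follows (patch 03 only reflows it): the cache lets every DataWriter in one executor JVM share a single wire publisher per topic, and the ApiService listener evicts a failed publisher so a later write rebuilds it. A test-style sketch of that contract, assuming the getOrCreate(TopicPath, PublisherFactory) signature as of this patch; Mockito and Truth are already test dependencies in this repo, and the topic value is illustrative:

    package com.google.cloud.pubsublite.spark;

    import static com.google.common.truth.Truth.assertThat;
    import static org.mockito.Mockito.mock;
    import static org.mockito.Mockito.when;

    import com.google.cloud.pubsublite.MessageMetadata;
    import com.google.cloud.pubsublite.TopicPath;
    import com.google.cloud.pubsublite.internal.Publisher;
    import org.junit.Test;

    public class CachedPublishersSketchTest {

      @Test
      public void reusesPublisherForSameTopic() {
        @SuppressWarnings("unchecked")
        Publisher<MessageMetadata> wire = mock(Publisher.class);
        // Real publishers return themselves from startAsync(); mirror that so
        // getOrCreate()'s startAsync().awaitRunning() call works on the mock.
        when(wire.startAsync()).thenReturn(wire);

        TopicPath topic =
            TopicPath.parse("projects/my-project/locations/us-central1-a/topics/t");
        CachedPublishers cache = new CachedPublishers();
        PublisherFactory factory = (t) -> wire;

        // The second lookup returns the cached instance; the factory is not re-invoked.
        assertThat(cache.getOrCreate(topic, factory))
            .isSameInstanceAs(cache.getOrCreate(topic, factory));
      }
    }
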
index 0568d8e6..4058ca16 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java @@ -21,12 +21,11 @@ import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.CloseableMonitor; import com.google.cloud.pubsublite.internal.Publisher; - -import javax.annotation.concurrent.GuardedBy; import java.util.HashMap; import java.util.Map; import java.util.concurrent.Executor; import java.util.concurrent.Executors; +import javax.annotation.concurrent.GuardedBy; /** Cached {@link Publisher}s to reuse publisher of same settings in the same task. */ public class CachedPublishers { @@ -36,10 +35,10 @@ public class CachedPublishers { private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); @GuardedBy("monitor.monitor") - private static final Map<TopicPath, Publisher<MessageMetadata>> publishers = - new HashMap<>(); + private static final Map<TopicPath, Publisher<MessageMetadata>> publishers = new HashMap<>(); - public Publisher<MessageMetadata> getOrCreate(TopicPath topicPath, PublisherFactory publisherFactory) { + public Publisher<MessageMetadata> getOrCreate( + TopicPath topicPath, PublisherFactory publisherFactory) { try (CloseableMonitor.Hold h = monitor.enter()) { Publisher<MessageMetadata> publisher = publishers.get(topicPath); if (publisher != null) { @@ -62,5 +61,4 @@ public void failed(ApiService.State s, Throwable t) { return publisher; } } - } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index 5f45266d..cd3b2ba2 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -117,9 +117,10 @@ public MicroBatchReader createMicroBatchReader( public StreamWriter createStreamWriter( String queryId, StructType schema, OutputMode mode, DataSourceOptions options) { PslWriteDataSourceOptions pslWriteDataSourceOptions = - PslWriteDataSourceOptions.fromSparkDataSourceOptions(options); + PslWriteDataSourceOptions.fromSparkDataSourceOptions(options); return new PslStreamWriter( - schema, pslWriteDataSourceOptions.topicPath(), - pslWriteDataSourceOptions.getPublisherFactory()); + schema, + pslWriteDataSourceOptions.topicPath(), + pslWriteDataSourceOptions.getPublisherFactory()); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index 99d9c331..ba8e19f1 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -16,9 +16,8 @@ package com.google.cloud.pubsublite.spark; -import java.io.Serializable; - import com.google.cloud.pubsublite.TopicPath; +import java.io.Serializable; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -31,7 +30,8 @@ public class PslDataWriterFactory implements Serializable, DataWriterFactory<InternalRow> { @Override public DataWriter<InternalRow> createDataWriter(int partitionId, long taskId, long epochId) { - return new PslDataWriter(partitionId, taskId, epochId, inputSchema, topicPath, publisherFactory); + return new PslDataWriter( + partitionId, taskId, epochId, inputSchema, topicPath, publisherFactory); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index
6e91feab..323473ac 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -34,7 +34,8 @@ public class PslStreamWriter implements StreamWriter { private final TopicPath topicPath; private final PublisherFactory publisherFactory; - public PslStreamWriter(StructType schema, TopicPath topicPath, PublisherFactory publisherFactory) { + public PslStreamWriter( + StructType schema, TopicPath topicPath, PublisherFactory publisherFactory) { this.inputSchema = schema; this.topicPath = topicPath; this.publisherFactory = publisherFactory; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index 058c2721..96594711 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -16,6 +16,10 @@ package com.google.cloud.pubsublite.spark; +import static com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultMetadata; +import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultSettings; + import com.google.api.gax.rpc.ApiException; import com.google.auto.value.AutoValue; import com.google.cloud.pubsublite.AdminClient; @@ -23,8 +27,6 @@ import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.TopicPath; -import javax.annotation.Nullable; - import com.google.cloud.pubsublite.internal.Publisher; import com.google.cloud.pubsublite.internal.wire.PartitionCountWatchingPublisherSettings; import com.google.cloud.pubsublite.internal.wire.PubsubContext; @@ -34,12 +36,9 @@ import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.PublisherServiceClient; import com.google.cloud.pubsublite.v1.PublisherServiceSettings; +import javax.annotation.Nullable; import org.apache.spark.sql.sources.v2.DataSourceOptions; -import static com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical; -import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultMetadata; -import static com.google.cloud.pubsublite.internal.wire.ServiceClients.addDefaultSettings; - @AutoValue public abstract class PslWriteDataSourceOptions { @@ -87,17 +86,17 @@ public PublisherFactory getPublisherFactory() { } private PublisherServiceClient newServiceClient( - PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { + PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); settingsBuilder = - addDefaultMetadata( - PubsubContext.of(Constants.FRAMEWORK), - RoutingMetadata.of(writeOptions.topicPath(), partition), - settingsBuilder); + addDefaultMetadata( + PubsubContext.of(Constants.FRAMEWORK), + RoutingMetadata.of(writeOptions.topicPath(), partition), + settingsBuilder); try { return PublisherServiceClient.create( - addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); + addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); } catch (Throwable t) { throw toCanonical(t).underlying; } 
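The reindented hunks around here are whitespace-only; behaviorally, getPublisherFactory() still returns a Serializable lambda, and createPublisherInternal() still assembles a partition-count-watching publisher with one single-partition publisher per partition. A sketch of using that factory directly, not part of the patch and with an illustrative topic value:

    package com.google.cloud.pubsublite.spark;

    import com.google.api.core.ApiFuture;
    import com.google.cloud.pubsublite.Message;
    import com.google.cloud.pubsublite.MessageMetadata;
    import com.google.cloud.pubsublite.TopicPath;
    import com.google.cloud.pubsublite.internal.Publisher;
    import com.google.protobuf.ByteString;

    public final class PublisherFactorySketch {
      public static void main(String[] args) throws Exception {
        PslWriteDataSourceOptions options =
            PslWriteDataSourceOptions.builder()
                .setTopicPath(
                    TopicPath.parse("projects/my-project/locations/us-central1-a/topics/t"))
                .build();

        // The factory is a lambda over the options, so it serializes with the
        // Spark task; gRPC clients are only built when newPublisher() runs.
        Publisher<MessageMetadata> publisher =
            options.getPublisherFactory().newPublisher(options.topicPath());
        publisher.startAsync().awaitRunning();

        ApiFuture<MessageMetadata> result =
            publisher.publish(
                Message.builder().setData(ByteString.copyFromUtf8("hi")).build());
        System.out.println(result.get()); // partition and offset assigned by the service
      }
    }

In the connector itself this happens inside CachedPublishers on the executor; running the sketch standalone would need real credentials and an existing topic.
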
@@ -106,15 +105,15 @@ private PublisherServiceClient newServiceClient( private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throws ApiException { try { return AdminClient.create( - AdminClientSettings.newBuilder() - .setServiceClient( - AdminServiceClient.create( - addDefaultSettings( - writeOptions.topicPath().location().region(), - AdminServiceSettings.newBuilder() - .setCredentialsProvider(writeOptions.getCredentialProvider())))) - .setRegion(writeOptions.topicPath().location().region()) - .build()); + AdminClientSettings.newBuilder() + .setServiceClient( + AdminServiceClient.create( + addDefaultSettings( + writeOptions.topicPath().location().region(), + AdminServiceSettings.newBuilder() + .setCredentialsProvider(writeOptions.getCredentialProvider())))) + .setRegion(writeOptions.topicPath().location().region()) + .build()); } catch (Throwable t) { throw toCanonical(t).underlying; } @@ -123,16 +122,16 @@ private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throw private Publisher createPublisherInternal( PslWriteDataSourceOptions writeOptions) { return PartitionCountWatchingPublisherSettings.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPublisherFactory( - partition -> - SinglePartitionPublisherBuilder.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPartition(partition) - .setServiceClient(newServiceClient(writeOptions, partition)) - .build()) - .setAdminClient(getAdminClient(writeOptions)) - .build() - .instantiate(); + .setTopic(writeOptions.topicPath()) + .setPublisherFactory( + partition -> + SinglePartitionPublisherBuilder.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPartition(partition) + .setServiceClient(newServiceClient(writeOptions, partition)) + .build()) + .setAdminClient(getAdminClient(writeOptions)) + .build() + .instantiate(); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java index 706da684..8c3a586a 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java @@ -20,11 +20,9 @@ import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.Publisher; - import java.io.Serializable; interface PublisherFactory extends Serializable { - Publisher newPublisher( - TopicPath topicPath) throws ApiException; + Publisher newPublisher(TopicPath topicPath) throws ApiException; } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java index 605870c9..79aec022 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java @@ -16,34 +16,31 @@ package com.google.cloud.pubsublite.spark; - import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; -import org.junit.Test; - import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.ObjectInput; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; +import org.junit.Test; public class PslDataWriterFactoryTest { - @Test - public void testSerializable() throws Exception { - PslDataWriterFactory obj = new PslDataWriterFactory( - Constants.DEFAULT_SCHEMA, - UnitTestExamples.exampleTopicPath(), - (t) -> null); - 
ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(bos); - oos.writeObject(obj); - oos.flush(); - byte[] data = bos.toByteArray(); + @Test + public void testSerializable() throws Exception { + PslDataWriterFactory obj = + new PslDataWriterFactory( + Constants.DEFAULT_SCHEMA, UnitTestExamples.exampleTopicPath(), (t) -> null); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(bos); + oos.writeObject(obj); + oos.flush(); + byte[] data = bos.toByteArray(); - PslDataWriterFactory obj2; - ByteArrayInputStream bis = new ByteArrayInputStream(data); - ObjectInput in = new ObjectInputStream(bis); - obj2 = (PslDataWriterFactory) in.readObject(); - obj2.createDataWriter(1, 1, 1); - } -} \ No newline at end of file + PslDataWriterFactory obj2; + ByteArrayInputStream bis = new ByteArrayInputStream(data); + ObjectInput in = new ObjectInputStream(bis); + obj2 = (PslDataWriterFactory) in.readObject(); + obj2.createDataWriter(1, 1, 1); + } +} From ae221ffd57efb5127fecba822bd32c1eb738c6b5 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Mon, 29 Mar 2021 22:52:46 -0400 Subject: [PATCH 04/16] update --- .../spark/PslWriteDataSourceOptions.java | 3 +- .../spark/PslDataWriterFactoryTest.java | 46 ------------------- .../spark/PslWriteDataSourceOptionsTest.java | 26 +++++++++++ 3 files changed, 28 insertions(+), 47 deletions(-) delete mode 100644 src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index 96594711..45214dae 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -36,11 +36,12 @@ import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.PublisherServiceClient; import com.google.cloud.pubsublite.v1.PublisherServiceSettings; +import java.io.Serializable; import javax.annotation.Nullable; import org.apache.spark.sql.sources.v2.DataSourceOptions; @AutoValue -public abstract class PslWriteDataSourceOptions { +public abstract class PslWriteDataSourceOptions implements Serializable { @Nullable public abstract String credentialsKey(); diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java deleted file mode 100644 index 79aec022..00000000 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterFactoryTest.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright 2020 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.google.cloud.pubsublite.spark; - -import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.ObjectInput; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; -import org.junit.Test; - -public class PslDataWriterFactoryTest { - - @Test - public void testSerializable() throws Exception { - PslDataWriterFactory obj = - new PslDataWriterFactory( - Constants.DEFAULT_SCHEMA, UnitTestExamples.exampleTopicPath(), (t) -> null); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(bos); - oos.writeObject(obj); - oos.flush(); - byte[] data = bos.toByteArray(); - - PslDataWriterFactory obj2; - ByteArrayInputStream bis = new ByteArrayInputStream(data); - ObjectInput in = new ObjectInputStream(bis); - obj2 = (PslDataWriterFactory) in.readObject(); - obj2.createDataWriter(1, 1, 1); - } -} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java index 5cf10f50..16b1a044 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -18,7 +18,13 @@ import static org.junit.Assert.assertThrows; +import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; import com.google.common.collect.ImmutableMap; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ObjectInput; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.junit.Test; @@ -32,4 +38,24 @@ public void testInvalidTopicPath() { IllegalArgumentException.class, () -> PslWriteDataSourceOptions.fromSparkDataSourceOptions(options)); } + + @Test + public void testPublisherFactorySerializable() throws Exception { + PslWriteDataSourceOptions options = + PslWriteDataSourceOptions.builder() + .setTopicPath(UnitTestExamples.exampleTopicPath()) + .build(); + PublisherFactory obj = options.getPublisherFactory(); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + ObjectOutputStream oos = new ObjectOutputStream(bos); + oos.writeObject(obj); + oos.flush(); + byte[] data = bos.toByteArray(); + + PublisherFactory obj2; + ByteArrayInputStream bis = new ByteArrayInputStream(data); + ObjectInput in = new ObjectInputStream(bis); + obj2 = (PublisherFactory) in.readObject(); + obj2.newPublisher(UnitTestExamples.exampleTopicPath()); + } } From b713dc19ea14050d8370e7dd1fe768c09bb91dcb Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 15:20:47 -0400 Subject: [PATCH 05/16] update --- .../cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java index 16b1a044..4cb21c30 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -56,6 +56,5 @@ public void testPublisherFactorySerializable() throws Exception { ByteArrayInputStream bis = new ByteArrayInputStream(data); ObjectInput in = new ObjectInputStream(bis); obj2 = (PublisherFactory) 
in.readObject(); - obj2.newPublisher(UnitTestExamples.exampleTopicPath()); } } From ebf8df5a921c859ecc0732d100ff5b4c524cf36d Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 17:32:26 -0400 Subject: [PATCH 06/16] restructure --- .../google/cloud/pubsublite/spark/PslContinuousReader.java | 3 +++ .../com/google/cloud/pubsublite/spark/PslDataSource.java | 4 ++++ .../com/google/cloud/pubsublite/spark/PslDataWriter.java | 2 ++ .../google/cloud/pubsublite/spark/PslDataWriterFactory.java | 2 ++ .../google/cloud/pubsublite/spark/PslMicroBatchReader.java | 4 ++++ .../cloud/pubsublite/spark/PslReadDataSourceOptions.java | 3 +++ .../com/google/cloud/pubsublite/spark/PslStreamWriter.java | 1 + .../cloud/pubsublite/spark/PslWriteDataSourceOptions.java | 1 + .../spark/{ => internal}/CachedPartitionCountReader.java | 2 +- .../pubsublite/spark/{ => internal}/CachedPublishers.java | 3 ++- .../spark/{ => internal}/LimitingHeadOffsetReader.java | 3 ++- .../spark/{ => internal}/MultiPartitionCommitter.java | 4 +++- .../spark/{ => internal}/MultiPartitionCommitterImpl.java | 3 ++- .../spark/{ => internal}/PartitionCountReader.java | 2 +- .../spark/{ => internal}/PartitionSubscriberFactory.java | 2 +- .../spark/{ => internal}/PerTopicHeadOffsetReader.java | 4 +++- .../pubsublite/spark/{ => internal}/PublisherFactory.java | 4 ++-- .../cloud/pubsublite/spark/PslContinuousReaderTest.java | 3 +++ .../com/google/cloud/pubsublite/spark/PslDataWriterTest.java | 2 ++ .../cloud/pubsublite/spark/PslMicroBatchReaderTest.java | 3 +++ .../pubsublite/spark/PslWriteDataSourceOptionsTest.java | 1 + .../spark/{ => internal}/LimitingHeadOffsetReaderTest.java | 4 +++- .../{ => internal}/MultiPartitionCommitterImplTest.java | 5 ++++- 23 files changed, 53 insertions(+), 12 deletions(-) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/CachedPartitionCountReader.java (96%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/CachedPublishers.java (97%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/LimitingHeadOffsetReader.java (97%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/MultiPartitionCommitter.java (89%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/MultiPartitionCommitterImpl.java (97%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/PartitionCountReader.java (93%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/PartitionSubscriberFactory.java (95%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/PerTopicHeadOffsetReader.java (88%) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/PublisherFactory.java (89%) rename src/test/java/com/google/cloud/pubsublite/spark/{ => internal}/LimitingHeadOffsetReaderTest.java (95%) rename src/test/java/com/google/cloud/pubsublite/spark/{ => internal}/MultiPartitionCommitterImplTest.java (96%) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslContinuousReader.java b/src/main/java/com/google/cloud/pubsublite/spark/PslContinuousReader.java index 65953031..ad2ca3da 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslContinuousReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslContinuousReader.java @@ -22,6 +22,9 @@ import com.google.cloud.pubsublite.cloudpubsub.FlowControlSettings; import com.google.cloud.pubsublite.internal.CursorClient; import com.google.cloud.pubsublite.internal.wire.SubscriberFactory; +import 
com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.PartitionCountReader; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; import com.google.common.annotations.VisibleForTesting; import java.util.ArrayList; import java.util.Arrays; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index cd3b2ba2..54dab51d 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -25,6 +25,10 @@ import com.google.cloud.pubsublite.TopicPath; import java.util.Objects; import java.util.Optional; + +import com.google.cloud.pubsublite.spark.internal.CachedPartitionCountReader; +import com.google.cloud.pubsublite.spark.internal.LimitingHeadOffsetReader; +import com.google.cloud.pubsublite.spark.internal.PartitionCountReader; import org.apache.spark.sql.sources.DataSourceRegister; import org.apache.spark.sql.sources.v2.ContinuousReadSupport; import org.apache.spark.sql.sources.v2.DataSourceOptions; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java index b82eb12e..422a9b70 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -19,6 +19,8 @@ import com.google.api.core.ApiFuture; import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.TopicPath; +import com.google.cloud.pubsublite.spark.internal.CachedPublishers; +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.annotations.VisibleForTesting; import com.google.common.flogger.GoogleLogger; import java.io.IOException; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index ba8e19f1..96e2d25a 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -18,6 +18,8 @@ import com.google.cloud.pubsublite.TopicPath; import java.io.Serializable; + +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java b/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java index b2a346c0..56c457c5 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java @@ -28,6 +28,10 @@ import java.util.List; import java.util.Optional; import javax.annotation.Nullable; + +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; +import com.google.cloud.pubsublite.spark.internal.PerTopicHeadOffsetReader; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.reader.InputPartition; import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader; diff --git 
a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java index f9d6c990..5f2a6d18 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java @@ -33,6 +33,9 @@ import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; import com.google.cloud.pubsublite.internal.wire.ServiceClients; import com.google.cloud.pubsublite.internal.wire.SubscriberBuilder; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitterImpl; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; import com.google.cloud.pubsublite.v1.AdminServiceClient; import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.CursorServiceClient; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index 323473ac..182b0322 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -19,6 +19,7 @@ import static com.google.common.base.Preconditions.checkArgument; import com.google.cloud.pubsublite.TopicPath; +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.flogger.GoogleLogger; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index 45214dae..10145de6 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -32,6 +32,7 @@ import com.google.cloud.pubsublite.internal.wire.PubsubContext; import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.cloud.pubsublite.v1.AdminServiceClient; import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.PublisherServiceClient; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/CachedPartitionCountReader.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPartitionCountReader.java similarity index 96% rename from src/main/java/com/google/cloud/pubsublite/spark/CachedPartitionCountReader.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPartitionCountReader.java index 35555805..a144d253 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/CachedPartitionCountReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPartitionCountReader.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.cloud.pubsublite.AdminClient; import com.google.cloud.pubsublite.PartitionLookupUtils; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java similarity index 97% rename from src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java index 4058ca16..0f14a591 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java @@ -14,13 +14,14 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.api.core.ApiService; import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.CloseableMonitor; import com.google.cloud.pubsublite.internal.Publisher; + import java.util.HashMap; import java.util.Map; import java.util.concurrent.Executor; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReader.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReader.java similarity index 97% rename from src/main/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReader.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReader.java index 7bad0ffc..a974ba23 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReader.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.github.benmanes.caffeine.cache.AsyncLoadingCache; import com.github.benmanes.caffeine.cache.Caffeine; @@ -26,6 +26,7 @@ import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.TopicStatsClient; import com.google.cloud.pubsublite.proto.Cursor; +import com.google.cloud.pubsublite.spark.PslSourceOffset; import com.google.common.annotations.VisibleForTesting; import com.google.common.flogger.GoogleLogger; import com.google.common.util.concurrent.MoreExecutors; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitter.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java similarity index 89% rename from src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitter.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java index d42f33ca..0c01cc4a 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java @@ -14,10 +14,12 @@ * limitations under the License. 
*/ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.wire.Committer; +import com.google.cloud.pubsublite.spark.PslSourceOffset; + import java.io.Closeable; public interface MultiPartitionCommitter extends Closeable { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImpl.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImpl.java similarity index 97% rename from src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImpl.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImpl.java index 7ebec891..4c221f1d 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImpl.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImpl.java @@ -14,13 +14,14 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.api.core.ApiFuture; import com.google.api.core.ApiFutureCallback; import com.google.api.core.ApiFutures; import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.wire.Committer; +import com.google.cloud.pubsublite.spark.PslSourceOffset; import com.google.common.annotations.VisibleForTesting; import com.google.common.flogger.GoogleLogger; import com.google.common.util.concurrent.MoreExecutors; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PartitionCountReader.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionCountReader.java similarity index 93% rename from src/main/java/com/google/cloud/pubsublite/spark/PartitionCountReader.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionCountReader.java index 934d40be..90991835 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PartitionCountReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionCountReader.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import java.io.Closeable; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PartitionSubscriberFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionSubscriberFactory.java similarity index 95% rename from src/main/java/com/google/cloud/pubsublite/spark/PartitionSubscriberFactory.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionSubscriberFactory.java index 9ea51670..d7a16257 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PartitionSubscriberFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PartitionSubscriberFactory.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.api.gax.rpc.ApiException; import com.google.cloud.pubsublite.Partition; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PerTopicHeadOffsetReader.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java similarity index 88% rename from src/main/java/com/google/cloud/pubsublite/spark/PerTopicHeadOffsetReader.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java index 21e0bc63..fc60cf71 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PerTopicHeadOffsetReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java @@ -14,7 +14,9 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; + +import com.google.cloud.pubsublite.spark.PslSourceOffset; import java.io.Closeable; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java similarity index 89% rename from src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java index 8c3a586a..ba2e5ee4 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PublisherFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.api.gax.rpc.ApiException; import com.google.cloud.pubsublite.MessageMetadata; @@ -22,7 +22,7 @@ import com.google.cloud.pubsublite.internal.Publisher; import java.io.Serializable; -interface PublisherFactory extends Serializable { +public interface PublisherFactory extends Serializable { Publisher newPublisher(TopicPath topicPath) throws ApiException; } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java index 3cfde250..d5cbc30e 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslContinuousReaderTest.java @@ -24,6 +24,9 @@ import com.google.cloud.pubsublite.*; import com.google.cloud.pubsublite.internal.CursorClient; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.PartitionCountReader; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; import com.google.common.collect.ImmutableMap; import java.util.Optional; import org.junit.Test; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 3b58a793..dfa603eb 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -30,6 +30,8 @@ import com.google.cloud.pubsublite.internal.Publisher; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; import java.io.IOException; + +import 
com.google.cloud.pubsublite.spark.internal.CachedPublishers; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.DataType; import org.junit.Test; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java index 4b1f6cac..23bee103 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslMicroBatchReaderTest.java @@ -28,6 +28,9 @@ import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.CursorClient; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; +import com.google.cloud.pubsublite.spark.internal.PerTopicHeadOffsetReader; import com.google.common.collect.ImmutableMap; import java.util.Optional; import org.junit.Test; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java index 4cb21c30..b4ed0344 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -19,6 +19,7 @@ import static org.junit.Assert.assertThrows; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.collect.ImmutableMap; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java similarity index 95% rename from src/test/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReaderTest.java rename to src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java index dcc3025a..1a4f6b7b 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/LimitingHeadOffsetReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import static com.google.common.truth.Truth.assertThat; import static org.mockito.ArgumentMatchers.any; @@ -30,6 +30,8 @@ import com.google.cloud.pubsublite.internal.TopicStatsClient; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; import com.google.cloud.pubsublite.proto.Cursor; +import com.google.cloud.pubsublite.spark.internal.LimitingHeadOffsetReader; +import com.google.cloud.pubsublite.spark.internal.PartitionCountReader; import com.google.common.testing.FakeTicker; import java.util.concurrent.TimeUnit; import org.junit.Test; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImplTest.java b/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java similarity index 96% rename from src/test/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImplTest.java rename to src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java index 65b4675a..5dcea0f0 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/MultiPartitionCommitterImplTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import static com.google.cloud.pubsublite.spark.TestingUtils.createPslSourceOffset; import static org.mockito.ArgumentMatchers.eq; @@ -27,6 +27,9 @@ import java.util.List; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; + +import com.google.cloud.pubsublite.spark.PslSourceOffset; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitterImpl; import org.junit.Test; import org.mockito.ArgumentCaptor; From 9a66d328b69d98899c0ef1f0923780de27392c7f Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 18:10:05 -0400 Subject: [PATCH 07/16] use writeOptions --- .../cloud/pubsublite/spark/PslDataSource.java | 4 +-- .../cloud/pubsublite/spark/PslDataWriter.java | 16 ++++------ .../spark/PslDataWriterFactory.java | 12 +++----- .../spark/PslReadDataSourceOptions.java | 1 + .../pubsublite/spark/PslStreamWriter.java | 13 ++++----- .../spark/PslWriteDataSourceOptions.java | 10 ++++--- .../spark/internal/CachedPublishers.java | 29 +++++++++---------- .../PslCredentialsProvider.java | 5 +++- .../spark/internal/PublisherFactory.java | 5 ++-- .../pubsublite/spark/PslDataWriterTest.java | 10 ++++--- 10 files changed, 49 insertions(+), 56 deletions(-) rename src/main/java/com/google/cloud/pubsublite/spark/{ => internal}/PslCredentialsProvider.java (92%) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index 54dab51d..1f187388 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -123,8 +123,6 @@ public StreamWriter createStreamWriter( PslWriteDataSourceOptions pslWriteDataSourceOptions = PslWriteDataSourceOptions.fromSparkDataSourceOptions(options); return new PslStreamWriter( - schema, - pslWriteDataSourceOptions.topicPath(), - pslWriteDataSourceOptions.getPublisherFactory()); + schema, pslWriteDataSourceOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java 
b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java index 422a9b70..9dbaae39 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -42,8 +42,7 @@ public class PslDataWriter implements DataWriter { private final long partitionId, taskId, epochId; private final StructType inputSchema; - private final TopicPath topicPath; - private final PublisherFactory publisherFactory; + private final PslWriteDataSourceOptions writeOptions; private final CachedPublishers cachedPublishers; // just a reference @GuardedBy("this") @@ -54,9 +53,8 @@ public PslDataWriter( long taskId, long epochId, StructType schema, - TopicPath topicPath, - PublisherFactory publisherFactory) { - this(partitionId, taskId, epochId, schema, topicPath, publisherFactory, CACHED_PUBLISHERS); + PslWriteDataSourceOptions writeOptions) { + this(partitionId, taskId, epochId, schema, writeOptions, CACHED_PUBLISHERS); } @VisibleForTesting @@ -65,15 +63,13 @@ public PslDataWriter( long taskId, long epochId, StructType schema, - TopicPath topicPath, - PublisherFactory publisherFactory, + PslWriteDataSourceOptions writeOptions, CachedPublishers cachedPublishers) { this.partitionId = partitionId; this.taskId = taskId; this.epochId = epochId; this.inputSchema = schema; - this.topicPath = topicPath; - this.publisherFactory = publisherFactory; + this.writeOptions = writeOptions; this.cachedPublishers = cachedPublishers; } @@ -81,7 +77,7 @@ public PslDataWriter( public synchronized void write(InternalRow record) { futures.add( cachedPublishers - .getOrCreate(topicPath, publisherFactory) + .getOrCreate(writeOptions) .publish(Objects.requireNonNull(PslSparkUtils.toPubSubMessage(inputSchema, record)))); } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index 96e2d25a..f8bd955f 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -16,10 +16,8 @@ package com.google.cloud.pubsublite.spark; -import com.google.cloud.pubsublite.TopicPath; import java.io.Serializable; -import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -29,19 +27,17 @@ public class PslDataWriterFactory implements Serializable, DataWriterFactory createDataWriter(int partitionId, long taskId, long epochId) { return new PslDataWriter( - partitionId, taskId, epochId, inputSchema, topicPath, publisherFactory); + partitionId, taskId, epochId, inputSchema, writeOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java index 5f2a6d18..3e112ae6 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java @@ -36,6 +36,7 @@ import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitterImpl; import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; +import com.google.cloud.pubsublite.spark.internal.PslCredentialsProvider; import 
com.google.cloud.pubsublite.v1.AdminServiceClient; import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.CursorServiceClient; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index 182b0322..481fc1b1 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -32,14 +32,11 @@ public class PslStreamWriter implements StreamWriter { private static final GoogleLogger log = GoogleLogger.forEnclosingClass(); private final StructType inputSchema; - private final TopicPath topicPath; - private final PublisherFactory publisherFactory; + private final PslWriteDataSourceOptions writeOptions; - public PslStreamWriter( - StructType schema, TopicPath topicPath, PublisherFactory publisherFactory) { - this.inputSchema = schema; - this.topicPath = topicPath; - this.publisherFactory = publisherFactory; + public PslStreamWriter(StructType inputSchema, PslWriteDataSourceOptions writeOptions) { + this.inputSchema = inputSchema; + this.writeOptions = writeOptions; } @Override @@ -65,6 +62,6 @@ private long countMessages(WriterCommitMessage[] messages) { @Override public DataWriterFactory createWriterFactory() { - return new PslDataWriterFactory(inputSchema, topicPath, publisherFactory); + return new PslDataWriterFactory(inputSchema, writeOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index 10145de6..519c5e0f 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -32,6 +32,7 @@ import com.google.cloud.pubsublite.internal.wire.PubsubContext; import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; +import com.google.cloud.pubsublite.spark.internal.PslCredentialsProvider; import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.cloud.pubsublite.v1.AdminServiceClient; import com.google.cloud.pubsublite.v1.AdminServiceSettings; @@ -84,10 +85,10 @@ public PslCredentialsProvider getCredentialProvider() { } public PublisherFactory getPublisherFactory() { - return (topicPath) -> createPublisherInternal(this); + return PslWriteDataSourceOptions::createPublisherInternal; } - private PublisherServiceClient newServiceClient( + private static PublisherServiceClient newServiceClient( PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); @@ -104,7 +105,8 @@ private PublisherServiceClient newServiceClient( } } - private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throws ApiException { + private static AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) + throws ApiException { try { return AdminClient.create( AdminClientSettings.newBuilder() @@ -121,7 +123,7 @@ private AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) throw } } - private Publisher createPublisherInternal( + private static Publisher createPublisherInternal( 
PslWriteDataSourceOptions writeOptions) { return PartitionCountWatchingPublisherSettings.newBuilder() .setTopic(writeOptions.topicPath()) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java index 0f14a591..48116c5a 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java @@ -18,9 +18,8 @@ import com.google.api.core.ApiService; import com.google.cloud.pubsublite.MessageMetadata; -import com.google.cloud.pubsublite.TopicPath; -import com.google.cloud.pubsublite.internal.CloseableMonitor; import com.google.cloud.pubsublite.internal.Publisher; +import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; import java.util.HashMap; import java.util.Map; @@ -31,35 +30,33 @@ /** Cached {@link Publisher}s to reuse publisher of same settings in the same task. */ public class CachedPublishers { - private final CloseableMonitor monitor = new CloseableMonitor(); - private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); - @GuardedBy("monitor.monitor") - private static final Map> publishers = new HashMap<>(); + @GuardedBy("this") + private static final Map> publishers = new HashMap<>(); - public Publisher getOrCreate( - TopicPath topicPath, PublisherFactory publisherFactory) { - try (CloseableMonitor.Hold h = monitor.enter()) { - Publisher publisher = publishers.get(topicPath); + public synchronized Publisher getOrCreate( + PslWriteDataSourceOptions writeOptions) { + Publisher publisher = publishers.get(writeOptions); if (publisher != null) { return publisher; } - publisher = publisherFactory.newPublisher(topicPath); - publishers.put(topicPath, publisher); + publisher = writeOptions.getPublisherFactory().newPublisher(writeOptions); + publishers.put(writeOptions, publisher); publisher.addListener( new ApiService.Listener() { @Override public void failed(ApiService.State s, Throwable t) { - try (CloseableMonitor.Hold h = monitor.enter()) { - publishers.remove(topicPath); - } + removePublisher(writeOptions); } }, listenerExecutor); publisher.startAsync().awaitRunning(); return publisher; - } + } + + private synchronized void removePublisher(PslWriteDataSourceOptions writeOptions) { + publishers.remove(writeOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java similarity index 92% rename from src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java rename to src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java index 53eac0a2..a445eadd 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslCredentialsProvider.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java @@ -14,12 +14,15 @@ * limitations under the License. 
*/ -package com.google.cloud.pubsublite.spark; +package com.google.cloud.pubsublite.spark.internal; import com.google.api.client.util.Base64; import com.google.api.gax.core.CredentialsProvider; import com.google.auth.Credentials; import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.pubsublite.spark.PslReadDataSourceOptions; +import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UncheckedIOException; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java index ba2e5ee4..ca34055c 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java @@ -18,11 +18,12 @@ import com.google.api.gax.rpc.ApiException; import com.google.cloud.pubsublite.MessageMetadata; -import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.internal.Publisher; +import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; + import java.io.Serializable; public interface PublisherFactory extends Serializable { - Publisher newPublisher(TopicPath topicPath) throws ApiException; + Publisher newPublisher(PslWriteDataSourceOptions writeOptions) throws ApiException; } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index dfa603eb..820c35f9 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -40,6 +40,9 @@ public class PslDataWriterTest { private final InternalRow row = mock(InternalRow.class); + private final PslWriteDataSourceOptions writeOptions = PslWriteDataSourceOptions.builder() + .setTopicPath(UnitTestExamples.exampleTopicPath()) + .build(); @SuppressWarnings("unchecked") private final Publisher publisher = mock(Publisher.class); @@ -50,13 +53,12 @@ public class PslDataWriterTest { 2L, 3L, Constants.DEFAULT_SCHEMA, - UnitTestExamples.exampleTopicPath(), - (t) -> null, + writeOptions, cachedPublishers); @Test public void testAllSuccess() throws IOException { - when(cachedPublishers.getOrCreate(any(), any())).thenReturn(publisher); + when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn( ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); @@ -68,7 +70,7 @@ public void testAllSuccess() throws IOException { @Test public void testPartialFail() { - when(cachedPublishers.getOrCreate(any(), any())).thenReturn(publisher); + when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); From 8f2027bcf09350b1a8f44296dacb0a9945e868f0 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 18:54:15 -0400 Subject: [PATCH 08/16] update --- .../cloud/pubsublite/spark/PslDataSource.java | 8 ++-- .../cloud/pubsublite/spark/PslDataWriter.java | 2 - .../spark/PslDataWriterFactory.java | 7 +--- .../pubsublite/spark/PslMicroBatchReader.java | 7 ++-- .../pubsublite/spark/PslStreamWriter.java | 2 - .../spark/PslWriteDataSourceOptions.java | 35 ++++++++--------- 
.../spark/internal/CachedPublishers.java | 38 +++++++++---------- .../internal/MultiPartitionCommitter.java | 1 - .../internal/PerTopicHeadOffsetReader.java | 1 - .../internal/PslCredentialsProvider.java | 1 - .../spark/internal/PublisherFactory.java | 29 -------------- .../pubsublite/spark/PslDataWriterTest.java | 17 +++------ .../spark/PslWriteDataSourceOptionsTest.java | 1 - .../LimitingHeadOffsetReaderTest.java | 2 - .../MultiPartitionCommitterImplTest.java | 4 +- 15 files changed, 48 insertions(+), 107 deletions(-) delete mode 100644 src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java index 1f187388..de4142e6 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java @@ -23,12 +23,11 @@ import com.google.cloud.pubsublite.AdminClient; import com.google.cloud.pubsublite.SubscriptionPath; import com.google.cloud.pubsublite.TopicPath; -import java.util.Objects; -import java.util.Optional; - import com.google.cloud.pubsublite.spark.internal.CachedPartitionCountReader; import com.google.cloud.pubsublite.spark.internal.LimitingHeadOffsetReader; import com.google.cloud.pubsublite.spark.internal.PartitionCountReader; +import java.util.Objects; +import java.util.Optional; import org.apache.spark.sql.sources.DataSourceRegister; import org.apache.spark.sql.sources.v2.ContinuousReadSupport; import org.apache.spark.sql.sources.v2.DataSourceOptions; @@ -122,7 +121,6 @@ public StreamWriter createStreamWriter( String queryId, StructType schema, OutputMode mode, DataSourceOptions options) { PslWriteDataSourceOptions pslWriteDataSourceOptions = PslWriteDataSourceOptions.fromSparkDataSourceOptions(options); - return new PslStreamWriter( - schema, pslWriteDataSourceOptions); + return new PslStreamWriter(schema, pslWriteDataSourceOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java index 9dbaae39..dc2c4e12 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -18,9 +18,7 @@ import com.google.api.core.ApiFuture; import com.google.cloud.pubsublite.MessageMetadata; -import com.google.cloud.pubsublite.TopicPath; import com.google.cloud.pubsublite.spark.internal.CachedPublishers; -import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.annotations.VisibleForTesting; import com.google.common.flogger.GoogleLogger; import java.io.IOException; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index f8bd955f..0a1e0cb7 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -17,7 +17,6 @@ package com.google.cloud.pubsublite.spark; import java.io.Serializable; - import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriter; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -29,15 +28,13 @@ public class PslDataWriterFactory implements Serializable, DataWriterFactory createDataWriter(int partitionId, long taskId, long epochId) 
{ - return new PslDataWriter( - partitionId, taskId, epochId, inputSchema, writeOptions); + return new PslDataWriter(partitionId, taskId, epochId, inputSchema, writeOptions); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java b/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java index 56c457c5..a0f0dfee 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslMicroBatchReader.java @@ -24,14 +24,13 @@ import com.google.cloud.pubsublite.cloudpubsub.FlowControlSettings; import com.google.cloud.pubsublite.internal.CursorClient; import com.google.cloud.pubsublite.internal.wire.SubscriberFactory; +import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; +import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; +import com.google.cloud.pubsublite.spark.internal.PerTopicHeadOffsetReader; import java.util.ArrayList; import java.util.List; import java.util.Optional; import javax.annotation.Nullable; - -import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitter; -import com.google.cloud.pubsublite.spark.internal.PartitionSubscriberFactory; -import com.google.cloud.pubsublite.spark.internal.PerTopicHeadOffsetReader; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.reader.InputPartition; import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchReader; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index 481fc1b1..557a2993 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -18,8 +18,6 @@ import static com.google.common.base.Preconditions.checkArgument; -import com.google.cloud.pubsublite.TopicPath; -import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.flogger.GoogleLogger; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index 519c5e0f..eb7334c6 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -33,7 +33,6 @@ import com.google.cloud.pubsublite.internal.wire.RoutingMetadata; import com.google.cloud.pubsublite.internal.wire.SinglePartitionPublisherBuilder; import com.google.cloud.pubsublite.spark.internal.PslCredentialsProvider; -import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.cloud.pubsublite.v1.AdminServiceClient; import com.google.cloud.pubsublite.v1.AdminServiceSettings; import com.google.cloud.pubsublite.v1.PublisherServiceClient; @@ -84,8 +83,20 @@ public PslCredentialsProvider getCredentialProvider() { return new PslCredentialsProvider(this); } - public PublisherFactory getPublisherFactory() { - return PslWriteDataSourceOptions::createPublisherInternal; + public static Publisher createNewPublisher( + PslWriteDataSourceOptions writeOptions) { + return PartitionCountWatchingPublisherSettings.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPublisherFactory( + partition -> + 
SinglePartitionPublisherBuilder.newBuilder() + .setTopic(writeOptions.topicPath()) + .setPartition(partition) + .setServiceClient(newServiceClient(writeOptions, partition)) + .build()) + .setAdminClient(getAdminClient(writeOptions)) + .build() + .instantiate(); } private static PublisherServiceClient newServiceClient( @@ -106,7 +117,7 @@ private static PublisherServiceClient newServiceClient( } private static AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) - throws ApiException { + throws ApiException { try { return AdminClient.create( AdminClientSettings.newBuilder() @@ -122,20 +133,4 @@ private static AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions throw toCanonical(t).underlying; } } - - private static Publisher createPublisherInternal( - PslWriteDataSourceOptions writeOptions) { - return PartitionCountWatchingPublisherSettings.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPublisherFactory( - partition -> - SinglePartitionPublisherBuilder.newBuilder() - .setTopic(writeOptions.topicPath()) - .setPartition(partition) - .setServiceClient(newServiceClient(writeOptions, partition)) - .build()) - .setAdminClient(getAdminClient(writeOptions)) - .build() - .instantiate(); - } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java index 48116c5a..9de8ca29 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java @@ -20,7 +20,6 @@ import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.internal.Publisher; import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; - import java.util.HashMap; import java.util.Map; import java.util.concurrent.Executor; @@ -33,27 +32,28 @@ public class CachedPublishers { private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); @GuardedBy("this") - private static final Map> publishers = new HashMap<>(); + private static final Map> publishers = + new HashMap<>(); public synchronized Publisher getOrCreate( - PslWriteDataSourceOptions writeOptions) { - Publisher publisher = publishers.get(writeOptions); - if (publisher != null) { - return publisher; - } - - publisher = writeOptions.getPublisherFactory().newPublisher(writeOptions); - publishers.put(writeOptions, publisher); - publisher.addListener( - new ApiService.Listener() { - @Override - public void failed(ApiService.State s, Throwable t) { - removePublisher(writeOptions); - } - }, - listenerExecutor); - publisher.startAsync().awaitRunning(); + PslWriteDataSourceOptions writeOptions) { + Publisher publisher = publishers.get(writeOptions); + if (publisher != null) { return publisher; + } + + publisher = PslWriteDataSourceOptions.createNewPublisher(writeOptions); + publishers.put(writeOptions, publisher); + publisher.addListener( + new ApiService.Listener() { + @Override + public void failed(ApiService.State s, Throwable t) { + removePublisher(writeOptions); + } + }, + listenerExecutor); + publisher.startAsync().awaitRunning(); + return publisher; } private synchronized void removePublisher(PslWriteDataSourceOptions writeOptions) { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java index 0c01cc4a..bf6441e8 100644 --- 
a/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitter.java @@ -19,7 +19,6 @@ import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.wire.Committer; import com.google.cloud.pubsublite.spark.PslSourceOffset; - import java.io.Closeable; public interface MultiPartitionCommitter extends Closeable { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java index fc60cf71..9ccd72c5 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PerTopicHeadOffsetReader.java @@ -17,7 +17,6 @@ package com.google.cloud.pubsublite.spark.internal; import com.google.cloud.pubsublite.spark.PslSourceOffset; - import java.io.Closeable; public interface PerTopicHeadOffsetReader extends Closeable { diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java index a445eadd..e730750c 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java @@ -22,7 +22,6 @@ import com.google.auth.oauth2.GoogleCredentials; import com.google.cloud.pubsublite.spark.PslReadDataSourceOptions; import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; - import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UncheckedIOException; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java deleted file mode 100644 index ca34055c..00000000 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2020 Google LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package com.google.cloud.pubsublite.spark.internal; - -import com.google.api.gax.rpc.ApiException; -import com.google.cloud.pubsublite.MessageMetadata; -import com.google.cloud.pubsublite.internal.Publisher; -import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; - -import java.io.Serializable; - -public interface PublisherFactory extends Serializable { - - Publisher newPublisher(PslWriteDataSourceOptions writeOptions) throws ApiException; -} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 820c35f9..7ec5efee 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -29,9 +29,8 @@ import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.Publisher; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; -import java.io.IOException; - import com.google.cloud.pubsublite.spark.internal.CachedPublishers; +import java.io.IOException; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.DataType; import org.junit.Test; @@ -40,21 +39,15 @@ public class PslDataWriterTest { private final InternalRow row = mock(InternalRow.class); - private final PslWriteDataSourceOptions writeOptions = PslWriteDataSourceOptions.builder() - .setTopicPath(UnitTestExamples.exampleTopicPath()) - .build(); + private final PslWriteDataSourceOptions writeOptions = + PslWriteDataSourceOptions.builder().setTopicPath(UnitTestExamples.exampleTopicPath()).build(); + @SuppressWarnings("unchecked") private final Publisher publisher = mock(Publisher.class); private final CachedPublishers cachedPublishers = mock(CachedPublishers.class); private final PslDataWriter writer = - new PslDataWriter( - 1L, - 2L, - 3L, - Constants.DEFAULT_SCHEMA, - writeOptions, - cachedPublishers); + new PslDataWriter(1L, 2L, 3L, Constants.DEFAULT_SCHEMA, writeOptions, cachedPublishers); @Test public void testAllSuccess() throws IOException { diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java index b4ed0344..4cb21c30 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -19,7 +19,6 @@ import static org.junit.Assert.assertThrows; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; -import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import com.google.common.collect.ImmutableMap; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java b/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java index 1a4f6b7b..944f86b0 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/internal/LimitingHeadOffsetReaderTest.java @@ -30,8 +30,6 @@ import com.google.cloud.pubsublite.internal.TopicStatsClient; import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; import com.google.cloud.pubsublite.proto.Cursor; -import com.google.cloud.pubsublite.spark.internal.LimitingHeadOffsetReader; -import 
com.google.cloud.pubsublite.spark.internal.PartitionCountReader; import com.google.common.testing.FakeTicker; import java.util.concurrent.TimeUnit; import org.junit.Test; diff --git a/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java b/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java index 5dcea0f0..9d801ea2 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/internal/MultiPartitionCommitterImplTest.java @@ -23,13 +23,11 @@ import com.google.api.core.SettableApiFuture; import com.google.cloud.pubsublite.*; import com.google.cloud.pubsublite.internal.wire.Committer; +import com.google.cloud.pubsublite.spark.PslSourceOffset; import java.util.ArrayList; import java.util.List; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; - -import com.google.cloud.pubsublite.spark.PslSourceOffset; -import com.google.cloud.pubsublite.spark.internal.MultiPartitionCommitterImpl; import org.junit.Test; import org.mockito.ArgumentCaptor; From fbd165f32b546e55e1c54d841388cb18d0c0f93d Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 19:15:42 -0400 Subject: [PATCH 09/16] update --- .../spark/PslReadDataSourceOptions.java | 8 +++---- .../cloud/pubsublite/spark/PslSparkUtils.java | 22 +++++++++---------- .../pubsublite/spark/PslStreamWriter.java | 12 +++++----- .../spark/PslWriteDataSourceOptions.java | 2 +- .../spark/internal/CachedPublishers.java | 2 ++ .../internal/PslCredentialsProvider.java | 22 +++++-------------- 6 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java index 3e112ae6..f5987788 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslReadDataSourceOptions.java @@ -139,7 +139,7 @@ PartitionSubscriberFactory getSubscriberFactory() { PubsubContext context = PubsubContext.of(Constants.FRAMEWORK); SubscriberServiceSettings.Builder settingsBuilder = SubscriberServiceSettings.newBuilder() - .setCredentialsProvider(new PslCredentialsProvider(this)); + .setCredentialsProvider(new PslCredentialsProvider(credentialsKey())); ServiceClients.addDefaultMetadata( context, RoutingMetadata.of(this.subscriptionPath(), partition), settingsBuilder); try { @@ -165,7 +165,7 @@ private CursorServiceClient newCursorServiceClient() { addDefaultSettings( this.subscriptionPath().location().region(), CursorServiceSettings.newBuilder() - .setCredentialsProvider(new PslCredentialsProvider(this)))); + .setCredentialsProvider(new PslCredentialsProvider(credentialsKey())))); } catch (IOException e) { throw new IllegalStateException("Unable to create CursorServiceClient."); } @@ -185,7 +185,7 @@ private AdminServiceClient newAdminServiceClient() { addDefaultSettings( this.subscriptionPath().location().region(), AdminServiceSettings.newBuilder() - .setCredentialsProvider(new PslCredentialsProvider(this)))); + .setCredentialsProvider(new PslCredentialsProvider(credentialsKey())))); } catch (IOException e) { throw new IllegalStateException("Unable to create AdminServiceClient."); } @@ -205,7 +205,7 @@ private TopicStatsServiceClient newTopicStatsServiceClient() { addDefaultSettings( 
this.subscriptionPath().location().region(), TopicStatsServiceSettings.newBuilder() - .setCredentialsProvider(new PslCredentialsProvider(this)))); + .setCredentialsProvider(new PslCredentialsProvider(credentialsKey())))); } catch (IOException e) { throw new IllegalStateException("Unable to create TopicStatsServiceClient."); } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java index e3df2f6f..22f38683 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java @@ -95,16 +95,17 @@ public static InternalRow toInternalRow( return InternalRow.apply(asScalaBufferConverter(list).asScala()); } - private static void extractVal( + @SuppressWarnings("unchecked") + private static void extractVal( StructType inputSchema, InternalRow row, String fieldName, DataType expectedDataType, - Consumer consumer) { + Consumer consumer) { if (!inputSchema.getFieldIndex(fieldName).isEmpty()) { Integer idx = (Integer) inputSchema.getFieldIndex(fieldName).get(); try { - consumer.accept(row.get(idx, expectedDataType)); + consumer.accept((T) row.get(idx, expectedDataType)); } catch (ClassCastException e) { // This means the field has a wrong class type. } @@ -118,28 +119,27 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) { row, "key", DataTypes.BinaryType, - o -> builder.setKey(ByteString.copyFrom((byte[]) o))); + (byte[] o) -> builder.setKey(ByteString.copyFrom(o))); extractVal( inputSchema, row, "data", DataTypes.BinaryType, - o -> builder.setData(ByteString.copyFrom((byte[]) o))); + (byte[] o) -> builder.setData(ByteString.copyFrom(o))); extractVal( inputSchema, row, "event_timestamp", DataTypes.TimestampType, - o -> builder.setEventTime(Timestamps.fromMicros((long) o))); + (Long o) -> builder.setEventTime(Timestamps.fromMicros(o))); extractVal( inputSchema, row, "attributes", Constants.ATTRIBUTES_DATATYPE, - o -> { - MapData mapData = (MapData) o; + (MapData o) -> { ListMultimap attributeMap = ArrayListMultimap.create(); - mapData.foreach( + o.foreach( DataTypes.StringType, Constants.ATTRIBUTES_PER_KEY_DATATYPE, new FromJavaBiConsumer<>( @@ -149,9 +149,7 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) { values.foreach( DataTypes.BinaryType, new FromJavaBiConsumer<>( - (idx, a) -> { - attributeMap.put(key, ByteString.copyFrom((byte[]) a)); - })); + (idx, a) -> attributeMap.put(key, ByteString.copyFrom((byte[]) a)))); })); builder.setAttributes(ImmutableListMultimap.copyOf(attributeMap)); }); diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java index 557a2993..b2efaf80 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslStreamWriter.java @@ -16,8 +16,6 @@ package com.google.cloud.pubsublite.spark; -import static com.google.common.base.Preconditions.checkArgument; - import com.google.common.flogger.GoogleLogger; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.sources.v2.writer.DataWriterFactory; @@ -45,15 +43,17 @@ public void commit(long epochId, WriterCommitMessage[] messages) { @Override public void abort(long epochId, WriterCommitMessage[] messages) { log.atWarning().log( - "Epoch id: %d is aborted, including %d messages.", epochId, 
countMessages(messages));
+        "Epoch id: %d is aborted, %d messages might have been published.",
+        epochId, countMessages(messages));
   }
 
   private long countMessages(WriterCommitMessage[] messages) {
     long cnt = 0;
     for (WriterCommitMessage m : messages) {
-      checkArgument(
-          m instanceof PslWriterCommitMessage, "commit message not typed PslWriterCommitMessage");
-      cnt += ((PslWriterCommitMessage) m).numMessages();
+      // Messages are not guaranteed to be typed PslWriterCommitMessage during abort.
+      if (m instanceof PslWriterCommitMessage) {
+        cnt += ((PslWriterCommitMessage) m).numMessages();
+      }
     }
     return cnt;
   }
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java
index eb7334c6..abd524d6 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java
@@ -80,7 +80,7 @@ public static PslWriteDataSourceOptions fromSparkDataSourceOptions(DataSourceOpt
   }
 
   public PslCredentialsProvider getCredentialProvider() {
-    return new PslCredentialsProvider(this);
+    return new PslCredentialsProvider(credentialsKey());
   }
 
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
index 9de8ca29..f5dabc66 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
@@ -29,6 +29,8 @@
 /** Cached {@link Publisher}s to reuse publisher of same settings in the same task. */
 public class CachedPublishers {
 
+  // TODO(b/182322450): Use com.google.cloud.pubsublite.internal.wire.SystemExecutors
+  // once new PSL client library is released.
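+  // Publishers are keyed by their full PslWriteDataSourceOptions, so writers with
+  // identical settings in the same task share one publisher. A single listener
+  // thread suffices here: the failure listeners registered in getOrCreate only
+  // evict the failed publisher from this cache, so the callbacks are cheap.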
private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); @GuardedBy("this") diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java index e730750c..6022a655 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PslCredentialsProvider.java @@ -20,30 +20,20 @@ import com.google.api.gax.core.CredentialsProvider; import com.google.auth.Credentials; import com.google.auth.oauth2.GoogleCredentials; -import com.google.cloud.pubsublite.spark.PslReadDataSourceOptions; -import com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UncheckedIOException; +import javax.annotation.Nullable; public class PslCredentialsProvider implements CredentialsProvider { private final Credentials credentials; - public PslCredentialsProvider(PslReadDataSourceOptions options) { - if (options.credentialsKey() != null) { - this.credentials = createCredentialsFromKey(options.credentialsKey()); - } else { - this.credentials = createDefaultCredentials(); - } - } - - public PslCredentialsProvider(PslWriteDataSourceOptions options) { - if (options.credentialsKey() != null) { - this.credentials = createCredentialsFromKey(options.credentialsKey()); - } else { - this.credentials = createDefaultCredentials(); - } + public PslCredentialsProvider(@Nullable String credentialsKey) { + this.credentials = + credentialsKey != null + ? createCredentialsFromKey(credentialsKey) + : createDefaultCredentials(); } private static Credentials createCredentialsFromKey(String key) { From 128c0f031539da102738f4f1318386ee389d4026 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Tue, 30 Mar 2021 19:19:22 -0400 Subject: [PATCH 10/16] update --- .../spark/PslWriteDataSourceOptionsTest.java | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java index 4cb21c30..5cf10f50 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptionsTest.java @@ -18,13 +18,7 @@ import static org.junit.Assert.assertThrows; -import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; import com.google.common.collect.ImmutableMap; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.ObjectInput; -import java.io.ObjectInputStream; -import java.io.ObjectOutputStream; import org.apache.spark.sql.sources.v2.DataSourceOptions; import org.junit.Test; @@ -38,23 +32,4 @@ public void testInvalidTopicPath() { IllegalArgumentException.class, () -> PslWriteDataSourceOptions.fromSparkDataSourceOptions(options)); } - - @Test - public void testPublisherFactorySerializable() throws Exception { - PslWriteDataSourceOptions options = - PslWriteDataSourceOptions.builder() - .setTopicPath(UnitTestExamples.exampleTopicPath()) - .build(); - PublisherFactory obj = options.getPublisherFactory(); - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - ObjectOutputStream oos = new ObjectOutputStream(bos); - oos.writeObject(obj); - oos.flush(); - byte[] data = bos.toByteArray(); - - PublisherFactory obj2; - 
ByteArrayInputStream bis = new ByteArrayInputStream(data); - ObjectInput in = new ObjectInputStream(bis); - obj2 = (PublisherFactory) in.readObject(); - } } From fd4977e6b1aad7a2438ce52471abf8949d619762 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Wed, 31 Mar 2021 20:04:39 -0400 Subject: [PATCH 11/16] update --- .../cloud/pubsublite/spark/PslSparkUtils.java | 23 ++++++--- .../spark/PslWriteDataSourceOptions.java | 29 +++++------ .../spark/internal/CachedPublishers.java | 4 +- .../pubsublite/spark/PslStreamWriterTest.java | 50 +++++++++++++++++++ 4 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 src/test/java/com/google/cloud/pubsublite/spark/PslStreamWriterTest.java diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java index 22f38683..71c1c004 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java @@ -26,9 +26,9 @@ import com.google.cloud.pubsublite.SubscriptionPath; import com.google.cloud.pubsublite.internal.CursorClient; import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.ImmutableListMultimap; import com.google.common.collect.ListMultimap; +import com.google.common.flogger.GoogleLogger; import com.google.common.math.LongMath; import com.google.protobuf.ByteString; import com.google.protobuf.util.Timestamps; @@ -38,6 +38,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; import java.util.function.Consumer; import java.util.stream.Collectors; import org.apache.spark.sql.catalyst.InternalRow; @@ -50,9 +51,13 @@ import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.ByteArray; import org.apache.spark.unsafe.types.UTF8String; +import scala.Option; import scala.compat.java8.functionConverterImpls.FromJavaBiConsumer; public class PslSparkUtils { + + private static final GoogleLogger log = GoogleLogger.forEnclosingClass(); + @VisibleForTesting public static ArrayBasedMapData convertAttributesToSparkMap( ListMultimap attributeMap) { @@ -102,12 +107,16 @@ private static void extractVal( String fieldName, DataType expectedDataType, Consumer consumer) { - if (!inputSchema.getFieldIndex(fieldName).isEmpty()) { - Integer idx = (Integer) inputSchema.getFieldIndex(fieldName).get(); + Option idxOr; + if (!(idxOr = inputSchema.getFieldIndex(fieldName)).isEmpty()) { + Integer idx = (Integer) idxOr.get(); try { consumer.accept((T) row.get(idx, expectedDataType)); } catch (ClassCastException e) { // This means the field has a wrong class type. + log.atInfo().atMostEvery(5, TimeUnit.MINUTES).log( + "Col %s was dropped since the type doesn't match. 
Actual type: %s, expected type: %s.", + fieldName, inputSchema.apply(idx).dataType(), expectedDataType); } } } @@ -138,7 +147,8 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) { "attributes", Constants.ATTRIBUTES_DATATYPE, (MapData o) -> { - ListMultimap attributeMap = ArrayListMultimap.create(); + ImmutableListMultimap.Builder attributeMapBuilder = + ImmutableListMultimap.builder(); o.foreach( DataTypes.StringType, Constants.ATTRIBUTES_PER_KEY_DATATYPE, @@ -149,9 +159,10 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) { values.foreach( DataTypes.BinaryType, new FromJavaBiConsumer<>( - (idx, a) -> attributeMap.put(key, ByteString.copyFrom((byte[]) a)))); + (idx, a) -> + attributeMapBuilder.put(key, ByteString.copyFrom((byte[]) a)))); })); - builder.setAttributes(ImmutableListMultimap.copyOf(attributeMap)); + builder.setAttributes(attributeMapBuilder.build()); }); return builder.build(); } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java index abd524d6..44b6d95d 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslWriteDataSourceOptions.java @@ -83,51 +83,48 @@ public PslCredentialsProvider getCredentialProvider() { return new PslCredentialsProvider(credentialsKey()); } - public static Publisher createNewPublisher( - PslWriteDataSourceOptions writeOptions) { + public Publisher createNewPublisher() { return PartitionCountWatchingPublisherSettings.newBuilder() - .setTopic(writeOptions.topicPath()) + .setTopic(topicPath()) .setPublisherFactory( partition -> SinglePartitionPublisherBuilder.newBuilder() - .setTopic(writeOptions.topicPath()) + .setTopic(topicPath()) .setPartition(partition) - .setServiceClient(newServiceClient(writeOptions, partition)) + .setServiceClient(newServiceClient(partition)) .build()) - .setAdminClient(getAdminClient(writeOptions)) + .setAdminClient(getAdminClient()) .build() .instantiate(); } - private static PublisherServiceClient newServiceClient( - PslWriteDataSourceOptions writeOptions, Partition partition) throws ApiException { + private PublisherServiceClient newServiceClient(Partition partition) throws ApiException { PublisherServiceSettings.Builder settingsBuilder = PublisherServiceSettings.newBuilder(); - settingsBuilder = settingsBuilder.setCredentialsProvider(writeOptions.getCredentialProvider()); + settingsBuilder = settingsBuilder.setCredentialsProvider(getCredentialProvider()); settingsBuilder = addDefaultMetadata( PubsubContext.of(Constants.FRAMEWORK), - RoutingMetadata.of(writeOptions.topicPath(), partition), + RoutingMetadata.of(topicPath(), partition), settingsBuilder); try { return PublisherServiceClient.create( - addDefaultSettings(writeOptions.topicPath().location().region(), settingsBuilder)); + addDefaultSettings(topicPath().location().region(), settingsBuilder)); } catch (Throwable t) { throw toCanonical(t).underlying; } } - private static AdminClient getAdminClient(PslWriteDataSourceOptions writeOptions) - throws ApiException { + private AdminClient getAdminClient() throws ApiException { try { return AdminClient.create( AdminClientSettings.newBuilder() .setServiceClient( AdminServiceClient.create( addDefaultSettings( - writeOptions.topicPath().location().region(), + topicPath().location().region(), AdminServiceSettings.newBuilder() - 
.setCredentialsProvider(writeOptions.getCredentialProvider())))) - .setRegion(writeOptions.topicPath().location().region()) + .setCredentialsProvider(getCredentialProvider())))) + .setRegion(topicPath().location().region()) .build()); } catch (Throwable t) { throw toCanonical(t).underlying; diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java index f5dabc66..774fa68f 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java @@ -29,7 +29,7 @@ /** Cached {@link Publisher}s to reuse publisher of same settings in the same task. */ public class CachedPublishers { - // TODO(b/182322450): Use com.google.cloud.pubsublite.internal.wire.SystemExecutors + // TODO(jiangmichaellll): Use com.google.cloud.pubsublite.internal.wire.SystemExecutors // once new PSL client library is released. private final Executor listenerExecutor = Executors.newSingleThreadExecutor(); @@ -44,7 +44,7 @@ public synchronized Publisher getOrCreate( return publisher; } - publisher = PslWriteDataSourceOptions.createNewPublisher(writeOptions); + publisher = writeOptions.createNewPublisher(); publishers.put(writeOptions, publisher); publisher.addListener( new ApiService.Listener() { diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslStreamWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslStreamWriterTest.java new file mode 100644 index 00000000..35b525d7 --- /dev/null +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslStreamWriterTest.java @@ -0,0 +1,50 @@ +/* + * Copyright 2020 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.cloud.pubsublite.spark; + +import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; +import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; +import org.junit.Test; + +public class PslStreamWriterTest { + + private final PslStreamWriter writer = + new PslStreamWriter( + Constants.DEFAULT_SCHEMA, + PslWriteDataSourceOptions.builder() + .setTopicPath(UnitTestExamples.exampleTopicPath()) + .build()); + private final PslWriterCommitMessage message1 = PslWriterCommitMessage.create(10); + private final PslWriterCommitMessage message2 = PslWriterCommitMessage.create(5); + + private static class AbortCommitMessage implements WriterCommitMessage {} + + @Test + public void testCommit() { + writer.commit(100, new WriterCommitMessage[] {message1, message2}); + } + + @Test + public void testAbort() { + writer.abort(100, new WriterCommitMessage[] {message1, message2, new AbortCommitMessage()}); + } + + @Test + public void testCreateFactory() { + writer.createWriterFactory(); + } +} From 2726ecae8e51d185cbdbc8d7fc022d12d949b32f Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Thu, 1 Apr 2021 13:10:00 -0400 Subject: [PATCH 12/16] update --- .../java/com/google/cloud/pubsublite/spark/PslSparkUtils.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java index 71c1c004..fac11d02 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java @@ -107,8 +107,8 @@ private static void extractVal( String fieldName, DataType expectedDataType, Consumer consumer) { - Option idxOr; - if (!(idxOr = inputSchema.getFieldIndex(fieldName)).isEmpty()) { + Option idxOr = inputSchema.getFieldIndex(fieldName); + if (!idxOr.isEmpty()) { Integer idx = (Integer) idxOr.get(); try { consumer.accept((T) row.get(idx, expectedDataType)); From 6ad7373f2d8a84befd3fd16f8a0e815bddb18250 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Thu, 1 Apr 2021 15:55:54 -0400 Subject: [PATCH 13/16] update --- .../cloud/pubsublite/spark/Constants.java | 23 +++++++-- .../cloud/pubsublite/spark/PslDataSource.java | 1 + .../cloud/pubsublite/spark/PslDataWriter.java | 35 ++++++------- .../spark/PslDataWriterFactory.java | 4 +- .../cloud/pubsublite/spark/PslSparkUtils.java | 49 ++++++++++++++----- .../spark/internal/CachedPublishers.java | 2 +- .../spark/internal/PublisherFactory.java | 26 ++++++++++ .../pubsublite/spark/PslDataWriterTest.java | 15 +++--- .../pubsublite/spark/PslSparkUtilsTest.java | 46 ++++++++++++----- 9 files changed, 144 insertions(+), 57 deletions(-) create mode 100644 src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java diff --git a/src/main/java/com/google/cloud/pubsublite/spark/Constants.java b/src/main/java/com/google/cloud/pubsublite/spark/Constants.java index b8877745..9ad29b23 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/Constants.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/Constants.java @@ -17,7 +17,10 @@ package com.google.cloud.pubsublite.spark; import com.google.cloud.pubsublite.internal.wire.PubsubContext; +import com.google.common.collect.ImmutableMap; +import java.util.Map; import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.MapType; import 
org.apache.spark.sql.types.Metadata;
@@ -28,21 +31,33 @@ public class Constants {
   public static long DEFAULT_BYTES_OUTSTANDING = 50_000_000;
   public static long DEFAULT_MESSAGES_OUTSTANDING = Long.MAX_VALUE;
   public static long DEFAULT_MAX_MESSAGES_PER_BATCH = Long.MAX_VALUE;
+
   public static ArrayType ATTRIBUTES_PER_KEY_DATATYPE =
       DataTypes.createArrayType(DataTypes.BinaryType);
   public static MapType ATTRIBUTES_DATATYPE =
       DataTypes.createMapType(DataTypes.StringType, ATTRIBUTES_PER_KEY_DATATYPE);
+  public static Map<String, DataType> PUBLISH_FIELD_TYPES =
+      ImmutableMap.of(
+          "key", DataTypes.BinaryType,
+          "data", DataTypes.BinaryType,
+          "attributes", ATTRIBUTES_DATATYPE,
+          "event_timestamp", DataTypes.TimestampType);
   public static StructType DEFAULT_SCHEMA =
       new StructType(
           new StructField[] {
             new StructField("subscription", DataTypes.StringType, false, Metadata.empty()),
             new StructField("partition", DataTypes.LongType, false, Metadata.empty()),
             new StructField("offset", DataTypes.LongType, false, Metadata.empty()),
-            new StructField("key", DataTypes.BinaryType, false, Metadata.empty()),
-            new StructField("data", DataTypes.BinaryType, false, Metadata.empty()),
+            new StructField("key", PUBLISH_FIELD_TYPES.get("key"), false, Metadata.empty()),
+            new StructField("data", PUBLISH_FIELD_TYPES.get("data"), false, Metadata.empty()),
             new StructField("publish_timestamp", DataTypes.TimestampType, false, Metadata.empty()),
-            new StructField("event_timestamp", DataTypes.TimestampType, true, Metadata.empty()),
-            new StructField("attributes", ATTRIBUTES_DATATYPE, true, Metadata.empty())
+            new StructField(
+                "event_timestamp",
+                PUBLISH_FIELD_TYPES.get("event_timestamp"),
+                true,
+                Metadata.empty()),
+            new StructField(
+                "attributes", PUBLISH_FIELD_TYPES.get("attributes"), true, Metadata.empty())
           });
 
   public static final PubsubContext.Framework FRAMEWORK = PubsubContext.Framework.of("SPARK");
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java
index de4142e6..2ef2535d 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataSource.java
@@ -119,6 +119,7 @@ public MicroBatchReader createMicroBatchReader(
   @Override
   public StreamWriter createStreamWriter(
       String queryId, StructType schema, OutputMode mode, DataSourceOptions options) {
+    PslSparkUtils.verifyWriteInputSchema(schema);
     PslWriteDataSourceOptions pslWriteDataSourceOptions =
         PslWriteDataSourceOptions.fromSparkDataSourceOptions(options);
     return new PslStreamWriter(schema, pslWriteDataSourceOptions);
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java
index dc2c4e12..34433234 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java
@@ -17,14 +17,17 @@ package com.google.cloud.pubsublite.spark;
 
 import com.google.api.core.ApiFuture;
+import com.google.api.core.ApiService;
 import com.google.cloud.pubsublite.MessageMetadata;
+import com.google.cloud.pubsublite.internal.Publisher;
 import com.google.cloud.pubsublite.spark.internal.CachedPublishers;
-import com.google.common.annotations.VisibleForTesting;
+import com.google.cloud.pubsublite.spark.internal.PublisherFactory;
 import com.google.common.flogger.GoogleLogger;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
 import java.util.concurrent.ExecutionException;
 import javax.annotation.concurrent.GuardedBy;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -40,42 +43,35 @@ public class PslDataWriter implements DataWriter<InternalRow> {
 
   private final long partitionId, taskId, epochId;
   private final StructType inputSchema;
-  private final PslWriteDataSourceOptions writeOptions;
-  private final CachedPublishers cachedPublishers; // just a reference
+  private final PublisherFactory publisherFactory;
 
   @GuardedBy("this")
-  private final List<ApiFuture<MessageMetadata>> futures = new ArrayList<>();
+  private Optional<Publisher<MessageMetadata>> publisher = Optional.empty();
 
-  public PslDataWriter(
-      long partitionId,
-      long taskId,
-      long epochId,
-      StructType schema,
-      PslWriteDataSourceOptions writeOptions) {
-    this(partitionId, taskId, epochId, schema, writeOptions, CACHED_PUBLISHERS);
-  }
+  @GuardedBy("this")
+  private final List<ApiFuture<MessageMetadata>> futures = new ArrayList<>();
 
-  @VisibleForTesting
   public PslDataWriter(
       long partitionId,
       long taskId,
       long epochId,
       StructType schema,
-      PslWriteDataSourceOptions writeOptions,
-      CachedPublishers cachedPublishers) {
+      PublisherFactory publisherFactory) {
     this.partitionId = partitionId;
     this.taskId = taskId;
     this.epochId = epochId;
     this.inputSchema = schema;
-    this.writeOptions = writeOptions;
-    this.cachedPublishers = cachedPublishers;
+    this.publisherFactory = publisherFactory;
   }
 
   @Override
   public synchronized void write(InternalRow record) {
+    if (!publisher.isPresent() || publisher.get().state() != ApiService.State.RUNNING) {
+      publisher = Optional.of(publisherFactory.newPublisher(CACHED_PUBLISHERS));
+    }
     futures.add(
-        cachedPublishers
-            .getOrCreate(writeOptions)
+        publisher
+            .get()
             .publish(Objects.requireNonNull(PslSparkUtils.toPubSubMessage(inputSchema, record))));
   }
 
@@ -85,6 +81,7 @@ public synchronized WriterCommitMessage commit() throws IOException {
       try {
         f.get();
       } catch (InterruptedException | ExecutionException e) {
+        publisher = Optional.empty();
         throw new IOException(e);
       }
     }
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java
index 0a1e0cb7..dac10a15 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java
@@ -16,6 +16,7 @@
 
 package com.google.cloud.pubsublite.spark;
 
+import com.google.cloud.pubsublite.spark.internal.PublisherFactory;
 import java.io.Serializable;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.writer.DataWriter;
@@ -35,6 +36,7 @@ public PslDataWriterFactory(StructType inputSchema, PslWriteDataSourceOptions wr
 
   @Override
   public DataWriter<InternalRow> createDataWriter(int partitionId, long taskId, long epochId) {
-    return new PslDataWriter(partitionId, taskId, epochId, inputSchema, writeOptions);
+    PublisherFactory pg = (cp) -> cp.getOrCreate(writeOptions);
+    return new PslDataWriter(partitionId, taskId, epochId, inputSchema, pg);
   }
 }
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java
index fac11d02..2510315a 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/PslSparkUtils.java
@@ -48,6 +48,7 @@
 import org.apache.spark.sql.catalyst.util.MapData;
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.unsafe.types.ByteArray;
 import org.apache.spark.unsafe.types.UTF8String;
@@ -110,14 +111,9 @@ private static void extractVal(
     Option<Object> idxOr = inputSchema.getFieldIndex(fieldName);
     if (!idxOr.isEmpty()) {
       Integer idx = (Integer) idxOr.get();
-      try {
-        consumer.accept((T) row.get(idx, expectedDataType));
-      } catch (ClassCastException e) {
-        // This means the field has a wrong class type.
-        log.atInfo().atMostEvery(5, TimeUnit.MINUTES).log(
-            "Col %s was dropped since the type doesn't match. Actual type: %s, expected type: %s.",
-            fieldName, inputSchema.apply(idx).dataType(), expectedDataType);
-      }
+      // The DataType should match and never throw ClassCastException, since the type
+      // match was already verified in the driver node.
+      consumer.accept((T) row.get(idx, expectedDataType));
     }
   }
 
@@ -127,25 +123,25 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) {
         inputSchema,
         row,
         "key",
-        DataTypes.BinaryType,
+        Constants.PUBLISH_FIELD_TYPES.get("key"),
         (byte[] o) -> builder.setKey(ByteString.copyFrom(o)));
     extractVal(
         inputSchema,
         row,
         "data",
-        DataTypes.BinaryType,
+        Constants.PUBLISH_FIELD_TYPES.get("data"),
         (byte[] o) -> builder.setData(ByteString.copyFrom(o)));
     extractVal(
         inputSchema,
         row,
         "event_timestamp",
-        DataTypes.TimestampType,
+        Constants.PUBLISH_FIELD_TYPES.get("event_timestamp"),
         (Long o) -> builder.setEventTime(Timestamps.fromMicros(o)));
     extractVal(
         inputSchema,
         row,
         "attributes",
-        Constants.ATTRIBUTES_DATATYPE,
+        Constants.PUBLISH_FIELD_TYPES.get("attributes"),
         (MapData o) -> {
           ImmutableListMultimap.Builder<String, ByteString> attributeMapBuilder =
               ImmutableListMultimap.builder();
@@ -167,6 +163,35 @@ public static Message toPubSubMessage(StructType inputSchema, InternalRow row) {
     return builder.build();
   }
 
+  /**
+   * Verifies that the data fields used for publishing, if present in the input schema, have the
+   * expected Spark DataType.
+   *
+   * @param inputSchema input table schema to write to Pub/Sub Lite.
+   * @throws IllegalArgumentException if any DataType mismatch is detected.
+   */
+  public static void verifyWriteInputSchema(StructType inputSchema) {
+    Constants.PUBLISH_FIELD_TYPES.forEach(
+        (k, v) -> {
+          Option<Object> idxOr = inputSchema.getFieldIndex(k);
+          if (!idxOr.isEmpty()) {
+            StructField f = inputSchema.apply((int) idxOr.get());
+            if (f.dataType() != v) {
+              throw new IllegalArgumentException(
+                  String.format(
+                      "Column %s in input schema to write to "
+                          + "Pub/Sub Lite has a wrong DataType. Actual: %s, expected: %s.",
+                      k, f.dataType(), v));
+            }
+          } else {
+            log.atInfo().atMostEvery(5, TimeUnit.MINUTES).log(
+                "Input schema to write to Pub/Sub Lite does not contain the %s column; "
+                    + "this field will be left empty for all rows.",
+                k);
+          }
+        });
+  }
+
   public static SparkSourceOffset toSparkSourceOffset(PslSourceOffset pslSourceOffset) {
     return new SparkSourceOffset(
         pslSourceOffset.partitionOffsetMap().entrySet().stream()
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
index 774fa68f..711a241a 100644
--- a/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
+++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/CachedPublishers.java
@@ -40,7 +40,7 @@ public synchronized Publisher<MessageMetadata> getOrCreate(
       PslWriteDataSourceOptions writeOptions) {
     Publisher<MessageMetadata> publisher = publishers.get(writeOptions);
-    if (publisher != null) {
+    if (publisher != null && publisher.state() == ApiService.State.RUNNING) {
       return publisher;
     }
diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java
new file mode 100644
index 00000000..a8c93906
--- /dev/null
+++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package com.google.cloud.pubsublite.spark.internal; + +import com.google.cloud.pubsublite.MessageMetadata; +import com.google.cloud.pubsublite.internal.Publisher; +import java.io.Serializable; + +public interface PublisherFactory extends Serializable { + + Publisher newPublisher(CachedPublishers cp); +} diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 7ec5efee..1574f968 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -28,8 +28,7 @@ import com.google.cloud.pubsublite.Offset; import com.google.cloud.pubsublite.Partition; import com.google.cloud.pubsublite.internal.Publisher; -import com.google.cloud.pubsublite.internal.testing.UnitTestExamples; -import com.google.cloud.pubsublite.spark.internal.CachedPublishers; +import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import java.io.IOException; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.DataType; @@ -39,19 +38,17 @@ public class PslDataWriterTest { private final InternalRow row = mock(InternalRow.class); - private final PslWriteDataSourceOptions writeOptions = - PslWriteDataSourceOptions.builder().setTopicPath(UnitTestExamples.exampleTopicPath()).build(); - @SuppressWarnings("unchecked") private final Publisher publisher = mock(Publisher.class); - private final CachedPublishers cachedPublishers = mock(CachedPublishers.class); + private final PublisherFactory publisherFactory = mock(PublisherFactory.class); + private final PslDataWriter writer = - new PslDataWriter(1L, 2L, 3L, Constants.DEFAULT_SCHEMA, writeOptions, cachedPublishers); + new PslDataWriter(1L, 2L, 3L, Constants.DEFAULT_SCHEMA, publisherFactory); @Test public void testAllSuccess() throws IOException { - when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(publisherFactory.newPublisher(any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn( ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); @@ -63,7 +60,7 @@ public void testAllSuccess() throws IOException { @Test public void testPartialFail() { - when(cachedPublishers.getOrCreate(any())).thenReturn(publisher); + when(publisherFactory.newPublisher(any())).thenReturn(publisher); when(publisher.publish(any())) .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java index d49876e6..7081082f 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslSparkUtilsTest.java @@ -17,6 +17,7 @@ package com.google.cloud.pubsublite.spark; import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; import static scala.collection.JavaConverters.asScalaBufferConverter; import com.google.cloud.pubsublite.Message; @@ -153,32 +154,55 @@ public void testToPubSubMessage() { } @Test - public void testToPubSubMessageTypeMismatch() { + public void testToPubSubMessageLongForEventTimestamp() { + Message expectedMsg = Message.builder().setEventTime(Timestamps.fromMicros(100000L)).build(); + StructType structType = new 
StructType( new StructField[] { - new StructField("key", DataTypes.TimestampType, false, Metadata.empty()) + new StructField("event_timestamp", DataTypes.LongType, false, Metadata.empty()) }); List<Object> list = Collections.singletonList(/*Timestamp=*/ 100000L); InternalRow row = InternalRow.apply(asScalaBufferConverter(list).asScala()); Message message = PslSparkUtils.toPubSubMessage(structType, row); - assertThat(message).isEqualTo(Message.builder().build()); + assertThat(message).isEqualTo(expectedMsg); } @Test - public void testToPubSubMessageLongForEventTimestamp() { - Message expectedMsg = Message.builder().setEventTime(Timestamps.fromMicros(100000L)).build(); + public void testVerifyWriteInputSchema() { + PslSparkUtils.verifyWriteInputSchema(Constants.DEFAULT_SCHEMA); - StructType structType = + StructType goodThoughMissing = new StructType( new StructField[] { - new StructField("event_timestamp", DataTypes.LongType, false, Metadata.empty()) + new StructField("offset", DataTypes.LongType, false, Metadata.empty()), + new StructField( + "key", Constants.PUBLISH_FIELD_TYPES.get("key"), false, Metadata.empty()), + new StructField( + "publish_timestamp", DataTypes.TimestampType, false, Metadata.empty()), + new StructField( + "attributes", + Constants.PUBLISH_FIELD_TYPES.get("attributes"), + true, + Metadata.empty()) }); - List<Object> list = Collections.singletonList(/*Timestamp=*/ 100000L); - InternalRow row = InternalRow.apply(asScalaBufferConverter(list).asScala()); + PslSparkUtils.verifyWriteInputSchema(goodThoughMissing); - Message message = PslSparkUtils.toPubSubMessage(structType, row); - assertThat(message).isEqualTo(expectedMsg); + StructType bad = + new StructType( + new StructField[] { + new StructField("offset", DataTypes.LongType, false, Metadata.empty()), + // Key field has the wrong DataType + new StructField("key", DataTypes.StringType, false, Metadata.empty()), + new StructField( + "publish_timestamp", DataTypes.TimestampType, false, Metadata.empty()), + new StructField( + "attributes", + Constants.PUBLISH_FIELD_TYPES.get("attributes"), + true, + Metadata.empty()) }); + assertThrows(IllegalArgumentException.class, () -> PslSparkUtils.verifyWriteInputSchema(bad)); } }
From 4610c0ca39d93492945c2be033e8ef064486b862 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Thu, 1 Apr 2021 17:21:17 -0400 Subject: [PATCH 14/16] update --- clirr-ignored-differences.xml | 25 +++++++++++++++++++ .../pubsublite/spark/PslDataWriterTest.java | 22 ++++++++++++---- 2 files changed, 42 insertions(+), 5 deletions(-)
diff --git a/clirr-ignored-differences.xml b/clirr-ignored-differences.xml index 1aa41e4f..6aa9dcf9 100644 --- a/clirr-ignored-differences.xml +++ b/clirr-ignored-differences.xml @@ -12,4 +12,29 @@ * * + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/LimitingHeadOffsetReader</className> + </difference> + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/MultiPartitionCommitter*</className> + </difference> + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/PartitionSubscriberFactory</className> + </difference> + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/PerTopicHeadOffsetReader</className> + </difference> + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/PslCredentialsProvider</className> + </difference> + <difference> + <differenceType>8001</differenceType> + <className>com/google/cloud/pubsublite/spark/PslDataSourceOptions*</className> + </difference> + </differences> \ No newline at end of file
diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 1574f968..528b280f 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -20,6
+20,7 @@ import static org.junit.Assert.assertThrows; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -30,8 +31,12 @@ import com.google.cloud.pubsublite.internal.Publisher; import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import java.io.IOException; +import java.nio.charset.StandardCharsets; import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; import org.junit.Test; public class PslDataWriterTest { @@ -42,9 +47,14 @@ public class PslDataWriterTest { private final Publisher publisher = mock(Publisher.class); private final PublisherFactory publisherFactory = mock(PublisherFactory.class); + private final StructType keyOnly = + new StructType( + new StructField[] { + new StructField( + "key", Constants.PUBLISH_FIELD_TYPES.get("key"), false, Metadata.empty()), + }); - private final PslDataWriter writer = - new PslDataWriter(1L, 2L, 3L, Constants.DEFAULT_SCHEMA, publisherFactory); + private final PslDataWriter writer = new PslDataWriter(1L, 2L, 3L, keyOnly, publisherFactory); @Test public void testAllSuccess() throws IOException { @@ -52,7 +62,8 @@ public void testAllSuccess() throws IOException { when(publisher.publish(any())) .thenReturn( ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); - when(row.get(anyInt(), any(DataType.class))).thenReturn(0); + when(row.get(anyInt(), eq(DataTypes.BinaryType))) + .thenReturn("abc".getBytes(StandardCharsets.UTF_8)); writer.write(row); writer.write(row); assertThat(writer.commit()).isEqualTo(PslWriterCommitMessage.create(2)); @@ -64,7 +75,8 @@ public void testPartialFail() { when(publisher.publish(any())) .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); - when(row.get(anyInt(), any(DataType.class))).thenReturn(0); + when(row.get(anyInt(), eq(DataTypes.BinaryType))) + .thenReturn("abc".getBytes(StandardCharsets.UTF_8)); writer.write(row); writer.write(row); assertThrows(IOException.class, writer::commit); From 2c3a87d74b40313d43f7aed6fcb443bc85cd8e61 Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Fri, 2 Apr 2021 11:50:37 -0400 Subject: [PATCH 15/16] update --- .../com/google/cloud/pubsublite/spark/PslDataWriter.java | 5 +---- .../google/cloud/pubsublite/spark/PslDataWriterFactory.java | 5 ++++- .../cloud/pubsublite/spark/internal/PublisherFactory.java | 2 +- .../com/google/cloud/pubsublite/spark/PslDataWriterTest.java | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java index 34433234..631fb2d3 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriter.java @@ -20,7 +20,6 @@ import com.google.api.core.ApiService; import com.google.cloud.pubsublite.MessageMetadata; import com.google.cloud.pubsublite.internal.Publisher; -import com.google.cloud.pubsublite.spark.internal.CachedPublishers; import com.google.cloud.pubsublite.spark.internal.PublisherFactory; 
import com.google.common.flogger.GoogleLogger; import java.io.IOException; @@ -39,8 +38,6 @@ public class PslDataWriter implements DataWriter { private static final GoogleLogger log = GoogleLogger.forEnclosingClass(); - private static final CachedPublishers CACHED_PUBLISHERS = new CachedPublishers(); - private final long partitionId, taskId, epochId; private final StructType inputSchema; private final PublisherFactory publisherFactory; @@ -67,7 +64,7 @@ public PslDataWriter( @Override public synchronized void write(InternalRow record) { if (!publisher.isPresent() || publisher.get().state() != ApiService.State.RUNNING) { - publisher = Optional.of(publisherFactory.newPublisher(CACHED_PUBLISHERS)); + publisher = Optional.of(publisherFactory.newPublisher()); } futures.add( publisher diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index dac10a15..51713c3d 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -16,6 +16,7 @@ package com.google.cloud.pubsublite.spark; +import com.google.cloud.pubsublite.spark.internal.CachedPublishers; import com.google.cloud.pubsublite.spark.internal.PublisherFactory; import java.io.Serializable; import org.apache.spark.sql.catalyst.InternalRow; @@ -26,6 +27,8 @@ public class PslDataWriterFactory implements Serializable, DataWriterFactory { private static final long serialVersionUID = -6904546364310978844L; + private static final CachedPublishers CACHED_PUBLISHERS = new CachedPublishers(); + private final StructType inputSchema; private final PslWriteDataSourceOptions writeOptions; @@ -36,7 +39,7 @@ public PslDataWriterFactory(StructType inputSchema, PslWriteDataSourceOptions wr @Override public DataWriter createDataWriter(int partitionId, long taskId, long epochId) { - PublisherFactory pg = (cp) -> cp.getOrCreate(writeOptions); + PublisherFactory pg = () -> CACHED_PUBLISHERS.getOrCreate(writeOptions); return new PslDataWriter(partitionId, taskId, epochId, inputSchema, pg); } } diff --git a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java index a8c93906..81750def 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/internal/PublisherFactory.java @@ -22,5 +22,5 @@ public interface PublisherFactory extends Serializable { - Publisher newPublisher(CachedPublishers cp); + Publisher newPublisher(); } diff --git a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java index 528b280f..a3f6f1a8 100644 --- a/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java +++ b/src/test/java/com/google/cloud/pubsublite/spark/PslDataWriterTest.java @@ -58,7 +58,7 @@ public class PslDataWriterTest { @Test public void testAllSuccess() throws IOException { - when(publisherFactory.newPublisher(any())).thenReturn(publisher); + when(publisherFactory.newPublisher()).thenReturn(publisher); when(publisher.publish(any())) .thenReturn( ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))); @@ -71,7 +71,7 @@ public void testAllSuccess() throws IOException { @Test public void testPartialFail() { - 
when(publisherFactory.newPublisher(any())).thenReturn(publisher); + when(publisherFactory.newPublisher()).thenReturn(publisher); when(publisher.publish(any())) .thenReturn(ApiFutures.immediateFuture(MessageMetadata.of(Partition.of(0L), Offset.of(0L)))) .thenReturn(ApiFutures.immediateFailedFuture(new InternalError(""))); From 165ee803f8771780f43d821457c9836484bce8bd Mon Sep 17 00:00:00 2001 From: Michael Jiang Date: Fri, 2 Apr 2021 11:55:42 -0400 Subject: [PATCH 16/16] update --- .../google/cloud/pubsublite/spark/PslDataWriterFactory.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java index 51713c3d..12d95921 100644 --- a/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java +++ b/src/main/java/com/google/cloud/pubsublite/spark/PslDataWriterFactory.java @@ -39,7 +39,7 @@ public PslDataWriterFactory(StructType inputSchema, PslWriteDataSourceOptions wr @Override public DataWriter createDataWriter(int partitionId, long taskId, long epochId) { - PublisherFactory pg = () -> CACHED_PUBLISHERS.getOrCreate(writeOptions); - return new PslDataWriter(partitionId, taskId, epochId, inputSchema, pg); + PublisherFactory pf = () -> CACHED_PUBLISHERS.getOrCreate(writeOptions); + return new PslDataWriter(partitionId, taskId, epochId, inputSchema, pf); } }
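
Taken together, the writer-side changes in this series reduce to a small pattern: a Serializable factory is shipped from the driver to executors, each data writer lazily acquires a publisher (replacing one that is no longer running), buffers the publish futures, and acknowledges the commit only after every future has resolved. Below is a minimal self-contained sketch of that flow; ToyPublisher, ToyPublisherFactory, and ToyDataWriter are hypothetical stand-ins for the connector's Publisher<MessageMetadata>, PublisherFactory, and PslDataWriter, not the real API.

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;

/** Stand-in for Publisher<MessageMetadata>; publish() is asynchronous. */
interface ToyPublisher {
  boolean isRunning();
  CompletableFuture<Void> publish(String record);
}

/** Serializable so Spark can ship the factory from the driver to executors. */
@FunctionalInterface
interface ToyPublisherFactory extends Serializable {
  ToyPublisher newPublisher();
}

final class ToyDataWriter {
  private final ToyPublisherFactory factory;
  private Optional<ToyPublisher> publisher = Optional.empty();
  private final List<CompletableFuture<Void>> futures = new ArrayList<>();

  ToyDataWriter(ToyPublisherFactory factory) {
    this.factory = factory;
  }

  synchronized void write(String record) {
    // Lazily acquire a publisher, and replace one that has stopped running.
    if (!publisher.isPresent() || !publisher.get().isRunning()) {
      publisher = Optional.of(factory.newPublisher());
    }
    futures.add(publisher.get().publish(record));
  }

  synchronized long commit() throws ExecutionException, InterruptedException {
    // Succeed only once every outstanding publish future has resolved.
    // The real writer resets its cached publisher and throws IOException here
    // on failure, so the next attempt re-acquires a fresh publisher.
    for (CompletableFuture<Void> f : futures) {
      f.get();
    }
    return futures.size();
  }
}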
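The verifyWriteInputSchema change moves type checking from a per-row ClassCastException catch on executors to a one-time driver-side check. A rough sketch of the same check over a plain map follows; the string type names are illustrative stand-ins for Spark DataTypes, whereas the real method walks Constants.PUBLISH_FIELD_TYPES against a StructType.

import java.util.LinkedHashMap;
import java.util.Map;

final class PublishSchemaCheck {
  /** Stand-in for Constants.PUBLISH_FIELD_TYPES: column name -> expected type. */
  private static final Map<String, String> PUBLISH_FIELD_TYPES = new LinkedHashMap<>();

  static {
    PUBLISH_FIELD_TYPES.put("key", "BinaryType");
    PUBLISH_FIELD_TYPES.put("data", "BinaryType");
    PUBLISH_FIELD_TYPES.put("event_timestamp", "TimestampType");
    PUBLISH_FIELD_TYPES.put("attributes", "MapType");
  }

  /** Fails fast on the driver so executors can cast without catching ClassCastException. */
  static void verify(Map<String, String> inputSchema) {
    PUBLISH_FIELD_TYPES.forEach(
        (column, expected) -> {
          String actual = inputSchema.get(column);
          if (actual == null) {
            // Missing columns are allowed; the field is left empty for all rows.
            return;
          }
          if (!actual.equals(expected)) {
            throw new IllegalArgumentException(
                String.format(
                    "Column %s has a wrong DataType. Actual: %s, expected: %s.",
                    column, actual, expected));
          }
        });
  }
}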
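For context, this is roughly how the write path is exercised from user code once the series lands. The format short name and option key below ("pubsublite", "pubsublite.topic") are assumptions based on the connector's conventions, and the topic path and checkpoint location are placeholders.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;

public final class PslWriteExample {
  public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().appName("psl-write-example").getOrCreate();

    // Columns must use the publish field types verified on the driver:
    // key/data as binary, event_timestamp as timestamp, attributes as a map.
    Dataset<Row> df =
        spark
            .readStream()
            .format("rate")
            .load()
            .selectExpr(
                "CAST(CAST(value AS STRING) AS BINARY) AS key",
                "CAST(CAST(value AS STRING) AS BINARY) AS data",
                "timestamp AS event_timestamp");

    StreamingQuery query =
        df.writeStream()
            .format("pubsublite") // assumed short name
            .option(
                "pubsublite.topic", // assumed option key
                "projects/my-project/locations/us-central1-a/topics/my-topic")
            .option("checkpointLocation", "/tmp/psl-example-checkpoint")
            .start();
    query.awaitTermination();
  }
}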