S3g sidecar passthru #8287

Closed · wants to merge 51 commits

Changes from 34 commits

Commits (51)
ed20c5c
Switch to local s2 from external s2 module
msteffen Aug 12, 2022
49a8aef
WIP
lukemarsden Oct 7, 2022
ee9cd86
WIP on proxying to real S3/minio backend in certain cases for s3_out
lukemarsden Oct 11, 2022
9614e36
WIP - getting a 404 for some reason. Need to understand the mux routi…
lukemarsden Oct 11, 2022
5623711
More WIP
lukemarsden Oct 12, 2022
a906fe5
example
lukemarsden Oct 12, 2022
5f80fd0
bump, debugging
lukemarsden Oct 12, 2022
1c75c42
debug logging from spark was making it unbearably slow
lukemarsden Oct 12, 2022
c2854f4
mega find and replace
lukemarsden Oct 14, 2022
c4447da
Debugging rewriting...
lukemarsden Oct 14, 2022
272da07
try replacing bucket name and path separately
lukemarsden Oct 17, 2022
ff66799
REVERSE the transformation in the response
lukemarsden Oct 17, 2022
d21527b
Handle XML style bucket rewriting too
lukemarsden Oct 17, 2022
88ae87a
Thank you linter (for once)
lukemarsden Oct 17, 2022
a863d24
why are you getting stuck
lukemarsden Oct 17, 2022
20892b9
Try NOT transforming the path within the bucket, see if that works
lukemarsden Oct 17, 2022
de3068c
Try to get back to sanity...
lukemarsden Oct 17, 2022
09f725d
When it works, debug logging in spark breaks loki
lukemarsden Oct 18, 2022
1391f21
Switch back to trying to get bucket renaming working
lukemarsden Oct 18, 2022
ed532f7
turn back on signing and start mutating bucket name again
lukemarsden Oct 18, 2022
0158e0d
The last commit worked! Now try manipulating the path as well
lukemarsden Oct 18, 2022
0d2acb7
add stack to log message. This is now working
lukemarsden Oct 18, 2022
a6f9463
performance - don't rewrite octet streams
lukemarsden Oct 18, 2022
00ffc99
cleanup
lukemarsden Oct 18, 2022
075727d
more cleanup
lukemarsden Oct 18, 2022
94b6b57
cleanup
lukemarsden Oct 18, 2022
0e228bf
revert vendoring s2, not needed for the direction we took
lukemarsden Oct 18, 2022
e33bcfa
Merge branch '2.3.x' into s3g-sidecar-passthru
lukemarsden Oct 18, 2022
e80fcc0
Cleanup
lukemarsden Oct 18, 2022
a2b4105
Some WIP towards supporting real S3 as well. Needs testing in a reali…
lukemarsden Oct 18, 2022
e6c05b1
First pass on copying data from backend job-scoped bucket location to…
lukemarsden Oct 18, 2022
fb8d917
Try creating a new commit
lukemarsden Oct 18, 2022
60ebb8b
Comment
lukemarsden Oct 18, 2022
846176d
Revert docker changes (#8293)
chainlink Oct 18, 2022
0220cc8
Put S3 copying code in the right place - it doesn't actually work yet…
lukemarsden Oct 18, 2022
13e5c88
Merge branch 's3g-sidecar-passthru' of github.com:pachyderm/pachyderm…
lukemarsden Oct 18, 2022
d53ea8b
The copy has to happen from the storage container because that has th…
lukemarsden Oct 18, 2022
2296eb5
Revert "Revert docker changes (#8293)"
lukemarsden Oct 18, 2022
0f25c77
Improve logging
lukemarsden Oct 18, 2022
ceb09bb
Revert "Revert "Revert docker changes (#8293)""
lukemarsden Oct 18, 2022
368a25c
cleanup - this code wasn't meant to be here as well
lukemarsden Oct 18, 2022
d0392fc
Attempt cleanup after copying job-scoped prefix into pfs
lukemarsden Oct 19, 2022
28b673a
Revert "Revert "Revert "Revert docker changes (#8293)"""
lukemarsden Oct 19, 2022
68b6cd2
logging
lukemarsden Oct 19, 2022
31aa3d2
Don't do the cleanup from inside the mfc
lukemarsden Oct 19, 2022
42f5351
Revert "Revert "Revert "Revert "Revert docker changes (#8293)""""
lukemarsden Oct 19, 2022
2420d4b
Cleanup stale comments
lukemarsden Oct 19, 2022
368583e
More comment cleanup
lukemarsden Oct 19, 2022
56c094c
Fix up bail out logic
chainlink Oct 20, 2022
ead11f9
Fix up logic
chainlink Oct 20, 2022
af9704b
Update Sign4 call to use the Security Token
bbonenfant Oct 20, 2022
6 changes: 6 additions & 0 deletions examples/spark/s3-out/Dockerfile
@@ -0,0 +1,6 @@
FROM jupyter/pyspark-notebook:spark-3.3.0
WORKDIR /home/jovyan
ADD spark.py /home/jovyan/
RUN wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.3/hadoop-aws-3.3.3.jar
RUN wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.264/aws-java-sdk-bundle-1.12.264.jar
ENTRYPOINT ["spark-submit", "--executor-memory=15g", "--driver-memory=15g", "--jars='hadoop-aws-3.3.3.jar,aws-java-sdk-bundle-1.12.264.jar'", "/home/jovyan/spark.py"]
48 changes: 48 additions & 0 deletions examples/spark/s3-out/README.md
@@ -0,0 +1,48 @@
# Spark writing to `s3_out` in `raw_s3_out` mode

Spark writes to `s3_out` in ways that the normal Pachyderm S3 gateway doesn't handle well.

We have a special alpha `s3_out` feature mode called `raw_s3_out`.
You can enable it by adding an annotation to your pipeline JSON:

```
"metadata": {
"annotations": {
"raw_s3_out": "true"
}
},
```

Writes to `s3_out` will then work with Spark, even when Spark is writing a large amount of data. (With the normal S3 gateway, you see slowdowns and errors related to `copyFile` failing.)

This directory contains a worked example. We've built and pushed the Docker image for you already, so all you need to do is run:
> Review comment (PR author): link to the bits?

```
pachctl create repo poke_s3
pachctl create branch poke_s3@master
```
This gives us an input repo we can poke to make the Spark pipeline start.

```
pachctl create pipeline -f s3-spark.json
```

Poke the pipeline to start it:

```
pachctl put file -f /etc/passwd poke_s3@master:/test01
```

Then list the output repo:

```
pachctl list file spark_s3_demo@master
```

Observe that the result (about 190MB of the phrase "INFINITEIMPROBABILITY" repeated over and over again, a homage to [The Guide](https://sites.google.com/site/h2g2theguide/Index/i/149246)) has been written there.


## How it works (advanced details 🤓)

You don't need to know how this mode works to use it, but here's a summary in case you're interested.

We implement this by passing those S3 requests directly through to the backing S3 store (with some light protocol hacking to rewrite bucket names, paths and authentication credentials).

This means that real S3 (or minio) is processing the complex things that Spark does with the S3 protocol, and when it's finished, we just copy the result back into the output repo in PFS.
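
To make that concrete, here's a minimal, illustrative Go sketch of the passthrough idea (not the actual sidecar code): a reverse proxy that rewrites the `out` bucket to a job-scoped prefix inside the real backing bucket and re-signs each request with SigV4 via `github.com/smartystreets/go-aws-auth` (the dependency this PR adds). The bucket name, prefix, backend address, and credentials below are placeholders.

```go
// Minimal sketch of the raw_s3_out passthrough: rewrite the "out" bucket to a
// job-scoped prefix in the real backing bucket, drop the client's signature,
// and re-sign the request before forwarding it to the backing store.
package main

import (
	"net/http"
	"net/http/httputil"
	"net/url"
	"strings"

	awsauth "github.com/smartystreets/go-aws-auth"
)

func newPassthroughProxy(backend *url.URL, backingBucket, jobPrefix string, creds awsauth.Credentials) *httputil.ReverseProxy {
	return &httputil.ReverseProxy{
		Director: func(req *http.Request) {
			req.URL.Scheme = backend.Scheme
			req.URL.Host = backend.Host
			req.Host = backend.Host

			// Rewrite /out/<key> to /<backingBucket>/<jobPrefix>/<key>
			// (path-style addressing assumed on both sides).
			if strings.HasPrefix(req.URL.Path, "/out/") {
				key := strings.TrimPrefix(req.URL.Path, "/out/")
				req.URL.Path = "/" + backingBucket + "/" + jobPrefix + "/" + key
			}

			// The client signed for the gateway's host and bucket, so that
			// signature is now invalid; replace it with one for the backend.
			req.Header.Del("Authorization")
			awsauth.Sign4(req, creds)
		},
		// A ModifyResponse hook would apply the reverse rewrite to XML bodies
		// (e.g. ListObjects results) so clients still see the "out" bucket.
	}
}

func main() {
	backend, err := url.Parse("http://minio:9000") // placeholder backing store
	if err != nil {
		panic(err)
	}
	proxy := newPassthroughProxy(backend, "real-bucket", "job-1234", awsauth.Credentials{
		AccessKeyID:     "ACCESS_KEY",
		SecretAccessKey: "SECRET_KEY",
		SecurityToken:   "SESSION_TOKEN", // only needed for temporary credentials
	})
	if err := http.ListenAndServe(":9001", proxy); err != nil {
		panic(err)
	}
}
```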
5 changes: 5 additions & 0 deletions examples/spark/s3-out/build-and-push.sh
@@ -0,0 +1,5 @@
#!/bin/bash
set -xeuo pipefail
export IMAGE=quay.io/lukemarsden/spark_s3_demo:v0.0.13
docker buildx build -t $IMAGE .
docker push $IMAGE
29 changes: 29 additions & 0 deletions examples/spark/s3-out/s3-spark.json
@@ -0,0 +1,29 @@
{
"pipeline": {
"name": "spark_s3_demo"
},
"metadata": {
"annotations": {
"raw_s3_out": "true"
}
},
"input": {
"pfs": {
"glob": "/",
"repo": "poke_s3",
"name": "poke_s3"
}
},
"transform": {
"cmd": [
"spark-submit",
"--executor-memory=15g",
"--driver-memory=15g",
"--jars", "hadoop-aws-3.3.3.jar,aws-java-sdk-bundle-1.12.264.jar",
"spark.py"
],
"image": "quay.io/lukemarsden/spark_s3_demo:v0.0.13",
"working_dir": "/home/jovyan"
},
"s3_out": true
}
75 changes: 75 additions & 0 deletions examples/spark/s3-out/spark.py
@@ -0,0 +1,75 @@
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.context import SparkContext
from pyspark import SparkConf
import time
import os

conf = SparkConf()
minio = False
> Review comment (PR author): could strip this out

if minio:
    conf.set('spark.hadoop.fs.s3a.endpoint', "http://localhost:9000")
else:
    # conf.set('spark.hadoop.fs.s3a.endpoint', "http://192.168.49.2:30600")
    endpoint = os.getenv('S3_ENDPOINT')
    conf.set('spark.hadoop.fs.s3a.endpoint', endpoint)
    print(f"endpoint is {endpoint}")
    # conf.set('spark.hadoop.fs.s3a.endpoint', "http://localhost:30600")

conf.set('spark.hadoop.fs.s3a.impl', "org.apache.hadoop.fs.s3a.S3AFileSystem")

# XXX I don't think the following line actually turns on the magic committer. What else needs to happen?
# conf.set('spark.hadoop.fs.s3a.committer.name', 'magic')
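# Likely answer (untested here): the magic committer also needs
# fs.s3a.committer.magic.enabled=true plus the Spark cloud-committer bindings
# from the spark-hadoop-cloud module, i.e.
#   spark.sql.sources.commitProtocolClass=org.apache.spark.internal.io.cloud.PathOutputCommitProtocol
#   spark.sql.parquet.output.committer.class=org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter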

if minio:
    conf.set('spark.hadoop.fs.s3a.access.key', 'admin')
    conf.set('spark.hadoop.fs.s3a.secret.key', 'password')
else:
    conf.set('spark.hadoop.fs.s3a.access.key', "anything_will_do")
    conf.set('spark.hadoop.fs.s3a.secret.key', "anything_will_do")

conf.set('spark.hadoop.fs.s3a.path.style.access', 'true')
conf.set('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
conf.set("spark.hadoop.fs.s3a.change.detection.mode", 'none')
conf.set("spark.hadoop.fs.s3a.change.detection.version.required", 'false')

sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# sc.setLogLevel("DEBUG")
sc.setSystemProperty("com.amazonaws.services.s3.disablePutObjectMD5Validation", "true")
> Review comment (PR author): probably don't need this any more

# confirm config is applied to this session
spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()
sc = spark.sparkContext
conf = sc.getConf()
print(sc.getConf().getAll())

# create some example data
# big = "INFINITEIMPROBABILITY"*1024*100
big = "INFINITEIMPROBABILITY"*1024*100
zs = [ Row(a=big, b=big,) for _ in range(1000) ]
df = spark.createDataFrame(zs)
df.explain()
# df.show()
df.repartition(200)
df.explain()

repo = "spark-s3g-demo2"
branch = "master"

path = "example-data-24"
if minio:
    url = f"s3a://foo/{path}"
else:
    url = f"s3a://out/{path}"

print("Starting write...")
(df.coalesce(1)
.write
# .option("fs.s3a.committer.name", "magic")
.format("parquet")
.mode("overwrite")
.save(url))
print("Finished write!")

df.explain()
74 changes: 74 additions & 0 deletions examples/spark/s3gateway/spark.py
@@ -0,0 +1,74 @@
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.context import SparkContext
from pyspark import SparkConf
import time
import os
import python_pachyderm

conf = SparkConf()
minio = False
if minio:
    conf.set('spark.hadoop.fs.s3a.endpoint', "http://localhost:9000")
else:
    conf.set('spark.hadoop.fs.s3a.endpoint', "http://192.168.49.2:30600")
    # conf.set('spark.hadoop.fs.s3a.endpoint', "http://localhost:30600")

conf.set('spark.hadoop.fs.s3a.impl', "org.apache.hadoop.fs.s3a.S3AFileSystem")

# XXX I don't think the following line actually turns on the magic committer. What else needs to happen?
# conf.set('spark.hadoop.fs.s3a.committer.name', 'magic')

if minio:
    conf.set('spark.hadoop.fs.s3a.access.key', 'admin')
    conf.set('spark.hadoop.fs.s3a.secret.key', 'password')
else:
    conf.set('spark.hadoop.fs.s3a.access.key', 'anything_matching')
    conf.set('spark.hadoop.fs.s3a.secret.key', 'anything_matching')

conf.set('spark.hadoop.fs.s3a.path.style.access', 'true')
conf.set('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false')
conf.set("spark.hadoop.fs.s3a.change.detection.mode", 'none')
conf.set("spark.hadoop.fs.s3a.change.detection.version.required", 'false')

sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# sc.setLogLevel("DEBUG")
sc.setSystemProperty("com.amazonaws.services.s3.disablePutObjectMD5Validation", "true")

# confirm config is applied to this session
spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()
sc = spark.sparkContext
conf = sc.getConf()
print(sc.getConf().getAll())

# create some example data
# big = "INFINITEIMPROBABILITY"*1024*100
big = "INFINITEIMPROBABILITY"*1024*100
zs = [ Row(a=big, b=big,) for _ in range(1000) ]
df = spark.createDataFrame(zs)
df.explain()
# df.show()
df.repartition(200)
df.explain()

repo = "spark-s3g-demo2"
branch = "master"

client = python_pachyderm.Client()

with client.commit(repo, branch) as commit:
    print(f"Opening commit {commit} for spark job")
    path = "example-data-24"
    if minio:
        url = f"s3a://foo/{path}"
    else:
        url = f"s3a://{branch}.{repo}/{path}"
    (df.coalesce(1)
        .write
        # .option("fs.s3a.committer.name", "magic")
        .format("parquet")
        .mode("overwrite")
        .save(url))
    df.explain()
    print(f"Closing {commit}")
1 change: 1 addition & 0 deletions go.mod
@@ -240,6 +240,7 @@ require (
github.com/segmentio/backo-go v0.0.0-20160424052352-204274ad699c // indirect
github.com/shopspring/decimal v1.2.0 // indirect
github.com/smartystreets/assertions v1.0.1 // indirect
github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9
github.com/snowflakedb/gosnowflake v1.6.11
github.com/soheilhy/cmux v0.1.5 // indirect
github.com/spf13/cast v1.3.1 // indirect
2 changes: 2 additions & 0 deletions go.sum
@@ -1328,6 +1328,8 @@ github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w=
github.com/smartystreets/assertions v1.0.1/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM=
github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9 h1:hp2CYQUINdZMHdvTdXtPOY2ainKl4IoMcpAXEf2xj3Q=
github.com/smartystreets/go-aws-auth v0.0.0-20180515143844-0c1422d1fdb9/go.mod h1:SnhjPscd9TpLiy1LpzGSKh3bXCfxxXuqd9xmQJy3slM=
github.com/smartystreets/goconvey v0.0.0-20190330032615-68dc04aab96a/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=