[WIP][DO NOT MERGE] Draft implementation of the new PySpark API for support of both Spark Classic and Spark Connect #814

Draft · wants to merge 16 commits into base: sparkConnect
7 changes: 6 additions & 1 deletion .github/workflows/codeql.yml
@@ -13,11 +13,16 @@ name: "CodeQL"

on:
  push:
    branches: [ main, 0.4.0 ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ main, 0.4.0 ]
    paths-ignore:
      - '**/*.md'
      - '**/*.txt'
  schedule:
    - cron: '22 3 * * 5'

jobs:
analyze:
27 changes: 27 additions & 0 deletions .gitignore
@@ -23,3 +23,30 @@ python/docs/_build/_doctrees
**/python/build/*
**/assembly/.classpath
**/.DS_Store

# Python stuff
.env
.venv

# Sphinx _build
**/_build

# Helix stuff
.helix

# Emacs stuff
.dir-locals.el

# JDTLS stuff
.package
.classpath
.project
.settings
.factorypath

# Metals LSP
.metals
.bloop

# Hadoop & Spark binaries
spark-*
13 changes: 13 additions & 0 deletions buf.gen.yaml
@@ -0,0 +1,13 @@
version: v1
plugins:
# Building the Java classes
- plugin: buf.build/protocolbuffers/java:v25.3
out: spark/client/src/main/java
# Building the Python classes and the mypy stub interfaces.
- plugin: buf.build/protocolbuffers/python:v25.3
out: python/zingg_v2/proto
- plugin: buf.build/grpc/python:v1.62.0
out: python/zingg_v2/proto
- plugin: buf.build/community/nipunn1313-mypy:v3.5.0
out: python/zingg_v2/proto

3 changes: 3 additions & 0 deletions buf.work.yaml
@@ -0,0 +1,3 @@
version: v1
directories:
- protobuf
@@ -154,4 +154,4 @@ protected Pipe<D,R,C> getOutputPipe() {
}

protected abstract Pipe setSaveModeOnPipe(Pipe<D,R,C> p);
-}
+}
25 changes: 23 additions & 2 deletions pom.xml
@@ -48,7 +48,7 @@
<profile>
<id>spark-3.5</id>
<activation>
-<activeByDefault>true</activeByDefault>
+<activeByDefault>false</activeByDefault>
<property>
<name>spark</name>
<value>3.5</value>
@@ -62,6 +62,23 @@
<graphframes.version>0.8.3-spark3.5-s_2.12</graphframes.version>
</properties>
</profile>
<profile>
<id>spark-4.0</id>
<activation>
<activeByDefault>true</activeByDefault>
<property>
<name>spark</name>
<value>4.0</value>
</property>
</activation>
<properties>
<spark.version>4.0.0-SNAPSHOT</spark.version>
<scala.version>2.13.13</scala.version>
<spark.binary.version>4.0</spark.binary.version>
<scala.binary.version>2.13</scala.binary.version>
<graphframes.version>0.8.3-spark3.5-s_2.13</graphframes.version>
</properties>
</profile>
</profiles>
<properties>
<zingg.version>0.4.0</zingg.version>
@@ -89,7 +106,11 @@
<repository>
<id>SparkPackagesRepo</id>
<url>https://repos.spark-packages.org/</url>
</repository>
+<repository>
+<id>Apache Snapshots</id>
+<url>https://repository.apache.org/snapshots/</url>
+</repository>
</repositories>

<dependencies>
9 changes: 9 additions & 0 deletions protobuf/connect_plugins.proto
@@ -0,0 +1,9 @@
syntax = "proto3";

option java_multiple_files = true;
option java_package = "zingg.spark.connect.proto";

message SubmitZinggJob {
string args = 1;
string options = 2;
}
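
The message carries the job arguments and client options as serialized strings. A minimal sketch of how a Spark Connect client could wrap SubmitZinggJob in a generic command extension — the import path zingg_v2.proto.connect_plugins_pb2 follows from buf.gen.yaml above, but the client code itself is an assumption, not the PR's implementation:

from google.protobuf import any_pb2
import pyspark.sql.connect.proto as spark_proto
from pyspark.sql.connect.session import SparkSession
from zingg_v2.proto.connect_plugins_pb2 import SubmitZinggJob

spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

# Pack the plugin message into the generic `extension` slot of a Command.
job = SubmitZinggJob(args='{"phase": "peekModel"}', options="{}")
extension = any_pb2.Any()
extension.Pack(job)
command = spark_proto.Command()
command.extension.CopyFrom(extension)

# The server-side command plugin registered for this message type unpacks
# SubmitZinggJob and runs the corresponding Zingg job.
spark.client.execute_command(command)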
2 changes: 2 additions & 0 deletions python/docs/Makefile
@@ -17,4 +17,6 @@ help:
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
# Set ZINGG_DRY_RUN only for the sphinx-build invocation: each recipe line
# runs in its own shell, so a separate `export` line would never reach it.
@ZINGG_DRY_RUN=1 $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
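
The flag presumably lets the package be imported without a live Spark backend while Sphinx builds the docs. A hypothetical sketch of how zingg_v2 might honor it — only the variable name comes from the Makefile; the guard itself is an assumption:

import os

# Hypothetical guard (not from the PR): skip Spark startup while Sphinx
# imports the modules for autodoc with ZINGG_DRY_RUN=1 in the environment.
DRY_RUN = os.environ.get("ZINGG_DRY_RUN", "0") == "1"

def get_or_create_session():
    if DRY_RUN:
        # Docs builds only need the modules to import cleanly.
        return None
    from pyspark.sql import SparkSession
    return SparkSession.builder.getOrCreate()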
21 changes: 21 additions & 0 deletions python/pyproject.toml
@@ -0,0 +1,21 @@
[build-system]
requires = ["setuptools >= 61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "zingg"
dynamic = ["version"]
dependencies = [
"pandas",
"seaborn",
"matplotlib",
"sphinx",
"sphinx-rtd-theme",
"pyspark>=3.5",
"pydantic",
]
readme = "README.md"
requires-python = ">=3.11"

[tool.ruff]
line-length = 110
3 changes: 2 additions & 1 deletion python/requirements.txt
@@ -3,4 +3,5 @@ seaborn
matplotlib
sphinx
sphinx-rtd-theme
pyspark
pyspark[connect]>=3.5
pydantic
11 changes: 11 additions & 0 deletions python/test_spark_connect.py
@@ -0,0 +1,11 @@
from zingg_v2.client import Zingg, Arguments, ClientOptions
from pyspark.sql.connect.session import SparkSession


if __name__ == "__main__":
    # Connect to a Spark Connect server instead of a local JVM-backed session.
    spark = SparkSession.builder.remote("sc://localhost").getOrCreate()
    # Connect sessions have no Py4J gateway, so this should print False.
    print(hasattr(spark, "_jvm"))
    opts = ClientOptions(None)
    args = Arguments.createArgumentsFromJSON(fileName="../examples/febrl/config.json", phase="peekModel")
    zingg = Zingg(args=args, options=opts)
    zingg.execute()
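
The hasattr(spark, "_jvm") probe hints at how a unified API can branch between the two backends: Spark Classic sessions expose a Py4J JVM gateway, Spark Connect sessions do not. A small sketch of that dispatch, with an illustrative helper name:

def is_spark_connect(spark) -> bool:
    # Classic sessions carry the JVM handle in `_jvm`; Connect sessions
    # have none, so the attribute check cheaply selects a code path.
    return not hasattr(spark, "_jvm")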