Improve the scalability of the join between the LHS and GroupBys by breaking up the join #621

Open · wants to merge 5 commits into base: main
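Context for the change: the final join previously folded every join part table into one ever-growing Spark plan, which, per the in-diff comment, can stall the app when a Join involves too many rightParts. This PR breaks the chain by persisting every Nth intermediate result (a "join break") and releasing the previous break. A minimal standalone sketch of the pattern, assuming plain DataFrame.join in place of Chronon's joinWithLeft; the names joinWithBreaks, joinKeys, and breakEvery are hypothetical:

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.storage.StorageLevel

    // Sketch only: fold a sequence of right-hand DataFrames onto the left side,
    // persisting every `breakEvery`-th intermediate result ("join break") so the
    // accumulated plan stays shallow and the superseded break can be released.
    def joinWithBreaks(left: DataFrame,
                       rights: Seq[DataFrame],
                       joinKeys: Seq[String],
                       breakEvery: Int): DataFrame = {
      var previous: Option[DataFrame] = None
      rights.zipWithIndex.foldLeft(left) {
        case (acc, (right, i)) =>
          val next = acc.join(right, joinKeys, "left")
          // Persist on the break boundary, except after the very last join.
          if ((i + 1) % breakEvery == 0 && i != rights.size - 1) {
            val persisted = next.persist(StorageLevel.MEMORY_AND_DISK)
            previous.foreach(_.unpersist()) // the superseded break is no longer needed
            previous = Some(persisted)
            persisted
          } else {
            next
          }
      }
    }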
15 changes: 13 additions & 2 deletions spark/src/main/scala/ai/chronon/spark/Join.scala
@@ -261,9 +261,20 @@ class Join(joinConf: api.Join,
     // combine bootstrap table and join part tables
     // sequentially join bootstrap table and each join part table. some column may exist both on left and right because
     // a bootstrap source can cover a partial date range. we combine the columns using coalesce-rule
-    rightResults
+    var previous: Option[DataFrame] = None
+    rightResults.zipWithIndex
       .foldLeft(bootstrapDf) {
-        case (partialDf, (rightPart, rightDf)) => joinWithLeft(partialDf, rightDf, rightPart)
+        case (partialDf, ((rightPart, rightDf), i)) =>
+          val next = joinWithLeft(partialDf, rightDf, rightPart)
+          // Join breaks prevent the Spark app from stalling on a Join that involves too many rightParts.
+          if (((i + 1) % tableUtils.finalJoinParallelism) == 0 && (i != (rightResults.size - 1))) {
+            val persisted = tableUtils.addJoinBreak(next)
+            previous.foreach(_.unpersist())
+            previous = Some(persisted)
+            persisted
+          } else {
+            next
+          }
       }
       // drop all processing metadata columns
       .drop(Constants.MatchedHashes, Constants.TimePartitionColumn)
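The break interval comes from a new session conf added in TableUtils (below). As a hypothetical tuning sketch, it could be set when the session is built; the key and its default of 8 are taken from this PR:

    import org.apache.spark.sql.SparkSession

    // Hypothetical tuning example: break the final join every 4 join parts
    // instead of the default 8, for Joins with very many GroupBys.
    val spark = SparkSession
      .builder()
      .appName("chronon-join")
      .config("spark.chronon.join.final_join_parallelism", "4")
      .getOrCreate()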
4 changes: 4 additions & 0 deletions spark/src/main/scala/ai/chronon/spark/TableUtils.scala
@@ -68,6 +68,7 @@ case class TableUtils(sparkSession: SparkSession) {
 
   val joinPartParallelism: Int = sparkSession.conf.get("spark.chronon.join.part.parallelism", "1").toInt
   val aggregationParallelism: Int = sparkSession.conf.get("spark.chronon.group_by.parallelism", "1000").toInt
+  val finalJoinParallelism: Int = sparkSession.conf.get("spark.chronon.join.final_join_parallelism", "8").toInt
   val maxWait: Int = sparkSession.conf.get("spark.chronon.wait.hours", "48").toInt
 
   sparkSession.sparkContext.setLogLevel("ERROR")
@@ -324,6 +325,9 @@
     df
   }
 
+  def addJoinBreak(dataFrame: DataFrame): DataFrame =
+    dataFrame.persist(cacheLevel.getOrElse(StorageLevel.MEMORY_AND_DISK))
+
   def insertUnPartitioned(df: DataFrame,
                           tableName: String,
                           tableProperties: Map[String, String] = null,
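addJoinBreak is a persist at the configured storage level: once the intermediate result is materialized, later joins read it from the cache instead of re-evaluating the deep upstream plan. A small spark-shell sketch of the same idea (the local session and toy frames are illustrative, not from the PR):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.storage.StorageLevel

    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    import spark.implicits._

    val left  = Seq((1, "a"), (2, "b")).toDF("id", "lhs")
    val right = Seq((1, "x"), (2, "y")).toDF("id", "rhs")

    // A "join break": persist the intermediate join result so downstream joins
    // reuse the cached rows instead of recomputing the whole lineage.
    val broken = left.join(right, Seq("id")).persist(StorageLevel.MEMORY_AND_DISK)
    broken.count()     // force materialization of the cache
    // ... further joins against `broken` go here ...
    broken.unpersist() // release executor memory once the break is superseded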