Skip to content

Commit

Permalink
feat: Add a job to detect new raw schema fields to add to safe schema manual models
Browse files Browse the repository at this point in the history
  • Loading branch information
AbdouSeck committed Oct 25, 2022
1 parent 0467898 commit 96d2cae
Show file tree
Hide file tree
Showing 3 changed files with 181 additions and 0 deletions.
80 changes: 80 additions & 0 deletions dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy
@@ -0,0 +1,80 @@
package analytics

import static org.edx.jenkins.dsl.AnalyticsConstants.common_log_rotator
import static org.edx.jenkins.dsl.AnalyticsConstants.common_publishers
import static org.edx.jenkins.dsl.AnalyticsConstants.common_triggers
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm_parameters


/**
 * Jenkins Job-DSL definition for the "detect-new-dbt-manual-models-fields" job.
 *
 * The job compiles the dbt "manual" models in warehouse-transforms and then runs
 * a detection script from analytics-tools (see
 * dataeng/resources/detect-new-dbt-manual-models-fields.sh) to find raw-schema
 * columns that have not yet been added to the safe-schema manual models.
 */
class DetectNewDBTManualModelsFields {
    // Closure registered in createJobsNew.groovy's taskMap; invoked with the DSL
    // factory and the job's extra-vars map (allVars) to build the job definition.
    public static def job = { dslFactory, allVars ->
        dslFactory.job("detect-new-dbt-manual-models-fields") {
            // If the DISABLED is set to true by the job's extra vars, then disable the job.
            disabled(allVars.get('DISABLED', false))
            description("This job detects new columns in tables in raw schemas that have yet to be manually added to safe schema models.")
            // Set a definite log rotation, if defined.
            logRotator common_log_rotator(allVars)
            // Set the analytics-secure parameters for repo and branch from the common helpers
            parameters secure_scm_parameters(allVars)
            // Add parameters to use analytics-tools and warehouse-transforms
            parameters {
                stringParam('ANALYTICS_TOOLS_URL', allVars.get('ANALYTICS_TOOLS_URL'), 'URL for the analytics tools repo.')
                stringParam('ANALYTICS_TOOLS_BRANCH', allVars.get('ANALYTICS_TOOLS_BRANCH'), 'Branch of analytics tools repo to use.')
                stringParam('WAREHOUSE_TRANSFORMS_URL', allVars.get('WAREHOUSE_TRANSFORMS_URL'), 'URL for the warehouse-transforms repository.')
                stringParam('WAREHOUSE_TRANSFORMS_BRANCH', allVars.get('WAREHOUSE_TRANSFORMS_BRANCH'), 'Branch of warehouse-transforms repository to use.')
                stringParam('DBT_TARGET', allVars.get('DBT_TARGET'), 'DBT target from profiles.yml in analytics-secure.')
                stringParam('DBT_PROFILE', allVars.get('DBT_PROFILE'), 'DBT profile from profiles.yml in analytics-secure.')
                stringParam('DBT_PROJECT_PATH', allVars.get('DBT_PROJECT_PATH'), 'Path in warehouse-transforms to use as the dbt project, relative to "projects" (usually automated/applications or reporting).')
                stringParam('NOTIFY', allVars.get('NOTIFY','$PAGER_NOTIFY'), 'Space separated list of emails to send notifications to.')
            }
            // Set the necessary VAULT kv paths of credentials as environment variables
            // (read by the shell step to pull the JIRA webhook and Snowflake
            // automation-user secrets out of Vault).
            environmentVariables {
                env('JIRA_WEBHOOK_VAULT_KV_PATH', allVars.get('JIRA_WEBHOOK_VAULT_KV_PATH'))
                env('JIRA_WEBHOOK_VAULT_KV_VERSION', allVars.get('JIRA_WEBHOOK_VAULT_KV_VERSION'))
                env('AUTOMATION_TASK_USER_VAULT_KV_PATH', allVars.get('AUTOMATION_TASK_USER_VAULT_KV_PATH'))
                env('AUTOMATION_TASK_USER_VAULT_KV_VERSION', allVars.get('AUTOMATION_TASK_USER_VAULT_KV_VERSION'))
            }
            // SCM settings for analytics-secure and analytics-tools.
            // secure_scm(allVars) contributes the analytics-secure checkout; the
            // appended closure adds warehouse-transforms and analytics-tools.
            multiscm secure_scm(allVars) << {
                git {
                    remote {
                        url('$WAREHOUSE_TRANSFORMS_URL')
                        branch('$WAREHOUSE_TRANSFORMS_BRANCH')
                    }
                    extensions {
                        relativeTargetDirectory('warehouse-transforms')
                        pruneBranches()
                        cleanAfterCheckout()
                    }
                }
                git {
                    remote {
                        url('$ANALYTICS_TOOLS_URL')
                        branch('$ANALYTICS_TOOLS_BRANCH')
                        // NOTE(review): '1' is presumably the Jenkins credentials ID
                        // used for this git remote — confirm it matches the controller's
                        // configured credential.
                        credentials('1')
                    }
                    extensions {
                        relativeTargetDirectory('analytics-tools')
                        pruneBranches()
                        cleanAfterCheckout()
                    }
                }
            }
            wrappers {
                colorizeOutput('xterm')
                timestamps()
                // Expose the Vault AppRole credential pair to the shell step so it
                // can log in and read the kv paths set above.
                credentialsBinding {
                    usernamePassword('ANALYTICS_VAULT_ROLE_ID', 'ANALYTICS_VAULT_SECRET_ID', 'analytics-vault')
                }
            }
            // Set the trigger using cron
            triggers common_triggers(allVars)
            // Notifications on build failures
            publishers common_publishers(allVars)
            steps {
                shell(dslFactory.readFileFromWorkspace('dataeng/resources/detect-new-dbt-manual-models-fields.sh'))
            }
        }
    }
}
2 changes: 2 additions & 0 deletions dataeng/jobs/createJobsNew.groovy
Expand Up @@ -2,6 +2,7 @@ import static analytics.DBTDocs.job as DBTDocsJob
import static analytics.DBTRun.job as DBTRunJob
import static analytics.DBTSourceFreshness.job as DBTSourceFreshnessJob
import static analytics.DeployCluster.job as DeployClusterJob
import static analytics.DetectNewDBTManualModelsFields.job as DetectNewDBTManualModelsFields
import static analytics.EmrCostReporter.job as EmrCostReporterJob
import static analytics.ModelTransfers.job as ModelTransfersJob
import static analytics.RetirementJobEdxTriggers.job as RetirementJobEdxTriggersJob
Expand Down Expand Up @@ -45,6 +46,7 @@ def taskMap = [
DBT_RUN_JOB: DBTRunJob,
DBT_SOURCE_FRESHNESS_JOB: DBTSourceFreshnessJob,
DEPLOY_CLUSTER_JOB: DeployClusterJob,
DETECT_NEW_DBT_MANUAL_MODELS_FIELDS_JOB: DetectNewDBTManualModelsFields,
EMR_COST_REPORTER_JOB: EmrCostReporterJob,
MODEL_TRANSFERS_JOB: ModelTransfersJob,
RETIREMENT_JOB_EDX_TRIGGERS_JOB: RetirementJobEdxTriggersJob,
Expand Down
99 changes: 99 additions & 0 deletions dataeng/resources/detect-new-dbt-manual-models-fields.sh
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
# Compile the dbt "manual" models in warehouse-transforms, then run
# analytics-tools' detect_new_raw_columns.py against the compiled target
# to flag raw-schema columns that have not yet been added to the safe-schema
# manual models. Expects the Jenkins job to provide: WORKSPACE, DBT_PROFILE,
# DBT_TARGET, DBT_PROJECT_PATH, the *_VAULT_KV_* variables, and the
# ANALYTICS_VAULT_ROLE_ID/ANALYTICS_VAULT_SECRET_ID credential binding.
set -ex

# Global variables
DBT_PROFILE_ARGS="--profiles-dir ${WORKSPACE}/analytics-secure/warehouse-transforms/ --profile ${DBT_PROFILE} --target ${DBT_TARGET}"
DBT_TARGET_PATH="${WORKSPACE}/warehouse-transforms/projects/${DBT_PROJECT_PATH}/target"

# Set up a virtual environment for warehouse-transforms
PYTHON38_VENV="py38_venv_warehouse_transforms"
virtualenv --python=python3.8 --clear "${PYTHON38_VENV}"
source "${PYTHON38_VENV}/bin/activate"
pip install -U pip
pip install -r "${WORKSPACE}/warehouse-transforms/requirements.txt"

# Go into the configured dbt project and compile the manual models so their
# compiled SQL lands in ${DBT_TARGET_PATH} for the detection script to read.
# ${DBT_PROFILE_ARGS} is deliberately unquoted: it holds multiple CLI arguments
# that must undergo word splitting.
cd "${WORKSPACE}/warehouse-transforms/projects/${DBT_PROJECT_PATH}"
dbt clean ${DBT_PROFILE_ARGS}
dbt deps ${DBT_PROFILE_ARGS}
dbt compile ${DBT_PROFILE_ARGS} --select tag:manual

# Deactivate the virtual environment, so we can create and activate another one
deactivate

# Setup a virtual environment for analytics-tools
PYTHON38_VENV="py38_venv_analytics_tools"
virtualenv --python=python3.8 --clear "${PYTHON38_VENV}"
source "${PYTHON38_VENV}/bin/activate"

# Go into analytics-tools and install the dependencies
cd "${WORKSPACE}/analytics-tools/snowflake"
make requirements

# Create a function to clean up the credential files, and trap EXIT with it,
# so the on-disk private key material is removed even if the script fails.
function clean_up_files() {
    rm -rf .snowflake_private_key .snowflake_private_key_passphrase
}
trap clean_up_files EXIT

# Fetch credentials from vault.
# Disable xtrace for the remainder of the secret handling: under `set -x`,
# `VAR=$(vault kv get ...)` echoes the resolved secret value into the build
# log (`+ VAR=<secret>`), and the final python invocation would print the
# JIRA webhook secret on its traced command line.
set +x

# Retrieve a vault token corresponding to the jenkins AppRole. The token is then stored in the VAULT_TOKEN variable
# which is implicitly used by subsequent vault commands within this script.
# Instructions followed: https://learn.hashicorp.com/tutorials/vault/approle#step-4-login-with-roleid-secretid
export VAULT_TOKEN=$(vault write -field=token auth/approle/login \
    role_id="${ANALYTICS_VAULT_ROLE_ID}" \
    secret_id="${ANALYTICS_VAULT_SECRET_ID}"
)

# JIRA webhook URL and secret string from vault
JIRA_WEBHOOK_URL=$(
    vault kv get \
        -version="${JIRA_WEBHOOK_VAULT_KV_VERSION}" \
        -field=JIRA_WEBHOOK_URL \
        "${JIRA_WEBHOOK_VAULT_KV_PATH}" \
)
JIRA_WEBHOOK_SECRET=$(
    vault kv get \
        -version="${JIRA_WEBHOOK_VAULT_KV_VERSION}" \
        -field=JIRA_WEBHOOK_SECRET \
        "${JIRA_WEBHOOK_VAULT_KV_PATH}" \
)

# Snowflake credentials from vault
SNOWFLAKE_ACCOUNT=$(
    vault kv get \
        -version="${AUTOMATION_TASK_USER_VAULT_KV_VERSION}" \
        -field=account \
        "${AUTOMATION_TASK_USER_VAULT_KV_PATH}" \
)

SNOWFLAKE_USER=$(
    vault kv get \
        -version="${AUTOMATION_TASK_USER_VAULT_KV_VERSION}" \
        -field=user \
        "${AUTOMATION_TASK_USER_VAULT_KV_PATH}" \
)
# The detect_new_raw_columns.py script, much like all other scripts that connect to Snowflake,
# expects the private key and the private key passphrase to be in files.
# As a result, SNOWFLAKE_PRIVATE_KEY and SNOWFLAKE_PRIVATE_KEY_PASSPHRASE are stored in files
# (removed by the EXIT trap above).
vault kv get \
    -version="${AUTOMATION_TASK_USER_VAULT_KV_VERSION}" \
    -field=private_key \
    "${AUTOMATION_TASK_USER_VAULT_KV_PATH}" > .snowflake_private_key

vault kv get \
    -version="${AUTOMATION_TASK_USER_VAULT_KV_VERSION}" \
    -field=private_key_passphrase \
    "${AUTOMATION_TASK_USER_VAULT_KV_PATH}" > .snowflake_private_key_passphrase

# Invoke the script to detect new fields that need to be added manually.
# Still under `set +x`: the JIRA webhook URL/secret appear on this command line.
python detect_new_raw_columns.py "${DBT_TARGET_PATH}" \
    --user "${SNOWFLAKE_USER}" --account "${SNOWFLAKE_ACCOUNT}" \
    --key-path .snowflake_private_key --passphrase-path .snowflake_private_key_passphrase \
    --jira-webhook-url "${JIRA_WEBHOOK_URL}" \
    --jira-webhook-secret "${JIRA_WEBHOOK_SECRET}"

# Re-enable command tracing now that no secrets remain on any command line.
set -x

0 comments on commit 96d2cae

Please sign in to comment.