Skip to content

Commit

Permalink
feat: Add a job to detect new raw schema fields to add to safe schema…
Browse files Browse the repository at this point in the history
… manual models
  • Loading branch information
AbdouSeck committed Oct 4, 2022
1 parent db88007 commit a98a1f8
Show file tree
Hide file tree
Showing 3 changed files with 145 additions and 0 deletions.
64 changes: 64 additions & 0 deletions dataeng/jobs/analytics/DetectNewDBTManualModelsFields.groovy
@@ -0,0 +1,64 @@
package analytics

import static org.edx.jenkins.dsl.AnalyticsConstants.common_log_rotator
import static org.edx.jenkins.dsl.AnalyticsConstants.common_publishers
import static org.edx.jenkins.dsl.AnalyticsConstants.common_triggers
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm
import static org.edx.jenkins.dsl.AnalyticsConstants.secure_scm_parameters


// Defines the Jenkins job that detects columns newly added to raw Snowflake
// schemas which have not yet been manually added to the safe-schema dbt models.
// The actual detection logic lives in the shell script checked out from
// analytics-tools (detect-new-dbt-manual-models-fields.sh).
class DetectNewDBTManualModelsFields {
    // Closure invoked by the seed job with the DSL factory and the job's extra vars map.
    public static def job = { dslFactory, allVars ->
        dslFactory.job("detect-new-dbt-manual-models-fields") {
            // If the DISABLED is set to true by the job's extra vars, then disable the job.
            disabled(allVars.get('DISABLED', false))
            description("This job detects new columns in tables in raw schemas that have yet to be manually added to safe schema models.")
            // Set a definite log rotation, if defined.
            logRotator common_log_rotator(allVars)
            // Set the analytics-secure parameters for repo and branch from the common helpers
            parameters secure_scm_parameters(allVars)
            // Add the analytics-tools parameters for repo and branch information
            parameters {
                stringParam('ANALYTICS_TOOLS_URL', allVars.get('ANALYTICS_TOOLS_URL'), 'URL for the analytics tools repo.')
                stringParam('ANALYTICS_TOOLS_BRANCH', allVars.get('ANALYTICS_TOOLS_BRANCH'), 'Branch of analytics tools repo to use.')
                stringParam('NOTIFY', allVars.get('NOTIFY','$PAGER_NOTIFY'), 'Space separated list of emails to send notifications to.')
            }
            // Set the necessary VAULT kv paths of credentials as environment variables
            // so the shell step can fetch the JIRA webhook and Snowflake secrets.
            environmentVariables {
                env('JIRA_WEBHOOK_VAULT_KV_PATH', allVars.get('JIRA_WEBHOOK_VAULT_KV_PATH'))
                env('JIRA_WEBHOOK_VAULT_KV_VERSION', allVars.get('JIRA_WEBHOOK_VAULT_KV_VERSION'))
                env('AUTOMATION_TASK_VAULT_KV_PATH', allVars.get('AUTOMATION_TASK_VAULT_KV_PATH'))
                env('AUTOMATION_TASK_VAULT_KV_VERSION', allVars.get('AUTOMATION_TASK_VAULT_KV_VERSION'))
            }
            // SCM settings for analytics-secure and analytics-tools
            multiscm secure_scm(allVars) << {
                git {
                    remote {
                        url('$ANALYTICS_TOOLS_URL')
                        credentials('1')
                    }
                    // branch() belongs on the git context, not inside remote {}:
                    // the Job DSL RemoteContext only defines url/credentials/name/refspec.
                    branch('$ANALYTICS_TOOLS_BRANCH')
                    extensions {
                        relativeTargetDirectory('analytics-tools')
                        pruneBranches()
                        cleanAfterCheckout()
                    }
                }
            }
            wrappers {
                colorizeOutput('xterm')
                timestamps()
                // Bind the Vault AppRole credentials so the shell step can log in to Vault.
                credentialsBinding {
                    usernamePassword('ANALYTICS_VAULT_ROLE_ID', 'ANALYTICS_VAULT_SECRET_ID', 'analytics-vault')
                }
            }
            // Set the trigger using cron
            triggers common_triggers(allVars)
            // Notifications on build failures
            publishers common_publishers(allVars)
            steps {
                shell(dslFactory.readFileFromWorkspace('dataeng/resources/detect-new-dbt-manual-models-fields.sh'))
            }
        }
    }
}
2 changes: 2 additions & 0 deletions dataeng/jobs/createJobsNew.groovy
Expand Up @@ -2,6 +2,7 @@ import static analytics.DBTDocs.job as DBTDocsJob
import static analytics.DBTRun.job as DBTRunJob
import static analytics.DBTSourceFreshness.job as DBTSourceFreshnessJob
import static analytics.DeployCluster.job as DeployClusterJob
import static analytics.DetectNewDBTManualModelsFields.job as DetectNewDBTManualModelsFields
import static analytics.EmrCostReporter.job as EmrCostReporterJob
import static analytics.ModelTransfers.job as ModelTransfersJob
import static analytics.RetirementJobEdxTriggers.job as RetirementJobEdxTriggersJob
Expand Down Expand Up @@ -45,6 +46,7 @@ def taskMap = [
DBT_RUN_JOB: DBTRunJob,
DBT_SOURCE_FRESHNESS_JOB: DBTSourceFreshnessJob,
DEPLOY_CLUSTER_JOB: DeployClusterJob,
DETECT_NEW_DBT_MANUAL_MODELS_FIELDS_JOB: DetectNewDBTManualModelsFields,
EMR_COST_REPORTER_JOB: EmrCostReporterJob,
MODEL_TRANSFERS_JOB: ModelTransfersJob,
RETIREMENT_JOB_EDX_TRIGGERS_JOB: RetirementJobEdxTriggersJob,
Expand Down
79 changes: 79 additions & 0 deletions dataeng/resources/detect-new-dbt-manual-models-fields.sh
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Detect new columns in raw-schema tables that have not yet been manually added
# to the safe-schema dbt models, and report them via a JIRA webhook.
# Requires: WORKSPACE, ANALYTICS_VAULT_ROLE_ID/SECRET_ID (credentials binding),
# and the *_VAULT_KV_PATH/_VERSION env vars set by the Jenkins job definition.
set -ex

# Setup a virtual environment
PYTHON38_VENV="py38_venv"
virtualenv --python=python3.8 --clear "${PYTHON38_VENV}"
source "${PYTHON38_VENV}/bin/activate"

# Go into analytics-tools and install the dependencies
cd "${WORKSPACE}/analytics-tools/snowflake"
make requirements

# Make sure the on-disk private key material is removed no matter how this
# script exits: under `set -e`, any failure below would otherwise skip a
# trailing cleanup step and leave secrets sitting in the workspace.
trap 'rm -f .private_key_file .private_key_passphrase_file' EXIT

# Fetch credentials from vault
# Do not print commands in this function since they may contain secrets.
set +x

# Retrieve a vault token corresponding to the jenkins AppRole. The token is then stored in the VAULT_TOKEN variable
# which is implicitly used by subsequent vault commands within this script.
# Instructions followed: https://learn.hashicorp.com/tutorials/vault/approle#step-4-login-with-roleid-secretid
export VAULT_TOKEN=$(vault write -field=token auth/approle/login \
    role_id="${ANALYTICS_VAULT_ROLE_ID}" \
    secret_id="${ANALYTICS_VAULT_SECRET_ID}"
)

# JIRA webhook URL and secret string from vault
WEBHOOK_URL=$(
  vault kv get \
    -version="${JIRA_WEBHOOK_VAULT_KV_VERSION}" \
    -field=JIRA_WEBHOOK_URL \
    "${JIRA_WEBHOOK_VAULT_KV_PATH}" \
)
WEBHOOK_SECRET=$(
  vault kv get \
    -version="${JIRA_WEBHOOK_VAULT_KV_VERSION}" \
    -field=JIRA_WEBHOOK_SECRET \
    "${JIRA_WEBHOOK_VAULT_KV_PATH}" \
)

# Snowflake credentials from vault
SNOWFLAKE_ACCOUNT=$(
  vault kv get \
    -version="${AUTOMATION_TASK_VAULT_KV_VERSION}" \
    -field=account \
    "${AUTOMATION_TASK_VAULT_KV_PATH}" \
)

SNOWFLAKE_USER=$(
  vault kv get \
    -version="${AUTOMATION_TASK_VAULT_KV_VERSION}" \
    -field=user \
    "${AUTOMATION_TASK_VAULT_KV_PATH}" \
)
# The detect_new_raw_columns.py script, much like all other scripts that connect to Snowflake,
# expects the private key and the private key passphrase to be in files.
# As a result, SNOWFLAKE_PRIVATE_KEY and SNOWFLAKE_PRIVATE_KEY_PASSPHRASE are stored in files
# (deleted on exit by the trap registered above).
vault kv get \
    -version="${AUTOMATION_TASK_VAULT_KV_VERSION}" \
    -field=private_key \
    "${AUTOMATION_TASK_VAULT_KV_PATH}" > .private_key_file

vault kv get \
    -version="${AUTOMATION_TASK_VAULT_KV_VERSION}" \
    -field=private_key_passphrase \
    "${AUTOMATION_TASK_VAULT_KV_PATH}" > .private_key_passphrase_file
set -x

# The extra vars file for this job contains both field mappings and the necessary credentials for Snowflake and Jenkins.
# Therefore, the options to the script are read from the config file.
CONFIG_PATH="${WORKSPACE}/analytics-secure/job-configs/DETECT_NEW_DBT_MANUAL_MODELS_FIELDS_JOB_MAPPINGS.yaml"

# Invoke the script to detect new fields that need to be added manually
python detect_new_raw_columns.py "${CONFIG_PATH}" \
    --user "${SNOWFLAKE_USER}" --account "${SNOWFLAKE_ACCOUNT}" \
    --key-path .private_key_file --passphrase-path .private_key_passphrase_file \
    --jira-webhook-url "${WEBHOOK_URL}" \
    --jira-webhook-secret "${WEBHOOK_SECRET}"

0 comments on commit a98a1f8

Please sign in to comment.