feat: Support specifying an alternate BQ dataset_id for BQ tables (#203)
* feat: Prepend dataset ID in TF resource names for BQ tables

* feat: Support explicitly specifying the dataset_id for BQ tables

* feat: Refactored generate TF tests to set up and test multiple BQ dataset IDs
adlersantos committed Oct 8, 2021
1 parent d0425cd commit 9115e82
Showing 3 changed files with 152 additions and 41 deletions.
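The net effect of the change: a BigQuery table's Terraform resource name is now derived from both its dataset ID and its table ID. A minimal sketch of the naming rule, distilled from the customize_template_subs diff below (the helper name and sample values are illustrative, not part of the repo):

    # Sketch of the naming rule this commit introduces (illustrative only).
    def tf_resource_name_for_table(dataset_id: str, table_id: str) -> str:
        dataset_table = f"{dataset_id}_{table_id}"
        # Terraform resource names cannot start with a digit, so such tables
        # get a "bqt_" prefix.
        if table_id[0].isdigit():
            return f"bqt_{dataset_table}"
        return dataset_table

    assert tf_resource_name_for_table("covid19", "cases") == "covid19_cases"
    assert tf_resource_name_for_table("covid19", "2020_cases") == "bqt_covid19_2020_cases"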
12 changes: 9 additions & 3 deletions scripts/generate_terraform.py
@@ -119,7 +119,11 @@ def generate_dataset_tf(dataset_id: str, project_id: str, config: dict, env: str

contents = ""
for resource in config["resources"]:
-contents += tf_resource_contents(resource, {**resource, **subs})
+if resource.get("dataset_id"):
+    _subs = {**subs, **{"dataset_id": resource["dataset_id"]}}
+else:
+    _subs = subs
+contents += tf_resource_contents(resource, {**resource, **_subs})

create_file_in_dir_tree(
dataset_id, contents, f"{dataset_id}_dataset.tf", PROJECT_ROOT / f".{env}"
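The override works through plain dict merging: a resource-level dataset_id, when present, wins because later entries in a {**a, **b} merge take precedence. A short sketch of that precedence (all names and values hypothetical):

    subs = {"dataset_id": "default_dataset", "project_id": "my-project"}
    table = {"type": "bigquery_table", "table_id": "t", "dataset_id": "another_dataset"}
    no_override = {"type": "bigquery_table", "table_id": "u"}

    def merge(resource, subs):
        # Resource-level dataset_id overrides the dataset-level default.
        _subs = {**subs, **{"dataset_id": resource["dataset_id"]}} if resource.get("dataset_id") else subs
        return {**resource, **_subs}

    assert merge(table, subs)["dataset_id"] == "another_dataset"  # explicit wins
    assert merge(no_override, subs)["dataset_id"] == "default_dataset"  # default flows through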
@@ -204,13 +208,15 @@ def customize_template_subs(resource: dict, subs: dict) -> dict:
"uniform_bucket_level_access"
)
elif resource["type"] == "bigquery_table":
dataset_table = f"{subs['dataset_id']}_{resource['table_id']}"

# Terraform resource names cannot start with digits, but BigQuery allows
# table names that start with digits. We prepend `bqt_` to table names
# that don't comply with Terraform's naming rule.
if resource["table_id"][0].isdigit():
-subs["tf_resource_name"] = "bqt_" + resource["table_id"]
+subs["tf_resource_name"] = f"bqt_{dataset_table}"
else:
-subs["tf_resource_name"] = resource["table_id"]
+subs["tf_resource_name"] = dataset_table
return subs
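The reason for prepending the dataset ID: previously, two tables with the same table_id under different BQ datasets mapped to the same Terraform resource name. A tiny before/after comparison (the resource dicts are made up):

    def old_name(resource):
        return resource["table_id"]

    def new_name(resource):
        return f"{resource['dataset_id']}_{resource['table_id']}"

    a = {"dataset_id": "dataset_a", "table_id": "my_table"}
    b = {"dataset_id": "dataset_b", "table_id": "my_table"}
    assert old_name(a) == old_name(b)  # collided before this commit
    assert new_name(a) != new_name(b)  # unique after it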


4 changes: 2 additions & 2 deletions templates/terraform/google_bigquery_table.tf.jinja2
@@ -46,10 +46,10 @@ resource "google_bigquery_table" "{{ tf_resource_name }}" {
]
}

-output "bigquery_table-{{ table_id }}-table_id" {
+output "bigquery_table-{{ tf_resource_name }}-table_id" {
value = google_bigquery_table.{{ tf_resource_name }}.table_id
}

-output "bigquery_table-{{ table_id }}-id" {
+output "bigquery_table-{{ tf_resource_name }}-id" {
value = google_bigquery_table.{{ tf_resource_name }}.id
}
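Switching the output names from table_id to tf_resource_name keeps Terraform output names unique across datasets as well. A minimal render of the affected lines, assuming the jinja2 package (a reduced snippet, not the repo's full template):

    from jinja2 import Template

    snippet = Template(
        'output "bigquery_table-{{ tf_resource_name }}-table_id" {\n'
        "  value = google_bigquery_table.{{ tf_resource_name }}.table_id\n"
        "}"
    )
    print(snippet.render(tf_resource_name="another_dataset_another_table"))
    # output "bigquery_table-another_dataset_another_table-table_id" {
    #   value = google_bigquery_table.another_dataset_another_table.table_id
    # }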
177 changes: 141 additions & 36 deletions tests/scripts/test_generate_terraform.py
@@ -26,9 +26,10 @@
from scripts import generate_terraform

PROJECT_ROOT = generate_terraform.PROJECT_ROOT
-SAMPLE_YAML_PATHS = {
+FILE_PATHS = {
"dataset": PROJECT_ROOT / "samples" / "dataset.yaml",
"pipeline": PROJECT_ROOT / "samples" / "pipeline.yaml",
"license": PROJECT_ROOT / "templates" / "airflow" / "license_header.py.jinja2",
}

ENV_PATH = PROJECT_ROOT / ".test"
@@ -42,13 +43,19 @@ def dataset_path():
with tempfile.TemporaryDirectory(
dir=generate_terraform.DATASETS_PATH, suffix="_dataset"
) as dir_path:
-yield pathlib.Path(dir_path)
+try:
+    yield pathlib.Path(dir_path)
+finally:
+    shutil.rmtree(dir_path)


@pytest.fixture
def pipeline_path(dataset_path, suffix="_pipeline"):
with tempfile.TemporaryDirectory(dir=dataset_path, suffix=suffix) as dir_path:
-yield pathlib.Path(dir_path)
+try:
+    yield pathlib.Path(dir_path)
+finally:
+    shutil.rmtree(dir_path)


@pytest.fixture
@@ -109,6 +116,29 @@ def env() -> str:
return "test"


def set_dataset_ids_in_config_files(
dataset_path: pathlib.Path, pipeline_path: pathlib.Path
):
shutil.copyfile(FILE_PATHS["dataset"], dataset_path / "dataset.yaml")
shutil.copyfile(FILE_PATHS["pipeline"], pipeline_path / "pipeline.yaml")

dataset_config = yaml.load(dataset_path / "dataset.yaml")
dataset_config["dataset"]["name"] = dataset_path.name

for resource in dataset_config["resources"]:
if resource["type"] == "bigquery_dataset":
resource["dataset_id"] = dataset_path.name

yaml.dump(dataset_config, dataset_path / "dataset.yaml")

pipeline_config = yaml.load(pipeline_path / "pipeline.yaml")
for resource in pipeline_config["resources"]:
if resource["type"] == "bigquery_table":
resource["dataset_id"] = dataset_path.name

yaml.dump(pipeline_config, pipeline_path / "pipeline.yaml")
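Note that the helper passes pathlib.Path objects straight to yaml.load and yaml.dump, which matches ruamel.yaml's API (PyYAML would need an open file object). A self-contained sketch of that round-trip, assuming the test module's loader is a ruamel.yaml.YAML instance (an assumption — the import sits outside this diff):

    import pathlib
    import ruamel.yaml

    yaml = ruamel.yaml.YAML(typ="safe")  # assumed to mirror the tests' loader
    path = pathlib.Path("dataset.yaml")

    config = yaml.load(path)  # ruamel accepts Path objects as load sources...
    config["dataset"]["name"] = "my_dataset"
    yaml.dump(config, path)  # ...and as dump targets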


def test_tf_templates_exist():
for _, filepath in generate_terraform.TEMPLATE_PATHS.items():
assert filepath.exists()
@@ -125,8 +155,7 @@ def test_main_generates_tf_files(
tf_state_bucket,
tf_state_prefix,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -180,8 +209,7 @@ def test_main_without_tf_remote_state_generates_tf_files_except_backend_tf(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -233,9 +261,9 @@ def test_main_with_multiple_pipelines(
):
assert pipeline_path.name != pipeline_path_2.name

-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path_2 / "pipeline.yaml")
+shutil.copyfile(FILE_PATHS["dataset"], dataset_path / "dataset.yaml")
+shutil.copyfile(FILE_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+shutil.copyfile(FILE_PATHS["pipeline"], pipeline_path_2 / "pipeline.yaml")

generate_terraform.main(
dataset_path.name,
@@ -281,6 +309,82 @@ def test_main_with_multiple_pipelines(
).exists()


def test_main_with_multiple_bq_dataset_ids(
dataset_path,
pipeline_path,
project_id,
bucket_name_prefix,
region,
impersonating_acct,
env,
):
set_dataset_ids_in_config_files(dataset_path, pipeline_path)

# First, declare an additional custom BQ dataset in dataset.yaml
another_dataset_id = "another_dataset"
assert another_dataset_id != dataset_path.name

dataset_config = yaml.load(dataset_path / "dataset.yaml")
dataset_config["resources"].append(
{"type": "bigquery_dataset", "dataset_id": another_dataset_id}
)
yaml.dump(dataset_config, dataset_path / "dataset.yaml")

# Then, add a BQ table under the additional BQ dataset
pipeline_config = yaml.load(pipeline_path / "pipeline.yaml")
pipeline_config["resources"].append(
{
"type": "bigquery_table",
"table_id": "another_table",
"dataset_id": another_dataset_id,
}
)
yaml.dump(pipeline_config, pipeline_path / "pipeline.yaml")

generate_terraform.main(
dataset_path.name,
project_id,
bucket_name_prefix,
region,
impersonating_acct,
env,
None,
None,
)

for path_prefix in (
ENV_DATASETS_PATH / dataset_path.name / "_terraform",
generate_terraform.DATASETS_PATH / dataset_path.name / "_terraform",
):
assert (path_prefix / f"{dataset_path.name}_dataset.tf").exists()
assert (path_prefix / f"{pipeline_path.name}_pipeline.tf").exists()

# Match the "google_bigquery_dataset" properties, i.e. any lines between the
# curly braces, in the *_dataset.tf file
regexp = r"\"google_bigquery_dataset\" \"" + r"[A-Za-z0-9_]+" + r"\" \{(.*?)\}"
bq_dataset_tf_string = re.compile(regexp, flags=re.MULTILINE | re.DOTALL)

for path_prefix in (
ENV_DATASETS_PATH / dataset_path.name / "_terraform",
generate_terraform.DATASETS_PATH / dataset_path.name / "_terraform",
):
matches = bq_dataset_tf_string.findall(
(path_prefix / f"{dataset_path.name}_dataset.tf").read_text()
)

dataset_ids = set()
for match in matches:
result = re.search(r"dataset_id\s+\=\s+\"([A-Za-z0-9_]+)\"", match)
assert result.group(1)
dataset_ids.add(result.group(1))

# Assert that the dataset_ids are unique
assert len(dataset_ids) == len(matches)

assert another_dataset_id in dataset_ids
assert dataset_path.name in dataset_ids
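The matching strategy here — capture each google_bigquery_dataset block body, then pull dataset_id out of each body — can be exercised standalone. A compressed version against a made-up .tf string:

    import re

    tf_text = '''
    resource "google_bigquery_dataset" "my_dataset" {
      dataset_id = "my_dataset"
    }
    resource "google_bigquery_dataset" "another_dataset" {
      dataset_id = "another_dataset"
    }
    '''
    # Non-greedy capture of everything between a dataset block's braces.
    block = re.compile(
        r"\"google_bigquery_dataset\" \"[A-Za-z0-9_]+\" \{(.*?)\}",
        flags=re.MULTILINE | re.DOTALL,
    )
    ids = {
        re.search(r"dataset_id\s+=\s+\"([A-Za-z0-9_]+)\"", body).group(1)
        for body in block.findall(tf_text)
    }
    assert ids == {"my_dataset", "another_dataset"}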


def test_dataset_without_any_pipelines(
dataset_path,
project_id,
@@ -291,7 +395,7 @@ def test_dataset_without_any_pipelines(
tf_state_bucket,
tf_state_prefix,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
+shutil.copyfile(FILE_PATHS["dataset"], dataset_path / "dataset.yaml")

generate_terraform.main(
dataset_path.name,
@@ -367,8 +471,7 @@ def test_generated_tf_files_contain_license_headers(
tf_state_bucket,
tf_state_prefix,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -416,8 +519,7 @@ def test_dataset_tf_file_contains_description_when_specified(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -463,8 +565,8 @@ def test_bq_dataset_can_have_a_description_with_newlines_and_quotes(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+shutil.copyfile(FILE_PATHS["dataset"], dataset_path / "dataset.yaml")
+shutil.copyfile(FILE_PATHS["pipeline"], pipeline_path / "pipeline.yaml")

config = yaml.load(open(dataset_path / "dataset.yaml"))

@@ -501,8 +603,7 @@ def test_dataset_tf_has_no_bq_dataset_description_when_unspecified(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

config = yaml.load(open(dataset_path / "dataset.yaml"))

@@ -551,8 +652,7 @@ def test_pipeline_tf_contains_optional_properties_when_specified(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -577,7 +677,13 @@ def test_pipeline_tf_contains_optional_properties_when_specified(

# Match the "google_bigquery_table" properties, i.e. any lines between the
# curly braces, in the *_pipeline.tf file
-regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)^\}"
+regexp = (
+    r"\"google_bigquery_table\" \""
+    + bq_table["dataset_id"]
+    + "_"
+    + bq_table["table_id"]
+    + r"\" \{(.*?)^\}"
+)
bq_table_tf_string = re.compile(regexp, flags=re.MULTILINE | re.DOTALL)

for path_prefix in (
@@ -604,15 +710,12 @@ def test_pipeline_tf_has_no_optional_properties_when_unspecified(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

config = yaml.load(open(pipeline_path / "pipeline.yaml"))

# Get the first bigquery_table resource and delete its optional fields
-bq_table = next(
-    (r for r in config["resources"] if r["type"] == "bigquery_table"), None
-)
+bq_table = next((r for r in config["resources"] if r["type"] == "bigquery_table"))
del bq_table["description"]
del bq_table["time_partitioning"]
del bq_table["clustering"]
@@ -633,7 +736,13 @@ def test_pipeline_tf_has_no_optional_properties_when_unspecified(

# Match the "google_bigquery_table" properties, i.e. any lines between the
# curly braces, in the *_pipeline.tf file
-regexp = r"\"google_bigquery_table\" \"" + bq_table["table_id"] + r"\" \{(.*?)^\}"
+regexp = (
+    r"\"google_bigquery_table\" \""
+    + bq_table["dataset_id"]
+    + "_"
+    + bq_table["table_id"]
+    + r"\" \{(.*?)^\}"
+)
bq_table_tf_string = re.compile(regexp, flags=re.MULTILINE | re.DOTALL)

for path_prefix in (
@@ -660,8 +769,7 @@ def test_bq_table_can_have_a_description_with_newlines_and_quotes(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

config = yaml.load(open(pipeline_path / "pipeline.yaml"))

@@ -697,8 +805,7 @@ def test_bq_table_name_starts_with_digits_but_tf_resource_name_does_not(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

config = yaml.load(open(pipeline_path / "pipeline.yaml"))
table_name_starting_with_digit = f"{str(random.randint(0, 9))}_table"
@@ -810,8 +917,7 @@ def test_validation_on_generated_tf_files_in_dot_env_dir(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
@@ -840,8 +946,7 @@ def test_validation_on_generated_tf_files_in_project_dir(
impersonating_acct,
env,
):
-shutil.copyfile(SAMPLE_YAML_PATHS["dataset"], dataset_path / "dataset.yaml")
-shutil.copyfile(SAMPLE_YAML_PATHS["pipeline"], pipeline_path / "pipeline.yaml")
+set_dataset_ids_in_config_files(dataset_path, pipeline_path)

generate_terraform.main(
dataset_path.name,
