Skip to content

Commit

Permalink
Add downstream referencing views to raw data gitbook documentation (R…
Browse files Browse the repository at this point in the history
…ecidiviz/recidiviz-data#29334)

## Description of the change

- Update the gitbook state raw data page to rename the “Referencing
Views” column to “Referencing Ingest Views” and add a second column,
“Referencing Downstream Views” which is hydrated with the contents of
raw_data_reference_reasons.yaml to list any downstream views that
reference each raw data table.

- Refactor raw_data_reference_reasons.yaml parsing to a separate class

## Type of change

> All pull requests must have at least one of the following labels
applied (otherwise the PR will fail):

| Label | Description |
|-------|-------------|
| Type: Bug | non-breaking change that fixes an issue |
| Type: Feature | non-breaking change that adds functionality |
| Type: Breaking Change | fix or feature that would cause existing functionality to not work as expected |
| Type: Non-breaking refactor | change addresses some tech debt item or prepares for a later change, but does not change functionality |
| Type: Configuration Change | adjusts configuration to achieve some end related to functionality, development, performance, or security |
| Type: Dependency Upgrade | upgrades a project dependency - these changes are not included in release notes |

## Related issues

Closes Recidiviz/recidiviz-data#29124

## Checklists

### Development

**This box MUST be checked by the submitter prior to merging**:
- [x] **Double- and triple-checked that there is no Personally
Identifiable Information (PII) being mistakenly added in this pull
request**

These boxes should be checked by the submitter prior to merging:
- [x] Tests have been written to cover the code changed/added as part of
this pull request

### Code review

These boxes should be checked by reviewers prior to merging:

- [x] This pull request has a descriptive title and information useful
to a reviewer
- [x] Potential security implications or infrastructural changes have
been considered, if relevant

GitOrigin-RevId: 3377905846046554656398d68823f23135dcc2fe
  • Loading branch information
emilyemilyemilyemilyemilyemily authored and Helper Bot committed May 10, 2024
1 parent ca05752 commit b838304
Show file tree
Hide file tree
Showing 5 changed files with 276 additions and 36 deletions.
33 changes: 32 additions & 1 deletion recidiviz/ingest/direct/direct_ingest_documentation_generator.py
Expand Up @@ -36,6 +36,9 @@
from recidiviz.ingest.direct.views.direct_ingest_view_query_builder_collector import (
DirectIngestViewQueryBuilderCollector,
)
from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
RawDataReferenceReasonsYamlLoader,
)
from recidiviz.utils.string import StrictStringFormatter

STATE_RAW_DATA_FILE_HEADER_TEMPLATE = """# {state_name} Raw Data Description
Expand Down Expand Up @@ -70,6 +73,10 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
state_code = StateCode(region_code.upper())
state_name = state_code.get_state().name

downstream_views_by_raw_file = self.get_downstream_referencing_views(
state_code
)

file_header = StrictStringFormatter().format(
STATE_RAW_DATA_FILE_HEADER_TEMPLATE,
state_name=state_name,
Expand All @@ -85,6 +92,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
),
)
else:
downstream_views_by_raw_file = defaultdict(list)
file_header = ""

raw_file_configs = [
Expand All @@ -109,6 +117,7 @@ def generate_raw_file_docs_for_region(self, region_code: str) -> Dict[str, str]:
config_paths_by_file_tag,
file_tags_with_raw_file_configs,
views_by_raw_file,
downstream_views_by_raw_file,
)

docs_per_file: Dict[str, str] = {
Expand Down Expand Up @@ -205,6 +214,7 @@ def _generate_raw_file_table(
config_paths_by_file_tag: Dict[str, str],
file_tags_with_raw_file_configs: List[str],
views_by_raw_file: Dict[str, List[str]],
downstream_views_by_raw_file: Dict[str, List[str]],
) -> str:
"""Generates a Markdown-formatted table of contents to be included in a raw file specification."""
table_matrix = [
Expand All @@ -215,11 +225,16 @@ def _generate_raw_file_table(
else f"{file_tag}"
),
",<br />".join(sorted(views_by_raw_file[file_tag])),
",<br />".join(sorted(downstream_views_by_raw_file[file_tag])),
]
for file_tag in sorted(config_paths_by_file_tag)
]
writer = MarkdownTableWriter(
headers=["**Table**", "**Referencing Views**"],
headers=[
"**Table**",
"**Referencing Ingest Views**",
"**Referencing Downstream Views**",
],
value_matrix=table_matrix,
# Margin values other than 0 have nondeterministic spacing. Do not change.
margin=0,
Expand All @@ -240,3 +255,19 @@ def get_referencing_views(
views_by_raw_file[config.file_tag].append(ingest_view.ingest_view_name)

return views_by_raw_file

@staticmethod
def get_downstream_referencing_views(
    state_code: StateCode,
) -> Dict[str, List[str]]:
    """Maps each raw file tag to the downstream views that reference it.

    Looks up the references recorded for |state_code| via
    RawDataReferenceReasonsYamlLoader and converts every referencing view
    address to its string form.

    Returns a defaultdict(list), so looking up a file tag that has no recorded
    downstream references yields an empty list rather than raising KeyError.
    The order of the views within each list follows the iteration order of the
    loader's collection for that file tag; callers that need a deterministic
    order should sort.
    """
    references_by_file_tag = (
        RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
            state_code
        )
    )
    return defaultdict(
        list,
        {
            file_tag: [address.to_str() for address in addresses]
            for file_tag, addresses in references_by_file_tag.items()
        },
    )
Expand Up @@ -16,10 +16,12 @@
# =============================================================================
"""Tests for DirectIngestDocumentationGenerator."""
import unittest
from collections import defaultdict
from typing import List

from mock import MagicMock, patch

from recidiviz.big_query.big_query_address import BigQueryAddress
from recidiviz.common.constants import states
from recidiviz.common.constants.states import TEST_STATE_CODE_DOCS
from recidiviz.ingest.direct.direct_ingest_documentation_generator import (
Expand Down Expand Up @@ -94,8 +96,13 @@ def tearDown(self) -> None:
"recidiviz.ingest.direct.direct_ingest_documentation_generator.DirectIngestDocumentationGenerator"
".get_referencing_views"
)
@patch(
"recidiviz.ingest.direct.direct_ingest_documentation_generator.RawDataReferenceReasonsYamlLoader"
".get_downstream_referencing_views"
)
def test_generate_raw_file_docs_for_region(
self,
mock_downstream_referencing_views: MagicMock,
mock_referencing_views: MagicMock,
_mock_region: MagicMock,
mock_raw_config: MagicMock,
Expand All @@ -112,6 +119,16 @@ def test_generate_raw_file_docs_for_region(
"tagNotHistorical": [],
"tagPrimaryKeyColsMissing": [],
}
mock_downstream_referencing_views.return_value = defaultdict(
set,
{
"multiLineDescription": {
BigQueryAddress.from_str("dataset.view_three"),
BigQueryAddress.from_str("dataset.view_four"),
},
"tagColumnsMissing": {BigQueryAddress.from_str("dataset.view_four")},
},
)

documentation_generator = DirectIngestDocumentationGenerator()
documentation = documentation_generator.generate_raw_file_docs_for_region(
Expand All @@ -126,12 +143,12 @@ def test_generate_raw_file_docs_for_region(
## Table of Contents
| **Table** | **Referencing Views** |
|----------------------------------------------------------------|-----------------------|
|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,<br />view_two|
|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one |
|[tagNotHistorical](raw_data/tagNotHistorical.md) | |
|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| |
| **Table** |**Referencing Ingest Views**| **Referencing Downstream Views** |
|----------------------------------------------------------------|----------------------------|------------------------------------------|
|[multiLineDescription](raw_data/multiLineDescription.md) |view_one,<br />view_two |dataset.view_four,<br />dataset.view_three|
|[tagColumnsMissing](raw_data/tagColumnsMissing.md) |view_one |dataset.view_four |
|[tagNotHistorical](raw_data/tagNotHistorical.md) | | |
|[tagPrimaryKeyColsMissing](raw_data/tagPrimaryKeyColsMissing.md)| | |
"""

expected_multi_line = """## multiLineDescription
Expand Down
Expand Up @@ -15,26 +15,20 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Tests for enforcing documentation of views that reference raw data."""
import os
import unittest
from typing import Any, Dict, List, Set, Tuple
from unittest.mock import patch

import yaml

import recidiviz
from recidiviz.big_query.big_query_address import BigQueryAddress
from recidiviz.common.constants.states import StateCode
from recidiviz.tools.find_direct_raw_data_references import (
find_direct_raw_data_references,
)
from recidiviz.view_registry.deployed_views import all_deployed_view_builders

RAW_DATA_REFERENCES_YAML = "view_registry/raw_data_reference_reasons.yaml"
RAW_DATA_REFERENCES_YAML_PATH = os.path.join(
os.path.dirname(recidiviz.__file__),
from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
RAW_DATA_REFERENCES_YAML,
RawDataReferenceReasonsYamlLoader,
)
from recidiviz.view_registry.deployed_views import all_deployed_view_builders


class TestEnforceRawDataReferenceDocumentation(unittest.TestCase):
Expand All @@ -50,13 +44,7 @@ class TestEnforceRawDataReferenceDocumentation(unittest.TestCase):
def setUpClass(cls) -> None:
cls.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
cls.project_id_patcher.start().return_value = "recidiviz-testing"
with open(RAW_DATA_REFERENCES_YAML_PATH, "r", encoding="utf-8") as yaml_file:
cls.yaml_raw_data = yaml.safe_load(yaml_file)
cls.yaml_data = (
TestEnforceRawDataReferenceDocumentation._convert_raw_yaml_data_to_objs(
cls.yaml_raw_data
)
)
cls.yaml_data = RawDataReferenceReasonsYamlLoader.get_yaml_data()
cls.deployed_views_references = find_direct_raw_data_references(
all_deployed_view_builders()
)
Expand All @@ -67,7 +55,9 @@ def tearDownClass(cls) -> None:

def test_verify_yaml_entries_in_alphabetical_order(self) -> None:
self.assertTrue(
TestEnforceRawDataReferenceDocumentation._is_sorted(self.yaml_raw_data),
TestEnforceRawDataReferenceDocumentation._is_sorted(
RawDataReferenceReasonsYamlLoader.get_raw_yaml_data()
),
f"Entries in {RAW_DATA_REFERENCES_YAML} must be in alphabetical order.",
)

Expand Down Expand Up @@ -130,15 +120,3 @@ def _find_missing_references(
for view in views
if view not in actual.get(state, {}).get(file_tag, set())
]

@staticmethod
def _convert_raw_yaml_data_to_objs(
    references: Dict[str, Dict[str, Set[str]]]
) -> Dict[StateCode, Dict[str, Set[BigQueryAddress]]]:
    """Converts string-keyed reference data into typed objects.

    Each top-level key is turned into a StateCode and each view string under a
    file tag is parsed with BigQueryAddress.from_str. File tags are left as
    plain strings.
    """
    converted: Dict[StateCode, Dict[str, Set[BigQueryAddress]]] = {}
    for state_code_str, file_tags in references.items():
        # Build the key first so an invalid state code fails before any view
        # addresses for that state are parsed.
        state = StateCode(state_code_str)
        addresses_by_file_tag: Dict[str, Set[BigQueryAddress]] = {}
        for file_tag, views in file_tags.items():
            addresses_by_file_tag[file_tag] = {
                BigQueryAddress.from_str(view) for view in views
            }
        converted[state] = addresses_by_file_tag
    return converted
121 changes: 121 additions & 0 deletions recidiviz/tests/tools/raw_data_reference_reasons_yaml_loader_test.py
@@ -0,0 +1,121 @@
# Recidiviz - a data platform for criminal justice reform
# Copyright (C) 2023 Recidiviz, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# =============================================================================
"""Tests for raw_data_reference_reasons_yaml_loader.py."""
import unittest
from collections import defaultdict
from unittest.mock import mock_open, patch

import yaml
from mock import MagicMock

from recidiviz.big_query.big_query_address import BigQueryAddress
from recidiviz.common.constants.states import StateCode
from recidiviz.tools.raw_data_reference_reasons_yaml_loader import (
RawDataReferenceReasonsYamlLoader,
)

# YAML fixture fed to the loader through a patched `open`. Nesting is
# state code -> raw file tag -> referencing view address -> usage reason,
# which is the structure spelled out in mock_raw_data. The indentation is
# significant: without it the keys collapse to one level and the `|-` block
# scalars have no indented content.
mock_yaml_content = """
US_XX:
  table1:
    dataset1.table1: |-
      Usage reason unknown.
    dataset2.table2: |-
      Usage reason unknown.
US_YY:
  table2:
    dataset3.table3: |-
      Usage reason unknown.
"""
# Same layout as above, but keyed by a state code that the tests expect the
# loader to reject when converting to typed objects.
mock_yaml_invalid_content = """
US_NOT_REAL:
  table1:
    dataset1.table1: |-
      Usage reason unknown.
"""
# Plain-Python form of mock_yaml_content: what the tests expect the loader's
# raw (unconverted) accessor to return for that fixture.
mock_raw_data = {
    "US_XX": {
        "table1": {
            "dataset1.table1": "Usage reason unknown.",
            "dataset2.table2": "Usage reason unknown.",
        },
    },
    "US_YY": {
        "table2": {
            "dataset3.table3": "Usage reason unknown.",
        },
    },
}
# Typed form of mock_raw_data: state codes become StateCode members and each
# file tag maps to the set of referencing view addresses (usage reasons are
# not part of this structure).
mock_converted_data = {
    StateCode.US_XX: {
        "table1": {
            BigQueryAddress.from_str("dataset1.table1"),
            BigQueryAddress.from_str("dataset2.table2"),
        },
    },
    StateCode.US_YY: {
        "table2": {
            BigQueryAddress.from_str("dataset3.table3"),
        },
    },
}


class TestRawDataReferenceReasonsYamlLoader(unittest.TestCase):
    """Tests for RawDataReferenceReasonsYamlLoader using patched file contents."""

    def setUp(self) -> None:
        # Reset the loader before every test; presumably this clears any data
        # held from a previous load so each test sees its own patched file.
        RawDataReferenceReasonsYamlLoader.reset_data()

    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
    @patch("yaml.safe_load", side_effect=yaml.YAMLError("error parsing YAML"))
    def test_load_yaml_failure(self, _1: MagicMock, _2: MagicMock) -> None:
        """A YAML parse error surfaces as RuntimeError from both accessors."""
        loader = RawDataReferenceReasonsYamlLoader
        for accessor in (loader.get_yaml_data, loader.get_raw_yaml_data):
            with self.assertRaises(RuntimeError):
                accessor()

    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_invalid_content)
    def test_parse_yaml_failure(self, _: MagicMock) -> None:
        """Content keyed by US_NOT_REAL makes get_yaml_data raise RuntimeError."""
        with self.assertRaises(RuntimeError):
            RawDataReferenceReasonsYamlLoader.get_yaml_data()

    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
    def test_load_yaml(self, _: MagicMock) -> None:
        """Both the converted and the raw views of the fixture match expectations."""
        converted = RawDataReferenceReasonsYamlLoader.get_yaml_data()
        self.assertEqual(converted, mock_converted_data)

        raw = RawDataReferenceReasonsYamlLoader.get_raw_yaml_data()
        self.assertEqual(raw, mock_raw_data)

    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
    def test_get_downstream_referencing_views(self, _: MagicMock) -> None:
        """A state present in the fixture returns its file tag -> addresses map."""
        result = RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
            StateCode.US_XX
        )
        expected = {
            "table1": {
                BigQueryAddress.from_str("dataset1.table1"),
                BigQueryAddress.from_str("dataset2.table2"),
            }
        }
        self.assertEqual(result, expected)

    @patch("builtins.open", new_callable=mock_open, read_data=mock_yaml_content)
    def test_get_downstream_referencing_views_invalid_state(self, _: MagicMock) -> None:
        """A state absent from the fixture yields an empty defaultdict(set)."""
        result = RawDataReferenceReasonsYamlLoader.get_downstream_referencing_views(
            StateCode.US_WW
        )
        self.assertEqual(result, defaultdict(set))
        # Missing file tags must default to an empty set rather than raise.
        self.assertEqual(result["non_existent_file_tag"], set())

0 comments on commit b838304

Please sign in to comment.