Skip to content

Commit

Permalink
feat: column level badges cont. (#381)
Browse files Browse the repository at this point in the history
* made create_relation method for BadgeMetadata

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* small fix

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* started setting up badge to be entity

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* made standalone badge metadata file and changed table metadata to use this for column badges, can use with any other entity really

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* fixed an oopsie in column metadata

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* added tests for badge.py and fixed NoneType issue in column metadata

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* replaced with None cause badges are optional

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* fixed all unit tests

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* lint :/

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* fixed typing issues

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* bumped minor version on setup.py

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* removed leftover comments and prints

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* implemented check and pattern matching for start label and key on badge

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* fixed circular dep and regex issue

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>

* tests for exceptions

Signed-off-by: Allison Suarez Miranda <asuarezmiranda@lyft.com>
  • Loading branch information
allisonsuarez committed Oct 16, 2020
1 parent 20c2fd2 commit af4b512
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 69 deletions.
116 changes: 116 additions & 0 deletions databuilder/models/badge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Copyright Contributors to the Amundsen project.
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional
import re

from databuilder.models.neo4j_csv_serde import Neo4jCsvSerializable, NODE_KEY, \
NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE


class Badge:
    """
    Simple holder for a badge's name and the category it belongs to.
    """

    def __init__(self, name: str, category: str):
        self.category = category
        self.name = name

    def __repr__(self) -> str:
        # Same textual form as the constructor call: Badge(<name>, <category>)
        template = 'Badge({!r}, {!r})'
        return template.format(self.name, self.category)


class BadgeMetadata(Neo4jCsvSerializable):
    """
    Badge model.

    Holds the badges attached to one start entity (identified by
    ``start_label`` and ``start_key``) and serializes them as Neo4j CSV
    records: one ``Badge`` node per badge and one ``HAS_BADGE`` /
    ``BADGE_FOR`` relation between the start entity and each badge.
    """
    BADGE_NODE_LABEL = 'Badge'
    BADGE_KEY_FORMAT = '{badge}'
    BADGE_CATEGORY = 'category'

    # Relation between entity and badge
    BADGE_RELATION_TYPE = 'HAS_BADGE'
    INVERSE_BADGE_RELATION_TYPE = 'BADGE_FOR'

    # Expected key shape for every supported start label, compiled once for
    # the class instead of once per instance.
    # NOTE(review): the '.' between the cluster and schema parts is unescaped
    # and the patterns are applied with match(), which only anchors at the
    # start of the key, so validation is looser than the key formats suggest.
    # Left as-is to avoid rejecting keys that are accepted today.
    _START_KEY_PATTERNS = {
        'Table': re.compile(
            r'[a-z]+://[a-zA-Z0-9_.-]+.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
        'Dashboard': re.compile(
            r'[a-z]+_dashboard://[a-zA-Z0-9_.-]+.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
        'Column': re.compile(
            r'[a-z]+://[a-zA-Z0-9_.-]+.[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+'),
    }

    def __init__(self,
                 db_name: str,
                 schema: str,
                 start_label: str,  # Table, Dashboard, Column
                 start_key: str,
                 badges: List[Badge],
                 cluster: str = 'gold',  # is this what we want as default for badges..?
                 ) -> None:
        """
        :param db_name: database name, stored lower-cased
        :param schema: schema name, stored lower-cased
        :param start_label: label of the entity the badges are attached to;
            must be one of 'Table', 'Dashboard' or 'Column'
        :param start_key: key of that entity; must match the key pattern
            registered for ``start_label``
        :param badges: badges to attach; ``None`` is treated as no badges
        :param cluster: cluster name, stored lower-cased
        :raises ValueError: if ``start_label`` is not supported or
            ``start_key`` does not match the pattern for that label
        """
        self.badges = badges if badges is not None else []

        self.db = db_name.lower()
        self.schema = schema.lower()
        self.cluster = cluster.lower()

        key_pattern = self._START_KEY_PATTERNS.get(start_label)
        if key_pattern is None:
            # str.format keeps the message intact even for non-string labels
            raise ValueError(
                '{} is not a valid start_label for a Badge relation'.format(start_label))
        self.start_label = start_label

        if not key_pattern.match(start_key):
            raise ValueError(
                '{} does not match the key pattern for a {}'.format(start_key, start_label))
        self.start_key = start_key

        self._node_iter = iter(self.create_nodes())
        self._relation_iter = iter(self.create_relation())

    def create_next_node(self) -> Optional[Dict[str, Any]]:
        """
        Return the next badge node record, or None once all were returned.
        """
        return next(self._node_iter, None)

    def create_next_relation(self) -> Optional[Dict[str, Any]]:
        """
        Return the next badge relation record, or None once all were returned.
        """
        return next(self._relation_iter, None)

    @staticmethod
    def get_badge_key(name: str) -> str:
        """
        Build the node key for a badge name; an empty name yields ''.
        """
        if not name:
            return ''
        return BadgeMetadata.BADGE_KEY_FORMAT.format(badge=name)

    def get_metadata_model_key(self) -> str:
        """
        Return the key of the entity the badges are attached to.
        """
        return self.start_key

    def create_nodes(self) -> List[Dict[str, Any]]:
        """
        Create a list of Neo4j node records, one per non-empty badge
        :return:
        """
        return [
            {
                NODE_KEY: self.get_badge_key(badge.name),
                NODE_LABEL: self.BADGE_NODE_LABEL,
                self.BADGE_CATEGORY: badge.category,
            }
            for badge in self.badges
            if badge
        ]

    def create_relation(self) -> List[Dict[str, Any]]:
        """
        Create a list of Neo4j relation records linking the start entity to
        each non-empty badge
        :return:
        """
        # Falsy entries are skipped here exactly as in create_nodes, so a
        # relation is never emitted towards a badge that has no node.
        return [
            {
                RELATION_START_LABEL: self.start_label,
                RELATION_END_LABEL: self.BADGE_NODE_LABEL,
                RELATION_START_KEY: self.start_key,
                RELATION_END_KEY: self.get_badge_key(badge.name),
                RELATION_TYPE: self.BADGE_RELATION_TYPE,
                RELATION_REVERSE_TYPE: self.INVERSE_BADGE_RELATION_TYPE,
            }
            for badge in self.badges
            if badge
        ]
91 changes: 24 additions & 67 deletions databuilder/models/table_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,60 +12,12 @@
RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE)
from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX
from databuilder.models.schema import schema_constant
from databuilder.models.badge import BadgeMetadata, Badge

DESCRIPTION_NODE_LABEL_VAL = 'Description'
DESCRIPTION_NODE_LABEL = DESCRIPTION_NODE_LABEL_VAL


class BadgeMetadata(Neo4jCsvSerializable):
    """
    Serializable model for a single badge: emits one Badge node and no
    relations.
    """
    BADGE_NODE_LABEL = 'Badge'
    BADGE_KEY_FORMAT = '{badge}'
    BADGE_CATEGORY = 'category'
    DASHBOARD_TYPE = 'dashboard'
    METRIC_TYPE = 'metric'

    def __init__(self,
                 name: str,
                 category: str,
                 ):
        """
        :param name: badge name, used to build the node key
        :param category: badge category written on the node
        """
        self._name = name
        self._category = category
        # Forward the category explicitly; without it create_badge_node falls
        # back to its 'column' default and the constructor argument is lost.
        self._nodes = iter([self.create_badge_node(self._name, self._category)])
        self._relations: Iterator[Dict[str, Any]] = iter([])

    def __repr__(self) -> str:
        return 'BadgeMetadata({!r}, {!r})'.format(self._name,
                                                  self._category)

    @staticmethod
    def get_badge_key(name: str) -> str:
        """
        Build the node key for a badge name; an empty name yields ''.
        """
        if not name:
            return ''
        return BadgeMetadata.BADGE_KEY_FORMAT.format(badge=name)

    @staticmethod
    def create_badge_node(name: str,
                          category: str = 'column',
                          ) -> Dict[str, str]:
        """
        Build the node record (label, key, category) for one badge.
        """
        return {NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
                NODE_KEY: BadgeMetadata.get_badge_key(name),
                BadgeMetadata.BADGE_CATEGORY: category}

    def create_next_node(self) -> Optional[Dict[str, Any]]:
        # return the string representation of the data
        try:
            return next(self._nodes)
        except StopIteration:
            return None

    def create_next_relation(self) -> Optional[Dict[str, Any]]:
        # We don't emit any relations for Badge ingestion
        try:
            return next(self._relations)
        except StopIteration:
            return None


class TagMetadata(Neo4jCsvSerializable):
TAG_NODE_LABEL = 'Tag'
TAG_KEY_FORMAT = '{tag}'
Expand All @@ -92,7 +44,7 @@ def get_tag_key(name: str) -> str:

@staticmethod
def create_tag_node(name: str,
tag_type: str =DEFAULT_TYPE
tag_type: str = DEFAULT_TYPE
) -> Dict[str, str]:
return {NODE_LABEL: TagMetadata.TAG_NODE_LABEL,
NODE_KEY: TagMetadata.get_tag_key(name),
Expand Down Expand Up @@ -199,10 +151,6 @@ class ColumnMetadata:
COLUMN_DESCRIPTION = 'description'
COLUMN_DESCRIPTION_FORMAT = '{db}://{cluster}.{schema}/{tbl}/{col}/{description_id}'

# Relation between column and badge
COL_BADGE_RELATION_TYPE = 'HAS_BADGE'
BADGE_COL_RELATION_TYPE = 'BADGE_FOR'

def __init__(self,
name: str,
description: Union[str, None],
Expand All @@ -222,7 +170,10 @@ def __init__(self,
text=description)
self.type = col_type
self.sort_order = sort_order
self.badges = badges
if badges:
self.badges = [Badge(badge, 'column') for badge in badges]
else:
self.badges = []

def __repr__(self) -> str:
return 'ColumnMetadata({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.name,
Expand Down Expand Up @@ -427,8 +378,15 @@ def _create_next_node(self) -> Iterator[Any]: # noqa: C901
yield col.description.get_node_dict(node_key)

if col.badges:
for badge in col.badges:
yield BadgeMetadata.create_badge_node(badge)
badge_metadata = BadgeMetadata(db_name=self._get_database_key(),
schema=self._get_schema_key(),
start_label=ColumnMetadata.COLUMN_NODE_LABEL,
start_key=self._get_col_key(col),
badges=col.badges,
cluster=self._get_cluster_key())
badge_nodes = badge_metadata.create_nodes()
for node in badge_nodes:
yield node

# Database, cluster, schema
others = [NodeTuple(key=self._get_database_key(),
Expand Down Expand Up @@ -498,17 +456,16 @@ def _create_next_relation(self) -> Iterator[Any]:
yield col.description.get_relation(ColumnMetadata.COLUMN_NODE_LABEL,
self._get_col_key(col),
self._get_col_description_key(col, col.description))

if col.badges:
for badge in col.badges:
yield {
RELATION_START_LABEL: ColumnMetadata.COLUMN_NODE_LABEL,
RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
RELATION_START_KEY: self._get_col_key(col),
RELATION_END_KEY: BadgeMetadata.get_badge_key(badge),
RELATION_TYPE: ColumnMetadata.COL_BADGE_RELATION_TYPE,
RELATION_REVERSE_TYPE: ColumnMetadata.BADGE_COL_RELATION_TYPE,
}
badge_metadata = BadgeMetadata(db_name=self._get_database_key(),
schema=self._get_schema_key(),
start_label=ColumnMetadata.COLUMN_NODE_LABEL,
start_key=self._get_col_key(col),
badges=col.badges,
cluster=self._get_cluster_key())
badge_relations = badge_metadata.create_relation()
for relation in badge_relations:
yield relation

others = [
RelTuple(start_label=TableMetadata.DATABASE_NODE_LABEL,
Expand Down
2 changes: 1 addition & 1 deletion databuilder/models/table_owner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self,
schema: str,
table_name: str,
owners: Union[List, str],
cluster: str ='gold',
cluster: str = 'gold',
) -> None:
self.db = db_name.lower()
self.schema = schema.lower()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from setuptools import setup, find_packages


__version__ = '3.3.2'
__version__ = '3.4.0'


requirements = [
Expand Down
1 change: 1 addition & 0 deletions tests/unit/extractor/test_hive_table_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def test_extraction_with_single_result(self) -> None:
4),
ColumnMetadata('ds', None, 'varchar', 5)],
is_view=False)

self.assertEqual(expected.__repr__(), actual.__repr__())
self.assertIsNone(extractor.extract())

Expand Down
98 changes: 98 additions & 0 deletions tests/unit/models/test_badge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright Contributors to the Amundsen project.
# SPDX-License-Identifier: Apache-2.0

import unittest
from databuilder.models.badge import Badge, BadgeMetadata

from databuilder.models.neo4j_csv_serde import NODE_KEY, NODE_LABEL, \
RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE

db = 'hive'
SCHEMA = 'BASE'
TABLE = 'TEST'
CLUSTER = 'DEFAULT'
badge1 = Badge('badge1', 'column')
badge2 = Badge('badge2', 'column')


class TestBadge(unittest.TestCase):
    """
    Unit tests for BadgeMetadata: key building, node/relation records and
    validation of the start label and start key.
    """

    def setUp(self) -> None:
        super(TestBadge, self).setUp()
        self.badge_metadata = BadgeMetadata(db_name='hive',
                                            schema=SCHEMA,
                                            start_label='Column',
                                            start_key='hive://default.base/test/ds',
                                            cluster=CLUSTER,
                                            badges=[badge1, badge2])

    def test_get_badge_key(self) -> None:
        badge_key = self.badge_metadata.get_badge_key(badge1.name)
        self.assertEqual(badge_key, badge1.name)

    def test_create_nodes(self) -> None:
        nodes = self.badge_metadata.create_nodes()
        self.assertEqual(len(nodes), 2)

        node1 = {
            NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge1.name),
            NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            BadgeMetadata.BADGE_CATEGORY: badge1.category
        }
        node2 = {
            NODE_KEY: BadgeMetadata.BADGE_KEY_FORMAT.format(badge=badge2.name),
            NODE_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            BadgeMetadata.BADGE_CATEGORY: badge2.category
        }

        self.assertIn(node1, nodes)
        self.assertIn(node2, nodes)

    def test_bad_key_entity_match(self) -> None:
        # A table-shaped key must be rejected for a Column start label.
        column_label = 'Column'
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(db_name='hive',
                          schema=SCHEMA,
                          start_label=column_label,
                          start_key=table_key,
                          cluster=CLUSTER,
                          badges=[badge1, badge2])

    def test_bad_entity_label(self) -> None:
        # 'User' is not one of the supported start labels.
        user_label = 'User'
        table_key = 'hive://default.base/test'

        with self.assertRaises(Exception):
            BadgeMetadata(db_name='hive',
                          schema=SCHEMA,
                          start_label=user_label,
                          start_key=table_key,
                          cluster=CLUSTER,
                          badges=[badge1, badge2])

    def test_create_relation(self) -> None:
        relations = self.badge_metadata.create_relation()
        self.assertEqual(len(relations), 2)

        relation1 = {
            RELATION_START_LABEL: self.badge_metadata.start_label,
            RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            RELATION_START_KEY: self.badge_metadata.start_key,
            RELATION_END_KEY: BadgeMetadata.get_badge_key(badge1.name),
            RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
        }
        relation2 = {
            RELATION_START_LABEL: self.badge_metadata.start_label,
            RELATION_END_LABEL: BadgeMetadata.BADGE_NODE_LABEL,
            RELATION_START_KEY: self.badge_metadata.start_key,
            RELATION_END_KEY: BadgeMetadata.get_badge_key(badge2.name),
            RELATION_TYPE: BadgeMetadata.BADGE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: BadgeMetadata.INVERSE_BADGE_RELATION_TYPE,
        }

        self.assertIn(relation1, relations)
        self.assertIn(relation2, relations)

0 comments on commit af4b512

Please sign in to comment.