Skip to content

Commit

Permalink
Support indexing Hive views (#244)
Browse files (browse the repository at this point in the history)
* Support indexing Hive views

* Clean up code
  • Loading branch information
feng-tao committed Apr 23, 2020
1 parent 125e2ca commit 5f7224a
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 13 deletions.
11 changes: 7 additions & 4 deletions databuilder/extractor/hive_table_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ class HiveTableMetadataExtractor(Extractor):
SELECT source.* FROM
(SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
p.PKEY_TYPE as col_type, p.PKEY_COMMENT as col_description, 1 as "is_partition_col"
p.PKEY_TYPE as col_type, p.PKEY_COMMENT as col_description, 1 as "is_partition_col",
IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
Expand All @@ -38,7 +39,8 @@ class HiveTableMetadataExtractor(Extractor):
UNION
SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
c.TYPE_NAME as col_type, c.COMMENT as col_description, 0 as "is_partition_col"
c.TYPE_NAME as col_type, c.COMMENT as col_description, 0 as "is_partition_col",
IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
FROM TBLS t
JOIN DBS d ON t.DB_ID = d.DB_ID
JOIN SDS s ON t.SD_ID = s.SD_ID
Expand Down Expand Up @@ -99,12 +101,13 @@ def _get_extract_iter(self):
last_row = row
columns.append(ColumnMetadata(row['col_name'], row['col_description'],
row['col_type'], row['col_sort_order']))

is_view = last_row['is_view'] == 1
yield TableMetadata('hive', self._cluster,
last_row['schema'],
last_row['name'],
last_row['description'],
columns)
columns,
is_view=is_view)

def _get_raw_extract_iter(self):
# type: () -> Iterator[Dict[str, Any]]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from setuptools import setup, find_packages


__version__ = '2.5.4'
__version__ = '2.5.5'

requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
with open(requirements_path) as requirements_file:
Expand Down
24 changes: 16 additions & 8 deletions tests/unit/extractor/test_hive_table_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ def test_extraction_with_single_result(self):
connection.execute = sql_execute
table = {'schema': 'test_schema',
'name': 'test_table',
'description': 'a table for testing'}
'description': 'a table for testing',
'is_view': 0}

sql_execute.return_value = [
self._union(
Expand Down Expand Up @@ -86,7 +87,8 @@ def test_extraction_with_single_result(self):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5)],
is_view=False)
self.assertEqual(expected.__repr__(), actual.__repr__())
self.assertIsNone(extractor.extract())

Expand All @@ -99,15 +101,18 @@ def test_extraction_with_multiple_result(self):
connection.execute = sql_execute
table = {'schema': 'test_schema1',
'name': 'test_table1',
'description': 'test table 1'}
'description': 'test table 1',
'is_view': 0}

table1 = {'schema': 'test_schema1',
'name': 'test_table2',
'description': 'test table 2'}
'description': 'test table 2',
'is_view': 0}

table2 = {'schema': 'test_schema2',
'name': 'test_table3',
'description': 'test table 3'}
'description': 'test table 3',
'is_view': 0}

sql_execute.return_value = [
self._union(
Expand Down Expand Up @@ -171,18 +176,21 @@ def test_extraction_with_multiple_result(self):
ColumnMetadata('is_active', None, 'boolean', 2),
ColumnMetadata('source', 'description of source', 'varchar', 3),
ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
ColumnMetadata('ds', None, 'varchar', 5)])
ColumnMetadata('ds', None, 'varchar', 5)],
is_view=False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

expected = TableMetadata('hive', 'gold', 'test_schema1', 'test_table2', 'test table 2',
[ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)])
ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)],
is_view=False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

expected = TableMetadata('hive', 'gold', 'test_schema2', 'test_table3', 'test table 3',
[ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
ColumnMetadata('col_name3', 'description of col_name3',
'varchar', 1)])
'varchar', 1)],
is_view=False)
self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

self.assertIsNone(extractor.extract())
Expand Down

0 comments on commit 5f7224a

Please sign in to comment.