Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add custom argument parser for cell magic
- Loading branch information
Showing 4 changed files with 706 additions and 0 deletions.
There are no files selected for viewing
22 changes: 22 additions & 0 deletions
22
google/cloud/bigquery/ipython_magics/line_arg_parser/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import Lexer | ||
from google.cloud.bigquery.ipython_magics.line_arg_parser.parser import Parser | ||
from google.cloud.bigquery.ipython_magics.line_arg_parser import ( | ||
visitors, | ||
) # TODO: import all | ||
|
||
|
||
__all__ = ("Lexer", "Parser", "visitors") |
189 changes: 189 additions & 0 deletions
189
google/cloud/bigquery/ipython_magics/line_arg_parser/lexer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
from collections import namedtuple | ||
from collections import OrderedDict | ||
import itertools | ||
import re | ||
|
||
import enum | ||
|
||
|
||
# A single lexical token: its type name, the matched text, and its start
# position within the input line.
Token = namedtuple("Token", ("type_", "lexeme", "pos"))

# Emitted by a state handler to tell the lexer to switch to a new state and
# resume scanning at the given absolute offset in the input.
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))


# Token patterns grouped by lexer state. Every pattern is a named group so that
# ``match.lastgroup`` identifies the token type; the GOTO_STATE_* patterns are
# zero-width lookaheads that only signal a state transition. OrderedDicts are
# used because pattern order determines matching priority when the patterns of
# a state are joined into a single alternation.
token_types = OrderedDict(
    state_1=OrderedDict(
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--))",  # double dash - starting the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python ID
    ),
    state_2=OrderedDict(
        GOTO_STATE_3=r"(?P<GOTO_STATE_3>(?=--params(?=\s|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        # NOTE: currently the only valid value for a non "--params" option is project ID
        OPT_VAL=r"(?P<OPT_VAL>[^_\d\W](?:\w|\.)+)",
    ),
    state_3=OrderedDict(
        # Bug fix: the escape alternative must be a backslash followed by any
        # character (``\\.``), not a literal dot (``\.``). With ``\.`` the
        # pattern could never match a backslash at all (the char class already
        # excludes it), so strings with escaped quotes like "a\"b" failed.
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'  # single and double quoted strings
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|$))",
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_ID=r"(?P<PY_ID>[^\d\W]\w*)",
        # TODO: supporting only ints or floats, add floats in scientific notation, too?
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        UNKNOWN=r"(?P<UNKNOWN>\S+)",  # anything not a whitespace or matched by something else
    ),
)
|
||
|
||
class AutoStrEnum(str, enum.Enum):
    """Base enum class whose members are strings equal to their own names.

    When members are defined with ``enum.auto()`` (or via the functional
    API), each member's value is generated from the member name itself, so
    ``SomeEnum.FOO == "FOO"`` holds.
    """

    def _generate_next_value_(name, start, count, last_values):
        # The member name doubles as the member value.
        return name
|
||
|
||
# Enum of every concrete token type across all lexer states. The GOTO_STATE_*
# pseudo-tokens are excluded - they merely mark state transitions and are
# never emitted as real tokens.
TokenType = AutoStrEnum(
    "TokenType",
    [
        name
        for state_patterns in token_types.values()
        for name in state_patterns
        if not name.startswith("GOTO_STATE")
    ],
)
|
||
|
||
class LexerState(AutoStrEnum):
    """The possible modes of the cell magic line lexer.

    Member values equal the member names (via ``AutoStrEnum``).
    """

    # Parsing positional arguments (e.g. the destination variable).
    STATE_1 = enum.auto()
    # Parsing options other than "--params".
    STATE_2 = enum.auto()
    # Parsing the "--params" option.
    STATE_3 = enum.auto()
    # Terminal state - the entire input line has been consumed.
    STATE_END = enum.auto()
|
||
|
||
class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line.

    An instance is iterable and yields ``Token`` instances. The lexer is a
    small state machine: since a compiled pattern cannot be swapped mid-scan,
    zero-width lookahead GOTO_STATE_* patterns signal when to switch to a
    different state's pattern and re-scan from the current position.
    """

    # One "grand" pattern per state: the alternation of that state's token
    # patterns plus the patterns common to all states (whitespace,
    # end-of-line, unknown). Built once at class definition time.
    _GRAND_PATTERNS = {
        LexerState.STATE_1: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_1"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_2: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_2"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_3: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_3"].values(), token_types["common"].values(),
                )
            )
        ),
    }

    # Maps each zero-width transition pseudo-token to the state it activates.
    _STATE_TRANSITIONS = {
        "GOTO_STATE_2": LexerState.STATE_2,
        "GOTO_STATE_3": LexerState.STATE_3,
    }

    def __init__(self, input_text):
        """
        Args:
            input_text (str): The cell magic argument line to tokenize.
        """
        self._text = input_text
        self._state_handlers = {
            LexerState.STATE_1: self._state_1,
            LexerState.STATE_2: self._state_2,
            LexerState.STATE_3: self._state_3,
        }

    def __iter__(self):
        """Yield the ``Token`` instances of the input line (whitespace skipped)."""
        # Since re.scanner does not seem to support manipulating inner scanner
        # states, we need to implement lexer state transitions manually using
        # special non-capturing lookahead token patterns to signal when a state
        # transition should be made.
        # Each state is then processed by a dedicated state handler method.
        state = LexerState.STATE_1
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_generator = self._get_state_token_generator(state, offset)

            for maybe_token in token_generator:
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _get_state_token_generator(self, state, current_offset):
        """Create the token generator for the given lexer state.

        A new scanner over the input text is created at ``current_offset``
        using the state's grand pattern, and is handed to the state's
        dedicated handler method.

        Args:
            state (LexerState): The state to produce tokens for.
            current_offset (int): Input position at which to resume scanning.

        Returns:
            A generator yielding ``Token`` and ``StateTransition`` instances.
        """
        state_handler = self._state_handlers[state]
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.scanner(self._text, pos=current_offset)
        return state_handler(scanner)

    def _scan(self, scanner, transition_token):
        """Yield tokens from ``scanner``, signaling one possible transition.

        When the zero-width ``transition_token`` pattern matches, a
        ``StateTransition`` is yielded first so that ``__iter__`` can switch
        states and re-scan from the match position (the trailing ``Token``
        for the same match is never consumed in that case).
        """
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type == transition_token:
                yield StateTransition(
                    new_state=self._STATE_TRANSITIONS[token_type],
                    total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())

    def _state_1(self, scanner):
        """Tokenize positional arguments; hand off at the first ``--`` option."""
        return self._scan(scanner, "GOTO_STATE_2")

    def _state_2(self, scanner):
        """Tokenize non-"--params" options; hand off at the "--params" option."""
        return self._scan(scanner, "GOTO_STATE_3")

    def _state_3(self, scanner):
        """Tokenize the "--params" option value; hand off at the next option."""
        return self._scan(scanner, "GOTO_STATE_2")
Oops, something went wrong.