Commit
Add custom argument parser for cell magic
plamut committed Aug 5, 2020
1 parent cc274f7 commit 5aa7f07
Showing 4 changed files with 706 additions and 0 deletions.
22 changes: 22 additions & 0 deletions google/cloud/bigquery/ipython_magics/line_arg_parser/__init__.py
@@ -0,0 +1,22 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud.bigquery.ipython_magics.line_arg_parser.lexer import Lexer
from google.cloud.bigquery.ipython_magics.line_arg_parser.parser import Parser
from google.cloud.bigquery.ipython_magics.line_arg_parser import (
    visitors,
)  # TODO: import all


__all__ = ("Lexer", "Parser", "visitors")
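
With these re-exports in place, callers can pull the parser components from the package root (a usage sketch; Parser and the visitors module are added by this commit's other files):

    from google.cloud.bigquery.ipython_magics.line_arg_parser import Lexer, Parser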
189 changes: 189 additions & 0 deletions google/cloud/bigquery/ipython_magics/line_arg_parser/lexer.py
@@ -0,0 +1,189 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple
from collections import OrderedDict
import enum
import itertools
import re


Token = namedtuple("Token", ("type_", "lexeme", "pos"))
StateTransition = namedtuple("StateTransition", ("new_state", "total_offset"))
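# A Token records the name of the regex group that matched (its type), the
# matched text itself, and the character offset of the match in the input
# line. A StateTransition instructs the lexer to switch to a new state and
# resume scanning from the given total offset.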


token_types = OrderedDict(
    state_1=OrderedDict(
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--))",  # a double dash starts the options list
        DEST_VAR=r"(?P<DEST_VAR>[^\d\W]\w*)",  # essentially a Python identifier
    ),
    state_2=OrderedDict(
        GOTO_STATE_3=r"(?P<GOTO_STATE_3>(?=--params(?=\s|$)))",  # the --params option
        OPTION_SPEC=r"(?P<OPTION_SPEC>--\w+)",
        # NOTE: currently the only valid value for a non "--params" option is a project ID
        OPT_VAL=r"(?P<OPT_VAL>[^_\d\W](?:\w|\.)+)",
    ),
    state_3=OrderedDict(
        PY_STRING=r"(?P<PY_STRING>(?:{})|(?:{}))".format(
            r"'(?:[^'\\]|\\.)*'", r'"(?:[^"\\]|\\.)*"'  # single and double quoted strings
        ),
        PARAMS_OPT_SPEC=r"(?P<PARAMS_OPT_SPEC>--params(?=\s|$))",
        GOTO_STATE_2=r"(?P<GOTO_STATE_2>(?=--\w+))",  # found another option spec
        PY_BOOL=r"(?P<PY_BOOL>True|False)",
        DOLLAR_PY_ID=r"(?P<DOLLAR_PY_ID>\$[^\d\W]\w*)",
        PY_ID=r"(?P<PY_ID>[^\d\W]\w*)",
        # TODO: only ints and floats are supported; add floats in scientific notation, too?
        PY_NUMBER=r"(?P<PY_NUMBER>-?[1-9]\d*(?:\.\d+)?)",
        SQUOTE=r"(?P<SQUOTE>')",
        DQUOTE=r'(?P<DQUOTE>")',
        COLON=r"(?P<COLON>:)",
        COMMA=r"(?P<COMMA>,)",
        LCURL=r"(?P<LCURL>\{)",
        RCURL=r"(?P<RCURL>})",
        LSQUARE=r"(?P<LSQUARE>\[)",
        RSQUARE=r"(?P<RSQUARE>])",
        LPAREN=r"(?P<LPAREN>\()",
        RPAREN=r"(?P<RPAREN>\))",
    ),
    common=OrderedDict(
        WS=r"(?P<WS>\s+)",
        EOL=r"(?P<EOL>$)",
        # anything that is not whitespace and is not matched by anything else
        UNKNOWN=r"(?P<UNKNOWN>\S+)",
    ),
)
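
# For illustration, the scanning technique used with the patterns above (a
# hypothetical two-token pattern, not one of the grand patterns below):
#
#     pattern = re.compile(r"(?P<WORD>[^\d\W]\w*)|(?P<WS>\s+)")
#     scanner = pattern.scanner("hello world")
#     for match in iter(scanner.match, None):
#         print(match.lastgroup, repr(match.group()))
#
# This prints WORD 'hello', WS ' ', WORD 'world'. match.lastgroup names the
# alternative that matched, which is how the lexer below maps each match back
# to its token type.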


class AutoStrEnum(str, enum.Enum):
    def _generate_next_value_(name, start, count, last_values):
        return name
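
# For illustration (hypothetical members, not part of this module): auto()
# values generated by AutoStrEnum equal the member names, and because the
# class also derives from str, members compare equal to plain strings:
#
#     class Direction(AutoStrEnum):
#         NORTH = enum.auto()
#
#     assert Direction.NORTH == "NORTH"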


TokenType = AutoStrEnum(
    "TokenType",
    [
        name
        for name in itertools.chain.from_iterable(token_types.values())
        if not name.startswith("GOTO_STATE")
    ],
)


class LexerState(AutoStrEnum):
    STATE_1 = enum.auto()  # parsing positional arguments
    STATE_2 = enum.auto()  # parsing options other than "--params"
    STATE_3 = enum.auto()  # parsing the "--params" option
    STATE_END = enum.auto()


class Lexer(object):
    """Lexical analyzer for tokenizing the cell magic input line."""

    _GRAND_PATTERNS = {
        LexerState.STATE_1: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_1"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_2: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_2"].values(), token_types["common"].values(),
                )
            )
        ),
        LexerState.STATE_3: re.compile(
            "|".join(
                itertools.chain(
                    token_types["state_3"].values(), token_types["common"].values(),
                )
            )
        ),
    }

    def __init__(self, input_text):
        self._text = input_text
        self._state_handlers = {
            LexerState.STATE_1: self._state_1,
            LexerState.STATE_2: self._state_2,
            LexerState.STATE_3: self._state_3,
        }

    def __iter__(self):
        # Since re.scanner does not seem to support manipulating inner scanner
        # states, we need to implement lexer state transitions manually using
        # special non-capturing lookahead token patterns that signal when a
        # state transition should be made.
        # Each state is then processed by a dedicated state handler method.
        state = LexerState.STATE_1
        offset = 0  # the number of characters processed so far

        while state != LexerState.STATE_END:
            token_generator = self._get_state_token_generator(state, offset)

            for maybe_token in token_generator:
                if isinstance(maybe_token, StateTransition):
                    state = maybe_token.new_state
                    offset = maybe_token.total_offset
                    break

                if maybe_token.type_ != TokenType.WS:
                    yield maybe_token

                if maybe_token.type_ == TokenType.EOL:
                    state = LexerState.STATE_END
                    break

    def _get_state_token_generator(self, state, current_offset):
        """Return the token generator for the given lexer state.

        Pick the state handler and the grand pattern for ``state``, create a
        scanner over the input text starting at ``current_offset``, and return
        the handler generator driven by that scanner.
        """
        state_handler = self._state_handlers[state]
        pattern = self._GRAND_PATTERNS[state]
        scanner = pattern.scanner(self._text, pos=current_offset)
        return state_handler(scanner)

    def _state_1(self, scanner):
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type == "GOTO_STATE_2":
                # __iter__ abandons this generator right after a transition,
                # thus no Token is produced for the zero-width lookahead match.
                yield StateTransition(
                    new_state=LexerState.STATE_2, total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())

    def _state_2(self, scanner):
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type == "GOTO_STATE_3":
                yield StateTransition(
                    new_state=LexerState.STATE_3, total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())

    def _state_3(self, scanner):
        for match in iter(scanner.match, None):
            token_type = match.lastgroup

            if token_type == "GOTO_STATE_2":
                yield StateTransition(
                    new_state=LexerState.STATE_2, total_offset=match.start(),
                )

            yield Token(token_type, match.group(), match.start())
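
A minimal usage sketch of the lexer above (the sample line and parameter values are illustrative, and the package-root import assumes the rest of this commit's files are present):

    from google.cloud.bigquery.ipython_magics.line_arg_parser import Lexer

    line = "my_df --project proj123 --params {'min_age': 18}"
    for token in Lexer(line):
        print(token)

    # Yields roughly: Token(type_='DEST_VAR', lexeme='my_df', pos=0), then
    # OPTION_SPEC/OPT_VAL tokens for --project, then PARAMS_OPT_SPEC, LCURL,
    # PY_STRING, COLON, PY_NUMBER, RCURL, and a final EOL token (whitespace
    # tokens are filtered out by __iter__).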
