fix: update parser to correctly parse desired tokens (#55)

* fix: correct parser to scan specific tokens only
* fix: update parser for varying input types
* test: add unittest for extract_docstring_info
* fix: update parser and test
* fix: update to return summary directly
googleapis · Jun 24, 2021 · d1e18c7 · d1e18c7
1 parent e322ff4
commit d1e18c7
Show file tree

Hide file tree

Showing 2 changed files with 247 additions and 104 deletions.
diff --git a/docfx_yaml/extension.py b/docfx_yaml/extension.py
@@ -270,6 +270,124 @@ def _extract_signature(obj_sig):
     return signature, parameters
 
 
+# Given a documentation docstring, parse it into summary_info.
+def _extract_docstring_info(summary_info, summary, name):
+    top_summary = ""
+
+    # Initialize known types needing further processing.
+    var_types = {
+        ':rtype:': 'returns',
+        ':returns:': 'returns',
+        ':type': 'variables',
+        ':param': 'variables',
+        ':raises': 'exceptions',
+        ':raises:': 'exceptions'
+    }
+
+    # Clean the string by cleaning newlines and backslashes, then split by white space.
+    config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
+    # Convert Google style to reStructuredText
+    parsed_text = str(GoogleDocstring(summary, config))
+
+    # Trim the top summary but maintain its formatting.
+    indexes = []
+    for types in var_types:
+        # Ensure that we look for exactly the string we want.
+        # Adding the extra space for non-colon ending types
+        # helps determine if we simply ran into desired occurrence
+        # or if we ran into a similar looking syntax but shouldn't
+        # parse upon it.
+        types += ' ' if types[-1] != ':' else ''
+        if types in parsed_text:
+            index = parsed_text.find(types)
+            if index > -1:
+                # For now, skip on parsing custom fields like attribute
+                if types == ':type ' and 'attribute::' in parsed_text:
+                    continue
+                indexes.append(index)
+
+    # If we found types needing further processing, use the earliest index;
+    # if the list of indexes is empty, stop processing further.
+    index = min(indexes) if indexes else 0
+
+    # Store the top summary separately.
+    if index == 0:
+        return summary
+
+    top_summary = parsed_text[:index]
+    parsed_text = parsed_text[index:]
+
+    # Clean up whitespace and other characters
+    parsed_text = " ".join(filter(None, re.split(r'\|\s', parsed_text))).split()
+
+    cur_type = ''
+    words = []
+    arg_name = ''
+    index = 0
+    # Used to track return type and description
+    r_type, r_descr = '', ''
+
+    # Using counter iteration to easily extract names rather than
+    # coming up with more complicated stopping logic for each tag.
+    while index <= len(parsed_text):
+        word = parsed_text[index] if index < len(parsed_text) else ""
+        # Check if we encountered specific words.
+        if word in var_types or index == len(parsed_text):               
+            # Finish processing previous section.
+            if cur_type:
+                if cur_type == ':type':
+                    summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
+                elif cur_type == ':param':
+                    summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
+                elif ":raises" in cur_type:
+                    summary_info[var_types[cur_type]].append({
+                        'var_type': arg_name,
+                        'description': " ".join(words)
+                    })
+                else:
+                    if cur_type == ':rtype:':
+                        r_type = " ".join(words)
+                    else:
+                        r_descr = " ".join(words)
+                    if r_type and r_descr:
+                        summary_info[var_types[cur_type]].append({
+                            'var_type': r_type,
+                            'description': r_descr
+                        })
+                        r_type, r_descr = '', ''
+
+            else:
+
+                # If we get into this state after processing the top summary,
+                # we likely encountered a type that's not covered above or the docstring
+                # was badly formatted. This will likely break the docfx job later on, so
+                # we should not process further.
+                if word not in var_types:
+                    raise ValueError(f"Encountered wrong formatting, please check docstring for {name}")
+
+            # Reached end of string, break after finishing processing
+            if index == len(parsed_text):
+                break
+
+            # Start processing for new section
+            cur_type = word
+            if cur_type in [':type', ':param', ':raises', ':raises:']:
+                index += 1
+                arg_name = parsed_text[index][:-1]
+                # Initialize empty dictionary if it doesn't exist already
+                if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
+                    summary_info[var_types[cur_type]][arg_name] = {}
+
+            # Empty target string
+            words = []
+        else:
+            words.append(word)
+
+        index += 1
+
+    return top_summary
+
+
 def _create_datam(app, cls, module, name, _type, obj, lines=None):
     """
     Build the data structure for an autodoc class
@@ -291,108 +409,6 @@ def _update_friendly_package_name(path):
 
         return path
 
-    def _extract_docstring_info(summary_info, summary):
-        top_summary = ""
-
-        # Initialize known types needing further processing.
-        var_types = {
-            ':rtype:': 'returns',
-            ':returns:': 'returns',
-            ':type': 'variables',
-            ':param': 'variables',
-            ':raises': 'exceptions',
-            ':raises:': 'exceptions'
-        }
-
-        # Clean the string by cleaning newlines and backlashes, then split by white space.
-        config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
-        # Convert Google style to reStructuredText
-        parsed_text = str(GoogleDocstring(summary, config))
-
-        # Trim the top summary but maintain its formatting.
-        indexes = []
-        for types in var_types:
-            index = parsed_text.find(types)
-            if index > -1:
-                # For now, skip on parsing custom fields like attribute
-                if types == ':type' and 'attribute::' in parsed_text:
-                    continue
-                indexes.append(index)
-
-        # If we found types needing further processing, locate its index,
-        # if we found empty array for indexes, stop processing further.
-        index = min(indexes) if indexes else 0
-
-        # Store the top summary separately.
-        if index == 0:
-            top_summary = summary
-        else:
-            top_summary = parsed_text[:index]
-            parsed_text = parsed_text[index:]
-
-            # Clean up whitespace and other characters
-            parsed_text = " ".join(filter(None, re.split(r'\n|  |\|\s', parsed_text))).split(" ")
-
-            cur_type = ''
-            words = []
-            arg_name = ''
-            index = 0
-
-            # Using counter iteration to easily extract names rather than
-            # coming up with more complicated stopping logic for each tags.
-            while index <= len(parsed_text):
-                word = parsed_text[index] if index < len(parsed_text) else ""
-                # Check if we encountered specific words.
-                if word in var_types or index == len(parsed_text):
-                    # Finish processing previous section.
-                    if cur_type:
-                        if cur_type == ':type':
-                            summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
-                        elif cur_type == ':param':
-                            summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
-                        elif ":raises" in cur_type:
-                            summary_info[var_types[cur_type]].append({
-                                'var_type': arg_name,
-                                'description': " ".join(words)
-                            })
-                        elif cur_type == ':rtype:':
-                            arg_name = " ".join(words)
-                        else:
-                            summary_info[var_types[cur_type]].append({
-                                'var_type': arg_name,
-                                'description': " ".join(words)
-                            })
-                    else:
-                        # If after we processed the top summary and get in this state,
-                        # likely we encountered a type that's not covered above or the docstring
-                        # was formatted badly. This will likely break docfx job later on, should not
-                        # process further.
-                        if word not in var_types:
-                            raise ValueError("Encountered wrong formatting, please check docstrings")
-
-                    # Reached end of string, break after finishing processing
-                    if index == len(parsed_text):
-                        break
-
-                    # Start processing for new section
-                    cur_type = word
-                    if cur_type in [':type', ':param', ':raises', ':raises:']:
-                        index += 1
-                        arg_name = parsed_text[index][:-1]
-                        # Initialize empty dictionary if it doesn't exist already
-                        if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
-                            summary_info[var_types[cur_type]][arg_name] = {}
-
-                    # Empty target string
-                    words = []
-                else:
-                    words.append(word)
-
-                index += 1
-
-        return top_summary
-
-
     if lines is None:
         lines = []
     short_name = name.split('.')[-1]
@@ -421,7 +437,6 @@ def _extract_docstring_info(summary_info, summary):
                 lines = lines.split("\n") if lines else []
             except TypeError as e:
                 print("couldn't getdoc from method, function: {}".format(e))
-
 
         elif _type in [PROPERTY]:
             lines = inspect.getdoc(obj)
@@ -503,7 +518,7 @@ def _extract_docstring_info(summary_info, summary):
 
         # Extract summary info into respective sections.
         if summary:
-            top_summary = _extract_docstring_info(summary_info, summary)
+            top_summary = _extract_docstring_info(summary_info, summary, name)
             datam['summary'] = top_summary
 
     if args or sig or summary_info:

diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -1,5 +1,6 @@
 from docfx_yaml.extension import find_unique_name
 from docfx_yaml.extension import disambiguate_toc_name
+from docfx_yaml.extension import _extract_docstring_info
 
 import unittest
 
@@ -44,5 +45,132 @@ def test_disambiguate_toc_name(self):
 
         self.assertEqual(yaml_want, yaml_got)
 
+    # Variables used for testing _extract_docstring_info
+    top_summary1_want = "\nSimple test for docstring.\n\n"
+    summary_info1_want = {
+        'variables': {
+            'arg1': {
+                'var_type': 'int',
+                'description': 'simple description.'
+            },
+            'arg2': {
+                'var_type': 'str',
+                'description': 'simple description for `arg2`.'
+            }
+        },
+        'returns': [
+            {
+                'var_type': 'str', 
+                'description': 'simple description for return value.'
+            }
+        ],
+        'exceptions': [
+            {
+                'var_type': 'AttributeError', 
+                'description': 'if `condition x`.'
+            }
+        ]
+    }
+
+
+    def test_extract_docstring_info_normal_input(self):
+
+        ## Test for normal input
+        summary_info1_got = {
+            'variables': {},
+            'returns': [],
+            'exceptions': []
+        }
+
+        summary1 = """
+Simple test for docstring.
+
+Args: 
+    arg1(int): simple description.
+    arg2(str): simple description for `arg2`.
+
+Returns:
+    str: simple description for return value.
+
+Raises:
+    AttributeError: if `condition x`.
+"""
+
+        top_summary1_got = _extract_docstring_info(summary_info1_got, summary1, "")
+
+        self.assertEqual(top_summary1_got, self.top_summary1_want)
+        self.assertEqual(summary_info1_got, self.summary_info1_want)
+
+
+    def test_extract_docstring_info_mixed_format(self):
+        ## Test for input coming in mixed format.
+        summary2 = """
+Simple test for docstring.
+
+:type arg1: int
+:param arg1: simple description.
+:param arg2: simple description for `arg2`.
+:type arg2: str
+
+:rtype: str
+:returns: simple description for return value.
+
+:raises AttributeError: if `condition x`. 
+"""
+
+        summary_info2_got = {
+            'variables': {},
+            'returns': [],
+            'exceptions': []
+        }
+
+        top_summary2_got = _extract_docstring_info(summary_info2_got, summary2, "")
+
+        # Output should be same as test 1 with normal input.
+        self.assertEqual(top_summary2_got, self.top_summary1_want)
+        self.assertEqual(summary_info2_got, self.summary_info1_want)
+
+
+    def test_extract_docstring_info_check_parser(self):
+        ## Test for parser to correctly scan docstring tokens and not custom fields
+        summary_info3_want = {
+            'variables': {},
+            'returns': [],
+            'exceptions': []
+        }
+
+        summary3 = """
+Union[int, None]: Expiration time in milliseconds for a partition.
+
+If :attr:`partition_expiration` is set and <xref:type_> is
+not set, :attr:`type_` will default to
+:attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
+It could return :param: with :returns as well.
+"""
+
+        summary_info3_got = {
+            'variables': {},
+            'returns': [],
+            'exceptions': []
+        }
+
+        # Nothing should change
+        top_summary3_want = summary3
+
+        top_summary3_got = _extract_docstring_info(summary_info3_got, summary3, "")
+
+        self.assertEqual(top_summary3_got, top_summary3_want)
+        self.assertEqual(summary_info3_got, summary_info3_want)
+
+    def test_extract_docstring_info_check_error(self):
+        ## Test for incorrectly formatted docstring raising error
+        summary4 = """
+Description of docstring which should fail. 
+
+:returns:param:
+"""
+        with self.assertRaises(ValueError):
+            _extract_docstring_info({}, summary4, "error string")
+
 if __name__ == '__main__':
     unittest.main()