Skip to content

Commit

Permalink
fix: update parser to correctly parse desired tokens (#55)
Browse files Browse the repository at this point in the history
* fix: correct parser to scan specific tokens only

* fix: update parser for varying input types

* test: add unittest for extract_docstring_info

* fix: update parser and test

* fix: update to return summary directly
  • Loading branch information
dandhlee committed Jun 24, 2021
1 parent e322ff4 commit d1e18c7
Show file tree
Hide file tree
Showing 2 changed files with 247 additions and 104 deletions.
223 changes: 119 additions & 104 deletions docfx_yaml/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,124 @@ def _extract_signature(obj_sig):
return signature, parameters


# Given documentation docstring, parse them into summary_info.
def _extract_docstring_info(summary_info, summary, name):
top_summary = ""

# Initialize known types needing further processing.
var_types = {
':rtype:': 'returns',
':returns:': 'returns',
':type': 'variables',
':param': 'variables',
':raises': 'exceptions',
':raises:': 'exceptions'
}

# Clean the string by cleaning newlines and backlashes, then split by white space.
config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
# Convert Google style to reStructuredText
parsed_text = str(GoogleDocstring(summary, config))

# Trim the top summary but maintain its formatting.
indexes = []
for types in var_types:
# Ensure that we look for exactly the string we want.
# Adding the extra space for non-colon ending types
# helps determine if we simply ran into desired occurrence
# or if we ran into a similar looking syntax but shouldn't
# parse upon it.
types += ' ' if types[-1] != ':' else ''
if types in parsed_text:
index = parsed_text.find(types)
if index > -1:
# For now, skip on parsing custom fields like attribute
if types == ':type ' and 'attribute::' in parsed_text:
continue
indexes.append(index)

# If we found types needing further processing, locate its index,
# if we found empty array for indexes, stop processing further.
index = min(indexes) if indexes else 0

# Store the top summary separately.
if index == 0:
return summary

top_summary = parsed_text[:index]
parsed_text = parsed_text[index:]

# Clean up whitespace and other characters
parsed_text = " ".join(filter(None, re.split(r'\|\s', parsed_text))).split()

cur_type = ''
words = []
arg_name = ''
index = 0
# Used to track return type and description
r_type, r_descr = '', ''

# Using counter iteration to easily extract names rather than
# coming up with more complicated stopping logic for each tags.
while index <= len(parsed_text):
word = parsed_text[index] if index < len(parsed_text) else ""
# Check if we encountered specific words.
if word in var_types or index == len(parsed_text):
# Finish processing previous section.
if cur_type:
if cur_type == ':type':
summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
elif cur_type == ':param':
summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
elif ":raises" in cur_type:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
else:
if cur_type == ':rtype:':
r_type = " ".join(words)
else:
r_descr = " ".join(words)
if r_type and r_descr:
summary_info[var_types[cur_type]].append({
'var_type': r_type,
'description': r_descr
})
r_type, r_descr = '', ''

else:

# If after we processed the top summary and get in this state,
# likely we encountered a type that's not covered above or the docstring
# was formatted badly. This will likely break docfx job later on, should not
# process further.
if word not in var_types:
raise ValueError(f"Encountered wrong formatting, please check docstring for {name}")

# Reached end of string, break after finishing processing
if index == len(parsed_text):
break

# Start processing for new section
cur_type = word
if cur_type in [':type', ':param', ':raises', ':raises:']:
index += 1
arg_name = parsed_text[index][:-1]
# Initialize empty dictionary if it doesn't exist already
if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
summary_info[var_types[cur_type]][arg_name] = {}

# Empty target string
words = []
else:
words.append(word)

index += 1

return top_summary


def _create_datam(app, cls, module, name, _type, obj, lines=None):
"""
Build the data structure for an autodoc class
Expand All @@ -291,108 +409,6 @@ def _update_friendly_package_name(path):

return path

def _extract_docstring_info(summary_info, summary):
top_summary = ""

# Initialize known types needing further processing.
var_types = {
':rtype:': 'returns',
':returns:': 'returns',
':type': 'variables',
':param': 'variables',
':raises': 'exceptions',
':raises:': 'exceptions'
}

# Clean the string by cleaning newlines and backlashes, then split by white space.
config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
# Convert Google style to reStructuredText
parsed_text = str(GoogleDocstring(summary, config))

# Trim the top summary but maintain its formatting.
indexes = []
for types in var_types:
index = parsed_text.find(types)
if index > -1:
# For now, skip on parsing custom fields like attribute
if types == ':type' and 'attribute::' in parsed_text:
continue
indexes.append(index)

# If we found types needing further processing, locate its index,
# if we found empty array for indexes, stop processing further.
index = min(indexes) if indexes else 0

# Store the top summary separately.
if index == 0:
top_summary = summary
else:
top_summary = parsed_text[:index]
parsed_text = parsed_text[index:]

# Clean up whitespace and other characters
parsed_text = " ".join(filter(None, re.split(r'\n| |\|\s', parsed_text))).split(" ")

cur_type = ''
words = []
arg_name = ''
index = 0

# Using counter iteration to easily extract names rather than
# coming up with more complicated stopping logic for each tags.
while index <= len(parsed_text):
word = parsed_text[index] if index < len(parsed_text) else ""
# Check if we encountered specific words.
if word in var_types or index == len(parsed_text):
# Finish processing previous section.
if cur_type:
if cur_type == ':type':
summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
elif cur_type == ':param':
summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
elif ":raises" in cur_type:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
elif cur_type == ':rtype:':
arg_name = " ".join(words)
else:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
else:
# If after we processed the top summary and get in this state,
# likely we encountered a type that's not covered above or the docstring
# was formatted badly. This will likely break docfx job later on, should not
# process further.
if word not in var_types:
raise ValueError("Encountered wrong formatting, please check docstrings")

# Reached end of string, break after finishing processing
if index == len(parsed_text):
break

# Start processing for new section
cur_type = word
if cur_type in [':type', ':param', ':raises', ':raises:']:
index += 1
arg_name = parsed_text[index][:-1]
# Initialize empty dictionary if it doesn't exist already
if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
summary_info[var_types[cur_type]][arg_name] = {}

# Empty target string
words = []
else:
words.append(word)

index += 1

return top_summary


if lines is None:
lines = []
short_name = name.split('.')[-1]
Expand Down Expand Up @@ -421,7 +437,6 @@ def _extract_docstring_info(summary_info, summary):
lines = lines.split("\n") if lines else []
except TypeError as e:
print("couldn't getdoc from method, function: {}".format(e))


elif _type in [PROPERTY]:
lines = inspect.getdoc(obj)
Expand Down Expand Up @@ -503,7 +518,7 @@ def _extract_docstring_info(summary_info, summary):

# Extract summary info into respective sections.
if summary:
top_summary = _extract_docstring_info(summary_info, summary)
top_summary = _extract_docstring_info(summary_info, summary, name)
datam['summary'] = top_summary

if args or sig or summary_info:
Expand Down
128 changes: 128 additions & 0 deletions tests/test_unit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from docfx_yaml.extension import find_unique_name
from docfx_yaml.extension import disambiguate_toc_name
from docfx_yaml.extension import _extract_docstring_info

import unittest

Expand Down Expand Up @@ -44,5 +45,132 @@ def test_disambiguate_toc_name(self):

self.assertEqual(yaml_want, yaml_got)

# Variables used for testing _extract_docstring_info
top_summary1_want = "\nSimple test for docstring.\n\n"
summary_info1_want = {
'variables': {
'arg1': {
'var_type': 'int',
'description': 'simple description.'
},
'arg2': {
'var_type': 'str',
'description': 'simple description for `arg2`.'
}
},
'returns': [
{
'var_type': 'str',
'description': 'simple description for return value.'
}
],
'exceptions': [
{
'var_type': 'AttributeError',
'description': 'if `condition x`.'
}
]
}


def test_extract_docstring_info_normal_input(self):

## Test for normal input
summary_info1_got = {
'variables': {},
'returns': [],
'exceptions': []
}

summary1 = """
Simple test for docstring.
Args:
arg1(int): simple description.
arg2(str): simple description for `arg2`.
Returns:
str: simple description for return value.
Raises:
AttributeError: if `condition x`.
"""

top_summary1_got = _extract_docstring_info(summary_info1_got, summary1, "")

self.assertEqual(top_summary1_got, self.top_summary1_want)
self.assertEqual(summary_info1_got, self.summary_info1_want)


def test_extract_docstring_info_mixed_format(self):
## Test for input coming in mixed format.
summary2 = """
Simple test for docstring.
:type arg1: int
:param arg1: simple description.
:param arg2: simple description for `arg2`.
:type arg2: str
:rtype: str
:returns: simple description for return value.
:raises AttributeError: if `condition x`.
"""

summary_info2_got = {
'variables': {},
'returns': [],
'exceptions': []
}

top_summary2_got = _extract_docstring_info(summary_info2_got, summary2, "")

# Output should be same as test 1 with normal input.
self.assertEqual(top_summary2_got, self.top_summary1_want)
self.assertEqual(summary_info2_got, self.summary_info1_want)


def test_extract_docstring_info_check_parser(self):
## Test for parser to correctly scan docstring tokens and not custom fields
summary_info3_want = {
'variables': {},
'returns': [],
'exceptions': []
}

summary3 = """
Union[int, None]: Expiration time in milliseconds for a partition.
If :attr:`partition_expiration` is set and <xref:type_> is
not set, :attr:`type_` will default to
:attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
It could return :param: with :returns as well.
"""

summary_info3_got = {
'variables': {},
'returns': [],
'exceptions': []
}

# Nothing should change
top_summary3_want = summary3

top_summary3_got = _extract_docstring_info(summary_info3_got, summary3, "")

self.assertEqual(top_summary3_got, top_summary3_want)
self.assertEqual(summary_info3_got, summary_info3_want)

def test_extract_docstring_info_check_error(self):
## Test for incorrectly formmatted docstring raising error
summary4 = """
Description of docstring which should fail.
:returns:param:
"""
with self.assertRaises(ValueError):
_extract_docstring_info({}, summary4, "error string")

if __name__ == '__main__':
unittest.main()

0 comments on commit d1e18c7

Please sign in to comment.