Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update parser to correctly parse desired tokens #55

Merged
merged 6 commits into from
Jun 24, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
224 changes: 120 additions & 104 deletions docfx_yaml/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,125 @@ def _extract_signature(obj_sig):
return signature, parameters


# Given documentation docstring, parse them into summary_info.
def _extract_docstring_info(summary_info, summary, name):
top_summary = ""

# Initialize known types needing further processing.
var_types = {
':rtype:': 'returns',
':returns:': 'returns',
':type': 'variables',
':param': 'variables',
':raises': 'exceptions',
':raises:': 'exceptions'
}

# Clean the string by cleaning newlines and backlashes, then split by white space.
config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
# Convert Google style to reStructuredText
parsed_text = str(GoogleDocstring(summary, config))

# Trim the top summary but maintain its formatting.
indexes = []
for types in var_types:
# Ensure that we look for exactly the string we want.
# Adding the extra space for non-colon ending types
# helps determine if we simply ran into desired occurrence
# or if we ran into a similar looking syntax but shouldn't
# parse upon it.
types += ' ' if types[-1] != ':' else ''
if types in parsed_text:
index = parsed_text.find(types)
if index > -1:
# For now, skip on parsing custom fields like attribute
if types == ':type ' and 'attribute::' in parsed_text:
continue
indexes.append(index)

# If we found types needing further processing, locate its index,
# if we found empty array for indexes, stop processing further.
index = min(indexes) if indexes else 0

# Store the top summary separately.
if index == 0:
top_summary = summary
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we return at this point and avoid needing the else (and the indentation that comes with it)?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just return summary directly?

return top_summary

top_summary = parsed_text[:index]
parsed_text = parsed_text[index:]

# Clean up whitespace and other characters
parsed_text = " ".join(filter(None, re.split(r'\|\s', parsed_text))).split()

cur_type = ''
words = []
arg_name = ''
index = 0
# Used to track return type and description
r_type, r_descr = '', ''

# Using counter iteration to easily extract names rather than
# coming up with more complicated stopping logic for each tags.
while index <= len(parsed_text):
word = parsed_text[index] if index < len(parsed_text) else ""
# Check if we encountered specific words.
if word in var_types or index == len(parsed_text):
# Finish processing previous section.
if cur_type:
if cur_type == ':type':
summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
elif cur_type == ':param':
summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
elif ":raises" in cur_type:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
else:
if cur_type == ':rtype:':
r_type = " ".join(words)
else:
r_descr = " ".join(words)
if r_type and r_descr:
summary_info[var_types[cur_type]].append({
'var_type': r_type,
'description': r_descr
})
r_type, r_descr = '', ''

else:

# If after we processed the top summary and get in this state,
# likely we encountered a type that's not covered above or the docstring
# was formatted badly. This will likely break docfx job later on, should not
# process further.
if word not in var_types:
raise ValueError(f"Encountered wrong formatting, please check docstring for {name}")

# Reached end of string, break after finishing processing
if index == len(parsed_text):
break

# Start processing for new section
cur_type = word
if cur_type in [':type', ':param', ':raises', ':raises:']:
index += 1
arg_name = parsed_text[index][:-1]
# Initialize empty dictionary if it doesn't exist already
if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
summary_info[var_types[cur_type]][arg_name] = {}

# Empty target string
words = []
else:
words.append(word)

index += 1

return top_summary


def _create_datam(app, cls, module, name, _type, obj, lines=None):
"""
Build the data structure for an autodoc class
Expand All @@ -291,108 +410,6 @@ def _update_friendly_package_name(path):

return path

def _extract_docstring_info(summary_info, summary):
top_summary = ""

# Initialize known types needing further processing.
var_types = {
':rtype:': 'returns',
':returns:': 'returns',
':type': 'variables',
':param': 'variables',
':raises': 'exceptions',
':raises:': 'exceptions'
}

# Clean the string by cleaning newlines and backlashes, then split by white space.
config = Config(napoleon_use_param=True, napoleon_use_rtype=True)
# Convert Google style to reStructuredText
parsed_text = str(GoogleDocstring(summary, config))

# Trim the top summary but maintain its formatting.
indexes = []
for types in var_types:
index = parsed_text.find(types)
if index > -1:
# For now, skip on parsing custom fields like attribute
if types == ':type' and 'attribute::' in parsed_text:
continue
indexes.append(index)

# If we found types needing further processing, locate its index,
# if we found empty array for indexes, stop processing further.
index = min(indexes) if indexes else 0

# Store the top summary separately.
if index == 0:
top_summary = summary
else:
top_summary = parsed_text[:index]
parsed_text = parsed_text[index:]

# Clean up whitespace and other characters
parsed_text = " ".join(filter(None, re.split(r'\n| |\|\s', parsed_text))).split(" ")

cur_type = ''
words = []
arg_name = ''
index = 0

# Using counter iteration to easily extract names rather than
# coming up with more complicated stopping logic for each tags.
while index <= len(parsed_text):
word = parsed_text[index] if index < len(parsed_text) else ""
# Check if we encountered specific words.
if word in var_types or index == len(parsed_text):
# Finish processing previous section.
if cur_type:
if cur_type == ':type':
summary_info[var_types[cur_type]][arg_name]['var_type'] = " ".join(words)
elif cur_type == ':param':
summary_info[var_types[cur_type]][arg_name]['description'] = " ".join(words)
elif ":raises" in cur_type:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
elif cur_type == ':rtype:':
arg_name = " ".join(words)
else:
summary_info[var_types[cur_type]].append({
'var_type': arg_name,
'description': " ".join(words)
})
else:
# If after we processed the top summary and get in this state,
# likely we encountered a type that's not covered above or the docstring
# was formatted badly. This will likely break docfx job later on, should not
# process further.
if word not in var_types:
raise ValueError("Encountered wrong formatting, please check docstrings")

# Reached end of string, break after finishing processing
if index == len(parsed_text):
break

# Start processing for new section
cur_type = word
if cur_type in [':type', ':param', ':raises', ':raises:']:
index += 1
arg_name = parsed_text[index][:-1]
# Initialize empty dictionary if it doesn't exist already
if arg_name not in summary_info[var_types[cur_type]] and ':raises' not in cur_type:
summary_info[var_types[cur_type]][arg_name] = {}

# Empty target string
words = []
else:
words.append(word)

index += 1

return top_summary


if lines is None:
lines = []
short_name = name.split('.')[-1]
Expand Down Expand Up @@ -421,7 +438,6 @@ def _extract_docstring_info(summary_info, summary):
lines = lines.split("\n") if lines else []
except TypeError as e:
print("couldn't getdoc from method, function: {}".format(e))


elif _type in [PROPERTY]:
lines = inspect.getdoc(obj)
Expand Down Expand Up @@ -503,7 +519,7 @@ def _extract_docstring_info(summary_info, summary):

# Extract summary info into respective sections.
if summary:
top_summary = _extract_docstring_info(summary_info, summary)
top_summary = _extract_docstring_info(summary_info, summary, name)
datam['summary'] = top_summary

if args or sig or summary_info:
Expand Down
128 changes: 128 additions & 0 deletions tests/test_unit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from docfx_yaml.extension import find_unique_name
from docfx_yaml.extension import disambiguate_toc_name
from docfx_yaml.extension import _extract_docstring_info

import unittest

Expand Down Expand Up @@ -44,5 +45,132 @@ def test_disambiguate_toc_name(self):

self.assertEqual(yaml_want, yaml_got)

# Variables used for testing _extract_docstring_info
top_summary1_want = "\nSimple test for docstring.\n\n"
summary_info1_want = {
'variables': {
'arg1': {
'var_type': 'int',
'description': 'simple description.'
},
'arg2': {
'var_type': 'str',
'description': 'simple description for `arg2`.'
}
},
'returns': [
{
'var_type': 'str',
'description': 'simple description for return value.'
}
],
'exceptions': [
{
'var_type': 'AttributeError',
'description': 'if `condition x`.'
}
]
}


def test_extract_docstring_info_normal_input(self):

## Test for normal input
summary_info1_got = {
'variables': {},
'returns': [],
'exceptions': []
}

summary1 = """
Simple test for docstring.

Args:
arg1(int): simple description.
arg2(str): simple description for `arg2`.

Returns:
str: simple description for return value.

Raises:
AttributeError: if `condition x`.
"""

top_summary1_got = _extract_docstring_info(summary_info1_got, summary1, "")

self.assertEqual(top_summary1_got, self.top_summary1_want)
self.assertEqual(summary_info1_got, self.summary_info1_want)


def test_extract_docstring_info_mixed_format(self):
## Test for input coming in mixed format.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider creating a separate test case for each summary. It's a bit hard to follow all of these as-is.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done.

summary2 = """
Simple test for docstring.

:type arg1: int
:param arg1: simple description.
:param arg2: simple description for `arg2`.
:type arg2: str

:rtype: str
:returns: simple description for return value.

:raises AttributeError: if `condition x`.
"""

summary_info2_got = {
'variables': {},
'returns': [],
'exceptions': []
}

top_summary2_got = _extract_docstring_info(summary_info2_got, summary2, "")

# Output should be same as test 1 with normal input.
self.assertEqual(top_summary2_got, self.top_summary1_want)
self.assertEqual(summary_info2_got, self.summary_info1_want)


def test_extract_docstring_info_check_parser(self):
## Test for parser to correctly scan docstring tokens and not custom fields
summary_info3_want = {
'variables': {},
'returns': [],
'exceptions': []
}

summary3 = """
Union[int, None]: Expiration time in milliseconds for a partition.

If :attr:`partition_expiration` is set and <xref:type_> is
not set, :attr:`type_` will default to
:attr:`~google.cloud.bigquery.table.TimePartitioningType.DAY`.
It could return :param: with :returns as well.
"""

summary_info3_got = {
'variables': {},
'returns': [],
'exceptions': []
}

# Nothing should change
top_summary3_want = summary3

top_summary3_got = _extract_docstring_info(summary_info3_got, summary3, "")

self.assertEqual(top_summary3_got, top_summary3_want)
self.assertEqual(summary_info3_got, summary_info3_want)

def test_extract_docstring_info_check_error(self):
## Test for incorrectly formmatted docstring raising error
summary4 = """
Description of docstring which should fail.

:returns:param:
"""
with self.assertRaises(ValueError):
_extract_docstring_info({}, summary4, "error string")

if __name__ == '__main__':
unittest.main()