Skip to content

Commit

Permalink
Merge pull request #6221 from jxlil/fix/LxmlLinkExtractor
Browse files Browse the repository at this point in the history
fix: LxmlLinkExtractor unique_list missing key
  • Loading branch information
wRAR committed Apr 19, 2024
2 parents b1fe97d + 5e51417 commit a5da77d
Show file tree
Hide file tree
Showing 2 changed files with 114 additions and 2 deletions.
4 changes: 2 additions & 2 deletions scrapy/linkextractors/lxmlhtml.py
Expand Up @@ -154,7 +154,7 @@ def __init__(
unique=unique,
process=process_value,
strip=strip,
canonicalized=canonicalize,
canonicalized=not canonicalize,
)
self.allow_res = [
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
Expand Down Expand Up @@ -249,5 +249,5 @@ def extract_links(self, response):
links = self._extract_links(doc, response.url, response.encoding, base_url)
all_links.extend(self._process_links(links))
if self.link_extractor.unique:
return unique_list(all_links)
return unique_list(all_links, key=self.link_extractor.link_key)
return all_links
112 changes: 112 additions & 0 deletions tests/test_linkextractors.py
Expand Up @@ -745,6 +745,118 @@ def test_pickle_extractor(self):
lx = self.extractor_cls()
self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)

def test_link_extractor_aggregation(self):
    """Check how links gathered from several restricted regions are merged.

    With a restriction such as ``restrict_css``, the extractor runs its
    internal link extractor once for every selector that matches and then
    combines the per-selector results. The combined list has to honour
    both the ``unique`` and the ``canonicalize`` parameters.
    """
    # Two <div> regions: "/a" is linked from both, and the two "/b" links
    # differ only in the order of their query arguments.
    html = b"""
<div>
<a href="/a">a1</a>
<a href="/b?a=1&b=2">b1</a>
</div>
<div>
<a href="/a">a2</a>
<a href="/b?b=2&a=1">b2</a>
</div>
"""

    def extract(**options):
        # Every scenario gets its own extractor and its own response.
        extractor = self.extractor_cls(restrict_css=("div",), **options)
        page = HtmlResponse("https://example.com", body=html)
        return extractor.extract_links(page)

    a1 = Link(url="https://example.com/a", text="a1")
    a2 = Link(url="https://example.com/a", text="a2")
    b1 = Link(url="https://example.com/b?a=1&b=2", text="b1")
    b2_as_written = Link(url="https://example.com/b?b=2&a=1", text="b2")
    b2_canonical = Link(url="https://example.com/b?a=1&b=2", text="b2")

    # Defaults (unique=True, canonicalize=False): the repeated "/a" is
    # dropped, while both "/b" links survive because their URLs differ
    # as written.
    self.assertEqual(extract(), [a1, b1, b2_as_written])

    # unique=True (default) with canonicalize=True: the second "/b" link
    # is now treated as a duplicate of the first one.
    self.assertEqual(extract(canonicalize=True), [a1, b1])

    # unique=False with canonicalize=False (default): every link is kept
    # with its URL exactly as written.
    self.assertEqual(extract(unique=False), [a1, b1, a2, b2_as_written])

    # unique=False with canonicalize=True: every link is kept, and the
    # second "/b" URL comes back with its query arguments reordered.
    self.assertEqual(
        extract(unique=False, canonicalize=True),
        [a1, b1, a2, b2_canonical],
    )


class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
extractor_cls = LxmlLinkExtractor
Expand Down

0 comments on commit a5da77d

Please sign in to comment.