diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 55bc0fc4305..71c6d08fc8a 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -154,7 +154,7 @@ def __init__(
             unique=unique,
             process=process_value,
             strip=strip,
-            canonicalized=canonicalize,
+            canonicalized=not canonicalize,
         )
         self.allow_res = [
             x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
@@ -249,5 +249,5 @@ def extract_links(self, response):
             links = self._extract_links(doc, response.url, response.encoding, base_url)
             all_links.extend(self._process_links(links))
         if self.link_extractor.unique:
-            return unique_list(all_links)
+            return unique_list(all_links, key=self.link_extractor.link_key)
         return all_links
diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
index 217c7a29904..d9c09a16a8e 100644
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@@ -745,6 +745,118 @@ def test_pickle_extractor(self):
             lx = self.extractor_cls()
             self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)
 
+        def test_link_extractor_aggregation(self):
+            """When a parameter like restrict_css is used, the underlying
+            implementation calls its internal link extractor once per selector
+            matching the specified restrictions, and then aggregates the
+            extracted links.
+
+            Test that aggregation respects the unique and canonicalize
+            parameters.
+            """
+            # unique=True (default), canonicalize=False (default)
+            lx = self.extractor_cls(restrict_css=("div",))
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=True (default), canonicalize=True
+            lx = self.extractor_cls(restrict_css=("div",), canonicalize=True)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                ],
+            )
+
+            # unique=False, canonicalize=False (default)
+            lx = self.extractor_cls(restrict_css=("div",), unique=False)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=False, canonicalize=True
+            lx = self.extractor_cls(
+                restrict_css=("div",), unique=False, canonicalize=True
+            )
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b2"),
+                ],
+            )
+
 
 class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor