From bccb4cf18ba38c8bf09d61d19e0ffabaf15554b1 Mon Sep 17 00:00:00 2001 From: Jalil SA <61639983+jxlil@users.noreply.github.com> Date: Wed, 14 Feb 2024 12:29:29 -0600 Subject: [PATCH 1/2] fix: LxmlLinkExtractor unique_list missing key --- scrapy/linkextractors/lxmlhtml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py index 23cbd0116bc..98781ba7fd9 100644 --- a/scrapy/linkextractors/lxmlhtml.py +++ b/scrapy/linkextractors/lxmlhtml.py @@ -248,5 +248,5 @@ def extract_links(self, response): links = self._extract_links(doc, response.url, response.encoding, base_url) all_links.extend(self._process_links(links)) if self.link_extractor.unique: - return unique_list(all_links) + return unique_list(all_links, key=self.link_extractor.link_key) return all_links From 5e51417a485f296354e9639f85fb0b51a4a3e533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 16 Feb 2024 20:10:52 +0100 Subject: [PATCH 2/2] Add tests, fix canonicalize passing --- scrapy/linkextractors/lxmlhtml.py | 2 +- tests/test_linkextractors.py | 112 ++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 1 deletion(-) diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py index 98781ba7fd9..7abdaaec497 100644 --- a/scrapy/linkextractors/lxmlhtml.py +++ b/scrapy/linkextractors/lxmlhtml.py @@ -153,7 +153,7 @@ def __init__( unique=unique, process=process_value, strip=strip, - canonicalized=canonicalize, + canonicalized=not canonicalize, ) self.allow_res = [ x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow) diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py index 18e9608c1b3..f23b8988e17 100644 --- a/tests/test_linkextractors.py +++ b/tests/test_linkextractors.py @@ -745,6 +745,118 @@ def test_pickle_extractor(self): lx = self.extractor_cls() self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls) + 
def test_link_extractor_aggregation(self):
    """When a parameter like restrict_css is used, the underlying
    implementation calls its internal link extractor once per selector
    matching the specified restrictions, and then aggregates the
    extracted links.

    Test that aggregation respects the unique and canonicalize
    parameters.
    """
    # NOTE(review): the fixture markup was missing from the extracted text
    # and is reconstructed from the URLs/texts asserted below - confirm
    # against the upstream test.
    #
    # Two <div> blocks, so restrict_css=("div",) matches twice. Both blocks
    # link to /a, while their /b links differ only in query-argument order,
    # which makes them duplicates only after canonicalization.
    body = b"""
    <div>
        <a href="/a">a1</a>
        <a href="/b?a=1&b=2">b1</a>
    </div>
    <div>
        <a href="/a">a2</a>
        <a href="/b?b=2&a=1">b2</a>
    </div>
    """

    # (extra extractor kwargs, links expected from extract_links())
    cases = (
        # unique=True (default), canonicalize=False (default)
        (
            {},
            [
                Link(url="https://example.com/a", text="a1"),
                Link(url="https://example.com/b?a=1&b=2", text="b1"),
                Link(url="https://example.com/b?b=2&a=1", text="b2"),
            ],
        ),
        # unique=True (default), canonicalize=True
        (
            {"canonicalize": True},
            [
                Link(url="https://example.com/a", text="a1"),
                Link(url="https://example.com/b?a=1&b=2", text="b1"),
            ],
        ),
        # unique=False, canonicalize=False (default)
        (
            {"unique": False},
            [
                Link(url="https://example.com/a", text="a1"),
                Link(url="https://example.com/b?a=1&b=2", text="b1"),
                Link(url="https://example.com/a", text="a2"),
                Link(url="https://example.com/b?b=2&a=1", text="b2"),
            ],
        ),
        # unique=False, canonicalize=True
        (
            {"unique": False, "canonicalize": True},
            [
                Link(url="https://example.com/a", text="a1"),
                Link(url="https://example.com/b?a=1&b=2", text="b1"),
                Link(url="https://example.com/a", text="a2"),
                Link(url="https://example.com/b?a=1&b=2", text="b2"),
            ],
        ),
    )

    for extra_kwargs, expected in cases:
        with self.subTest(**extra_kwargs):
            lx = self.extractor_cls(restrict_css=("div",), **extra_kwargs)
            # Build a fresh response per case, as the original test did.
            response = HtmlResponse("https://example.com", body=body)
            self.assertEqual(lx.extract_links(response), expected)


# Trailing context kept verbatim from the original span; the rest of this
# class lies outside the visible text.
class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
    extractor_cls = LxmlLinkExtractor