From bccb4cf18ba38c8bf09d61d19e0ffabaf15554b1 Mon Sep 17 00:00:00 2001
From: Jalil SA <61639983+jxlil@users.noreply.github.com>
Date: Wed, 14 Feb 2024 12:29:29 -0600
Subject: [PATCH 1/2] fix: LxmlLinkExtractor unique_list missing key

---
 scrapy/linkextractors/lxmlhtml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 23cbd0116bc..98781ba7fd9 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -248,5 +248,5 @@ def extract_links(self, response):
             links = self._extract_links(doc, response.url, response.encoding, base_url)
             all_links.extend(self._process_links(links))
         if self.link_extractor.unique:
-            return unique_list(all_links)
+            return unique_list(all_links, key=self.link_extractor.link_key)
         return all_links

From 5e51417a485f296354e9639f85fb0b51a4a3e533 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= <adrian@chaves.io>
Date: Fri, 16 Feb 2024 20:10:52 +0100
Subject: [PATCH 2/2] Add tests, fix canonicalize passing

---
 scrapy/linkextractors/lxmlhtml.py |   2 +-
 tests/test_linkextractors.py      | 112 ++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 98781ba7fd9..7abdaaec497 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -153,7 +153,7 @@ def __init__(
             unique=unique,
             process=process_value,
             strip=strip,
-            canonicalized=canonicalize,
+            canonicalized=not canonicalize,
         )
         self.allow_res = [
             x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
index 18e9608c1b3..f23b8988e17 100644
--- a/tests/test_linkextractors.py
+++ b/tests/test_linkextractors.py
@@ -745,6 +745,118 @@ def test_pickle_extractor(self):
             lx = self.extractor_cls()
             self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)
 
+        def test_link_extractor_aggregation(self):
+            """When a parameter like restrict_css is used, the underlying
+            implementation calls its internal link extractor once per selector
+            matching the specified restrictions, and then aggregates the
+            extracted links.
+
+            Test that aggregation respects the unique and canonicalize
+            parameters.
+            """
+            # unique=True, canonicalize=False (defaults): /a deduplicated, both /b kept
+            lx = self.extractor_cls(restrict_css=("div",))
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=True (default), canonicalize=True: b2 is a duplicate of b1
+            lx = self.extractor_cls(restrict_css=("div",), canonicalize=True)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                ],
+            )
+
+            # unique=False, canonicalize=False (default): all four links kept
+            lx = self.extractor_cls(restrict_css=("div",), unique=False)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=False, canonicalize=True: all kept; b2's query is reordered
+            lx = self.extractor_cls(
+                restrict_css=("div",), unique=False, canonicalize=True
+            )
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b2"),
+                ],
+            )
+
 
 class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor