Merge pull request #6221 from jxlil/fix/LxmlLinkExtractor

fix: LxmlLinkExtractor unique_list missing key
scrapy · Apr 19, 2024 · a5da77d · a5da77d
2 parents b1fe97d + 5e51417
commit a5da77d
Show file tree

Hide file tree

Showing 2 changed files with 114 additions and 2 deletions.
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
@@ -154,7 +154,7 @@ def __init__(
             unique=unique,
             process=process_value,
             strip=strip,
-            canonicalized=canonicalize,
+            canonicalized=not canonicalize,
         )
         self.allow_res = [
             x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
@@ -249,5 +249,5 @@ def extract_links(self, response):
             links = self._extract_links(doc, response.url, response.encoding, base_url)
             all_links.extend(self._process_links(links))
         if self.link_extractor.unique:
-            return unique_list(all_links)
+            return unique_list(all_links, key=self.link_extractor.link_key)
         return all_links
diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
@@ -745,6 +745,118 @@ def test_pickle_extractor(self):
             lx = self.extractor_cls()
             self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)
 
+        def test_link_extractor_aggregation(self):
+            """When a parameter like restrict_css is used, the underlying
+            implementation calls its internal link extractor once per selector
+            matching the specified restrictions, and then aggregates the
+            extracted links.
+
+            Test that aggregation respects the unique and canonicalize
+            parameters.
+            """
+            # unique=True (default), canonicalize=False (default): dedupe on raw URLs
+            lx = self.extractor_cls(restrict_css=("div",))
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=True (default), canonicalize=True: dedupe on canonicalized URLs
+            lx = self.extractor_cls(restrict_css=("div",), canonicalize=True)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                ],
+            )
+
+            # unique=False, canonicalize=False (default): keep every link unchanged
+            lx = self.extractor_cls(restrict_css=("div",), unique=False)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=False, canonicalize=True: keep every link, URLs canonicalized
+            lx = self.extractor_cls(
+                restrict_css=("div",), unique=False, canonicalize=True
+            )
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b2"),
+                ],
+            )
+
 
 class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor