Feature/section link fix (#94)

* header section links * add pulling content from header section * minor improvement
barrust · Dec 10, 2020 · ea5d351 · ea5d351
1 parent 7fa60fb
commit ea5d351
Show file tree

Hide file tree

Showing 5 changed files with 125 additions and 25 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,10 +3,11 @@
 ## Version 0.6.7 (planned)
 
 * ***NOTE:*** Last planned support for **Python 2.7**
-* Cache results of `BeautifulSoup` parsing of `page.html` [PR #90](https://github.com/barrust/mediawiki/pull/90) Thank [ldorigo](https://github.com/ldorigo)
+* Cache results of `BeautifulSoup` parsing of `page.html` [PR #90](https://github.com/barrust/mediawiki/pull/90) Thanks [ldorigo](https://github.com/ldorigo)
+* Add ability to pull links from the header section; Thanks to [ldorigo](https://github.com/ldorigo) for example code!
+* Add ability to pull the header section text
 * Move to GitHub Actions and CodeCov for testing
 
-
 ## Version 0.6.6
 
 * Fix a bug using `find_all()` on newer versions of BeautifulSoup4

diff --git a/mediawiki/mediawikipage.py b/mediawiki/mediawikipage.py
@@ -472,31 +472,43 @@ def section(self, section_title):
         """ Plain text section content
 
             Args:
-                section_title (str): Name of the section to pull
+                section_title (str): Name of the section to pull or None \
+                    for the header section
             Returns:
                 str: The content of the section
+            Note:
+                Use **None** if the header section is desired
             Note:
                 Returns **None** if section title is not found; only text \
                 between title and next section or sub-section title is returned
             Note:
                 Side effect is to also pull the content which can be slow
             Note:
                 This is a parsing operation and not part of the standard API"""
-        section = "== {0} ==".format(section_title)
-        try:
-            content = self.content
-            index = content.index(section) + len(section)
-
-            # ensure we have the full section header...
-            while True:
-                if content[index + 1] == "=":
-                    index += 1
-                else:
-                    break
-        except ValueError:
-            return None
-        except IndexError:
-            pass
+        if not section_title:
+            try:
+                content = self.content
+                index = 0
+            except ValueError:
+                return None
+            except IndexError:
+                pass
+        else:
+            section = "== {0} ==".format(section_title)
+            try:
+                content = self.content
+                index = content.index(section) + len(section)
+
+                # ensure we have the full section header...
+                while True:
+                    if content[index + 1] == "=":
+                        index += 1
+                    else:
+                        break
+            except ValueError:
+                return None
+            except IndexError:
+                pass
 
         try:
             next_index = self.content.index("==", index)
@@ -509,9 +521,13 @@ def parse_section_links(self, section_title):
         """ Parse all links within a section
 
             Args:
-                section_title (str): Name of the section to pull
+                section_title (str): Name of the section to pull or, if \
+                    None is provided, the links between the main heading and \
+                    the first section
             Returns:
                 list: List of (title, url) tuples
+            Note:
+                Use **None** to pull the links from the header section
             Note:
                 Returns **None** if section title is not found
             Note:
@@ -522,6 +538,9 @@ def parse_section_links(self, section_title):
         if not self._soup:
             self._soup = BeautifulSoup(self.html, "html.parser")
 
+        if not section_title:
+            return self._parse_section_links(None)
+
         headlines = self._soup.find_all("span", class_="mw-headline")
         tmp_soup = BeautifulSoup(section_title, "html.parser")
         tmp_sec_title = tmp_soup.get_text().lower()
@@ -673,21 +692,34 @@ def _continued_query(self, query_params, key="pages"):
 
     def _parse_section_links(self, id_tag):
         """ given a section id, parse the links in the unordered list """
-
-        info = self._soup.find("span", {"id": id_tag})
         all_links = list()
 
-        if info is None:
-            return all_links
+        if id_tag is None:
+            root = self._soup.find("div", {"class": "mw-parser-output"})
+            if root is None:
+                return all_links
+            candidates = root.children
+        else:
+            root = self._soup.find("span", {"id": id_tag})
+            if root is None:
+                return all_links
+            candidates = self._soup.find(id=id_tag).parent.next_siblings
 
-        for node in self._soup.find(id=id_tag).parent.next_siblings:
+        for node in candidates:
             if not isinstance(node, Tag):
                 continue
             if node.get("role", "") == "navigation":
                 continue
             elif "infobox" in node.get("class", []):
                 continue
 
+            # If the classname contains "toc", the element is a table of contents.
+            # The comprehension is necessary because there are several possible
+            # types of tocs: "toclevel", "toc", ...
+            toc_classnames = [cname for cname in node.get("class", []) if "toc" in cname]
+            if toc_classnames:
+                continue
+
             # this is actually the child node's class...
             is_headline = node.find("span", {"class": "mw-headline"})
             if is_headline is not None:

diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
@@ -435,11 +435,15 @@ def _post_response(self, params):
     res = pg.parse_section_links('EXTERNAL LINKS')
     responses[wikipedia.api_url]['mcy_ds_external_links'] = res
 
+    res = pg.parse_section_links(None)
+    responses[wikipedia.api_url]['mcy_ds_external_links_none'] = res
+
     # doesn't contain external links
     pg = wikipedia.page('Tropical rainforest conservation')
     res = pg.parse_section_links('EXTERNAL LINKS')
     responses[wikipedia.api_url]['page_no_sec_links'] = res
 
+
     pg = asoiaf.page('arya')
     for section in pg.sections:
         links = pg.parse_section_links(section)
@@ -454,6 +458,7 @@ def _post_response(self, params):
     res = pg.table_of_contents
     responses[wikipedia.api_url]['new_york_city_toc'] = res
     responses[wikipedia.api_url]['new_york_city_air_quality'] = pg.section('Air quality')
+    responses[wikipedia.api_url]['new_york_city_none'] = pg.section(None)
     responses[wikipedia.api_url]['new_york_city_last_sec'] = pg.section('External links')
     print("Completed pulling Table of Content data")
 

diff --git a/tests/mediawiki_test.py b/tests/mediawiki_test.py
@@ -1167,6 +1167,13 @@ def test_page_section(self):
         self.assertEqual(self.pag.section('A Game of Thrones'),
                          self.response['arya']['section_a_game_of_thrones'])
 
+    def test_page_section_header(self):
+        ''' test a page returning the section header '''
+        res = self.pag.section(None)
+        print(res)
+        self.assertEqual(self.pag.section(None),
+                         self.response['arya']['section_a_game_of_thrones'])
+
     def test_page_last_section(self):
         ''' test a page returning the last section '''
         self.assertEqual(self.pag.section('External links'),
@@ -1309,6 +1316,15 @@ def test_page_section_large(self):
         pg = wiki.page('New York City')
         self.assertEqual(pg.section('Air quality'), response['new_york_city_air_quality'])
 
+    def test_page_section_header(self):
+        ''' test a page returning a section - header '''
+        wiki = MediaWikiOverloaded()
+        response = wiki.responses[wiki.api_url]
+        pg = wiki.page('New York City')
+        import json
+        print(json.dumps(pg.section(None)))
+        self.assertEqual(pg.section(None), response['new_york_city_none'])
+
     def test_page_last_section_large(self):
         ''' test a page returning the last section - large '''
         wiki = MediaWikiOverloaded()
@@ -1560,6 +1576,16 @@ def test_contains_ext_links_2(self):
             tmp[i] = list(item)
         self.assertEqual(tmp, res['mcy_ds_external_links'])
 
+    def test_contains_ext_links_3(self):
+        ''' Test when external links are present None '''
+        site = MediaWikiOverloaded()
+        res = site.responses[site.api_url]
+        page = site.page('''McDonald's''')
+        tmp = page.parse_section_links(None)
+        for i, item in enumerate(tmp):
+            tmp[i] = list(item)
+        self.assertEqual(tmp, res['mcy_ds_external_links_none'])
+
     def test_no_ext_links(self):
         ''' Test when no external links on the page '''
         site = MediaWikiOverloaded()

diff --git a/tests/mock_responses.json b/tests/mock_responses.json
@@ -6248,6 +6248,41 @@
     "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=MCD"
    ]
   ],
+  "mcy_ds_external_links_none": [
+      ["Macdonald (disambiguation)", "https://en.wikipedia.org/wiki/Macdonald_(disambiguation)"],
+      ["MCD (disambiguation)", "https://en.wikipedia.org/wiki/MCD_(disambiguation)"],
+      ["fast food", "https://en.wikipedia.org/wiki/Fast_food"],
+      ["company", "https://en.wikipedia.org/wiki/Company"],
+      ["Richard and Maurice McDonald", "https://en.wikipedia.org/wiki/Richard_and_Maurice_McDonald"],
+      ["San Bernardino, California", "https://en.wikipedia.org/wiki/San_Bernardino,_California"],
+      ["hamburger", "https://en.wikipedia.org/wiki/Hamburger"],
+      ["Golden Arches logo", "https://en.wikipedia.org/wiki/Golden_Arches"],
+      ["Phoenix, Arizona", "https://en.wikipedia.org/wiki/Phoenix,_Arizona"],
+      ["Ray Kroc", "https://en.wikipedia.org/wiki/Ray_Kroc"],
+      ["Oak Brook, Illinois", "https://en.wikipedia.org/wiki/Oak_Brook,_Illinois"],
+      ["Chicago", "https://en.wikipedia.org/wiki/Chicago"],
+      ["[4]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-west_side-4"],
+      ["[5]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-move_HQ-5"],
+      ["[6]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-6"],
+      ["restaurant chain", "https://en.wikipedia.org/wiki/Restaurant_chain"],
+      ["[7]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-7"],
+      ["countries", "https://en.wikipedia.org/wiki/Country"],
+      ["[8]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-ChicagoTribune60years-8"],
+      ["[9]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-2016_10K-9"],
+      ["cheeseburgers", "https://en.wikipedia.org/wiki/Cheeseburger"],
+      ["french fries", "https://en.wikipedia.org/wiki/French_fries"],
+      ["breakfast", "https://en.wikipedia.org/wiki/Breakfast"],
+      ["soft drinks", "https://en.wikipedia.org/wiki/Soft_drink"],
+      ["milkshakes", "https://en.wikipedia.org/wiki/Milkshake"],
+      ["wraps", "https://en.wikipedia.org/wiki/Wrap_(food)"],
+      ["[10]", "https://en.wikipedia.org/wiki/McDonald%27s#cite_note-10"],
+      ["salads", "https://en.wikipedia.org/wiki/Salad"],
+      ["fish", "https://en.wikipedia.org/wiki/Fish"],
+      ["smoothies", "https://en.wikipedia.org/wiki/Smoothie"],
+      ["fruit", "https://en.wikipedia.org/wiki/Fruit"],
+      ["BBC", "https://en.wikipedia.org/wiki/BBC"],
+      ["Walmart", "https://en.wikipedia.org/wiki/Walmart"]
+  ],
   "missing_categorytree": "\"Category:Chess Ebola\" does not match any pages. Try another query!",
   "missing_title_disamb_dets": [
    {
@@ -6304,6 +6339,7 @@
    }
   ],
   "missing_title_disamb_msg": "\n\"Leaching\" may refer to: \n  Bioleaching\n  Dump leaching\n  Heap leaching\n  In-situ leaching\n  Leach (disambiguation)\n  Leachate\n  Leaching (agriculture)\n  Leaching (chemistry)\n  Leaching (metallurgy)\n  Leaching (pedology)\n  Leech (disambiguation)\n  Leeching (disambiguation)\n  Tank leaching",
+  "new_york_city_none": "The City of New York, often called New York City (NYC) or simply New York (NY), is the most populous city in the United States. With an estimated 2017 population of 8,622,698 distributed over a land area of about 302.6 square miles (784 km2), New York City is also the most densely populated major city in the United States. Located at the southern tip of the state of New York, the city is the center of the New York metropolitan area, the largest metropolitan area in the world by urban landmass and one of the world's most populous megacities, with an estimated 20,320,876 people in its 2017 Metropolitan Statistical Area and 23,876,155 residents in its Combined Statistical Area. A global power city, New York City has been described as the cultural, financial, and media capital of the world, and exerts a significant impact upon commerce, entertainment, research, technology, education, politics, tourism, art, fashion, and sports. The city's fast pace has inspired the term New York minute. Home to the headquarters of the United Nations, New York is an important center for international diplomacy.Situated on one of the world's largest natural harbors, New York City consists of five boroughs, each of which is a separate county of the State of New York. The five boroughs \u2013 Brooklyn, Queens, Manhattan, The Bronx, and Staten Island \u2013 were consolidated into a single city in 1898. The city and its metropolitan area constitute the premier gateway for legal immigration to the United States. As many as 800 languages are spoken in New York, making it the most linguistically diverse city in the world. New York City is home to more than 3.2 million residents born outside the United States, the largest foreign-born population of any city in the world.  In 2017, the New York metropolitan area produced a gross metropolitan product (GMP) of US$1.73 trillion. 
If greater New York City were a sovereign state, it would have the 12th highest GDP in the world.New York City traces its origins to a trading post founded by colonists from the Dutch Republic in 1624 on Lower Manhattan; the post was named New Amsterdam in 1626. The city and its surroundings came under English control in 1664 and were renamed New York after King Charles II of England granted the lands to his brother, the Duke of York. New York served as the capital of the United States from 1785 until 1790. It has been the country's largest city since 1790. The Statue of Liberty greeted millions of immigrants as they came to the Americas by ship in the late 19th and early 20th centuries and is an international symbol of the United States and its ideals of liberty and peace. In the 21st century, New York has emerged as a global node of creativity and entrepreneurship, social tolerance, and environmental sustainability, and as a symbol of freedom and cultural diversity.Many districts and landmarks in New York City are well known, with the city having three of the world's ten most visited tourist attractions in 2013 and receiving a record 62.8 million tourists in 2017. Several sources have ranked New York the most photographed city in the world. Times Square, iconic as the world's \"heart\" and its \"Crossroads\", is the brightly illuminated hub of the Broadway Theater District, one of the world's busiest pedestrian intersections, and a major center of the world's entertainment industry. The names of many of the city's landmarks, skyscrapers, and parks are known around the world. Manhattan's real estate market is among the most expensive in the world. New York is home to the largest ethnic Chinese population outside of Asia, with multiple signature Chinatowns developing across the city. Providing continuous 24/7 service, the New York City Subway is the largest single-operator rapid transit system worldwide, with 472 rail stations. 
Over 120 colleges and universities are located in New York City, including Columbia University, New York University, and Rockefeller University, which have been ranked among the top universities in the world.  Anchored by Wall Street in the Financial District of Lower Manhattan, New York has been called both the most economically powerful city and the leading financial center of the world, and the city is home to the world's two largest stock exchanges by total market capitalization, the New York Stock Exchange and NASDAQ.",
   "new_york_city_air_quality": "According to the 2016 World Health Organization Global Urban Ambient Air Pollution Database, the annual average concentration in New York City's air of particulate matter measuring 2.5 microns or less (PM2.5) was 7 micrograms per cubic meter, or 3 micrograms below the recommended limit of the WHO Air Quality Guidelines for the annual mean PM2.5. The New York City Department of Health and Mental Hygiene, in partnership with Queens College, conducts the New York Community Air Survey to measure pollutants at about 150 locations.",
   "new_york_city_last_sec": "Official website\nNYC Go, official tourism website of New York City\nNew York City at Curlie\n Geographic data related to New York City at OpenStreetMap.\nCollections, 145,000 NYC photographs at Museum of the City of New York\n\"The New New York Skyline\". Interactive. National Geographic. Nov 2015.",
   "new_york_city_sections": [
@@ -16076,4 +16112,4 @@
    "Zygophyllum fabago"
   ]
  }
-}
+}