0.3.15 (#34)
* add base url property
* parse links from sections
barrust committed Sep 18, 2017
1 parent ba8bbe1 commit bf38c20
Showing 9 changed files with 6,063 additions and 1,501 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@

## Current

### Version 0.3.15

* Add parsing of all links within a section [issue #33](https://github.com/barrust/mediawiki/issues/33)
* Add base url property to the MediaWiki site

### Version 0.3.14

* Add refresh interval to cached responses (defaults to no refresh)
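For context, a minimal sketch of how the two additions in this release fit together, assuming the published mediawiki package; the page title and section name below are illustrative:

    from mediawiki import MediaWiki

    site = MediaWiki()    # defaults to Wikipedia
    print(site.base_url)  # e.g. 'https://en.wikipedia.org'

    page = site.page('Python (programming language)')
    # list of (title, url) tuples, or None if the section is not found
    links = page.parse_section_links('External links')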
8 changes: 4 additions & 4 deletions README.rst
@@ -15,10 +15,10 @@ MediaWiki
:target: https://opensource.org/licenses/MIT/
:alt: License

-**mediawiki** is a python wrapper for the MediaWiki API. The goal is to allow
-users to quickly and efficiently pull data from the MediaWiki site of their
-choice instead of worrying about dealing directly with the API. As such,
-it does not force the use of a particular MediaWiki site. It defaults to
+**mediawiki** is a python wrapper and parser for the MediaWiki API. The goal
+is to allow users to quickly and efficiently pull data from the MediaWiki site
+of their choice instead of worrying about dealing directly with the API. As
+such, it does not force the use of a particular MediaWiki site. It defaults to
`Wikipedia <http://www.wikipedia.org>`__ but other MediaWiki sites can
also be used.

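Because the wrapper is not tied to Wikipedia, pointing it at another MediaWiki installation only requires passing the api.php endpoint to the constructor; the URL below is an assumed example of a third-party wiki:

    from mediawiki import MediaWiki

    # hypothetical third-party wiki exposing the standard api.php endpoint
    asoiaf = MediaWiki(url='https://awoiaf.westeros.org/api.php')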
33 changes: 25 additions & 8 deletions mediawiki/mediawiki.py
@@ -16,7 +16,7 @@
from .utilities import (memoize)

URL = 'https://github.com/barrust/mediawiki'
-VERSION = '0.3.14'
+VERSION = '0.3.15'


class MediaWiki(object):
@@ -51,6 +51,7 @@ def __init__(self, url='http://{lang}.wikipedia.org/w/api.php', lang='en',
self._min_wait = rate_limit_wait
self._extensions = None
self._api_version = None
self._base_url = None
self.__supported_languages = None

# for memoized results
@@ -85,6 +86,16 @@ def api_version(self):
'''
return '.'.join([str(x) for x in self._api_version])

@property
def base_url(self):
''' Base URL for the MediaWiki site
:getter: Returns the base url of the site
:setter: Not settable
:type: string
'''
return self._base_url

@property
def extensions(self):
'''Extensions installed on the MediaWiki site
@@ -755,25 +766,31 @@ def wiki_request(self, params):

# Protected functions
def _get_site_info(self):
-'''
-Parse out the Wikimedia site information including
-API Version and Extensions
-'''
+''' Parse out the Wikimedia site information including
+API Version and Extensions '''
response = self.wiki_request({
'meta': 'siteinfo',
'siprop': 'extensions|general'
})

# shouldn't a check for success be done here?

-gen = response['query']['general']['generator']
-api_version = gen.split(' ')[1].split('-')[0]
+gen = response['query']['general']
+api_version = gen['generator'].split(' ')[1].split('-')[0]

major_minor = api_version.split('.')
for i, item in enumerate(major_minor):
major_minor[i] = int(item)
self._api_version = tuple(major_minor)

# parse the base url out
tmp = gen['server']
if tmp.startswith('http://') or tmp.startswith('https://'):
self._base_url = tmp
elif gen['base'].startswith('https:'):
self._base_url = 'https:{}'.format(tmp)
else:
self._base_url = 'http:{}'.format(tmp)

self._extensions = set()
for ext in response['query']['extensions']:
self._extensions.add(ext['name'])
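The base url parsing above has to cope with siteinfo server values that are protocol-relative (e.g. '//en.wikipedia.org'), borrowing the scheme from the base article URL in that case. A standalone sketch of the same rule, with sample siteinfo values assumed for illustration:

    def derive_base_url(general):
        ''' mirror of the logic above: keep an absolute server value,
            otherwise borrow the scheme from the base page URL '''
        server = general['server']
        if server.startswith('http://') or server.startswith('https://'):
            return server
        elif general['base'].startswith('https:'):
            return 'https:{}'.format(server)
        return 'http:{}'.format(server)

    # sample values, assumed for illustration
    print(derive_base_url({'server': '//en.wikipedia.org',
                           'base': 'https://en.wikipedia.org/wiki/Main_Page'}))
    # prints: https://en.wikipedia.org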
84 changes: 79 additions & 5 deletions mediawiki/mediawikipage.py
@@ -6,8 +6,8 @@

from __future__ import (unicode_literals, absolute_import)
from decimal import (Decimal)
-from bs4 import (BeautifulSoup)
-from .utilities import (str_or_unicode)
+from bs4 import (BeautifulSoup, Tag)
+from .utilities import (str_or_unicode, is_relative_url)
from .exceptions import (MediaWikiException, PageError, RedirectError,
DisambiguationError, ODD_ERROR_MESSAGE)

@@ -210,13 +210,15 @@ def images(self):

@property
def logos(self):
-''' Images within the infobox signifying either the main image or logo
+''' Parse images within the infobox signifying either the main image \
+or logo
:getter: Returns the list of all images in the information box
:setter: Not settable
:type: list
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
if self._logos is False:
self._logos = list()
@@ -230,13 +232,14 @@ def hatnotes(self):

@property
def hatnotes(self):
-''' Pull hatnotes from the page
+''' Parse hatnotes from the html
:getter: Returns the list of all hatnotes from the page
:setter: Not settable
:type: list
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
if self._hatnotes is False:
self._hatnotes = list()
@@ -255,7 +258,7 @@ def hatnotes(self):

@property
def references(self):
-''' External links, or references, listed on the page
+''' External links, or references, listed anywhere on the MediaWiki page
:getter: Returns the list of all external links
:setter: Not settable
@@ -457,6 +460,8 @@ def section(self, section_title):
.. note:: Returns **None** if section title is not found; \
only text between title and next section or sub-section title \
is returned.
.. note:: Side effect is to also pull the content which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
section = '== {0} =='.format(section_title)
try:
@@ -472,6 +477,32 @@ def section(self, section_title):

return self.content[index:next_index].lstrip('=').strip()

def parse_section_links(self, section_title):
''' Parse all links within a section
:param section_title: Name of the section to pull
:type section_title: string
:return: list of (title, url) tuples
.. note:: Returns **None** if section title is not found
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
soup = BeautifulSoup(self.html, 'html.parser')
headlines = soup.find_all('span', {'class': 'mw-headline'})
tmp_soup = BeautifulSoup(section_title, 'html.parser')
tmp_sec_title = tmp_soup.get_text().lower()
id_tag = None
for headline in headlines:
tmp_id = headline.text
if tmp_id.lower() == tmp_sec_title:
id_tag = headline.get('id')
break

if id_tag is not None:
return self._parse_section_links(id_tag)
return None

# Protected Methods
def __load(self, redirect=True, preload=False):
''' load the basic page information '''
@@ -605,6 +636,49 @@ def _continued_query(self, query_params, key='pages'):
last_cont = request['continue']
# end _continued_query

def _parse_section_links(self, id_tag):
''' given a section id, parse the links in the unordered list '''
soup = BeautifulSoup(self.html, 'html.parser')
info = soup.find('span', {'id': id_tag})
all_links = list()

if info is None:
return all_links

for node in soup.find(id=id_tag).parent.next_siblings:
if not isinstance(node, Tag):
continue
elif node.get('role', '') == 'navigation':
continue
elif 'infobox' in node.get('class', []):
continue

# this is actually the child node's class...
is_headline = node.find('span', {'class': 'mw-headline'})
if is_headline is not None:
break
elif node.name == 'a':
all_links.append(self.__parse_link_info(node))
else:
for link in node.find_all('a'):
all_links.append(self.__parse_link_info(link))
return all_links
# end _parse_section_links

def __parse_link_info(self, link):
''' parse the <a> tag for the link '''
href = link.get('href', '')
txt = link.string or href
is_rel = is_relative_url(href)
if is_rel is True:
tmp = '{0}{1}'.format(self.mediawiki.base_url, href)
elif is_rel is None:
tmp = '{0}{1}'.format(self.url, href)
else:
tmp = href
return txt, tmp
# end __parse_link_info

def __title_query_param(self):
''' util function to determine which parameter method to use '''
if getattr(self, 'title', None) is not None:
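The section lookup in parse_section_links matches on the visible text of mw-headline spans rather than on raw element ids, then _parse_section_links walks the siblings of the matched heading. A self-contained sketch of the matching step against a toy fragment (the markup mimics MediaWiki output and is not taken from a live page):

    from bs4 import BeautifulSoup

    html = ('<h2><span class="mw-headline" id="External_links">'
            'External links</span></h2>'
            '<ul><li><a href="/wiki/Python">Python</a></li></ul>')

    soup = BeautifulSoup(html, 'html.parser')
    for headline in soup.find_all('span', {'class': 'mw-headline'}):
        # compare the rendered headline text, case-insensitively
        if headline.text.lower() == 'external links':
            print(headline.get('id'))  # prints: External_links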
10 changes: 10 additions & 0 deletions mediawiki/utilities.py
@@ -70,3 +70,13 @@ def str_or_unicode(text):
if sys.version_info > (3, 0):
return text.encode(encoding).decode(encoding)
return text.encode(encoding)


def is_relative_url(url):
''' simple method to determine if a url is relative or absolute '''
if url.startswith('#'):
return None
if url.find('://') > 0 or url.startswith('//'):
# either 'http(s)://...' or '//cdn...' and therefore absolute
return False
return True

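is_relative_url is deliberately tri-state: True for relative paths, False for absolute (including scheme-relative) URLs, and None for same-page fragments. __parse_link_info in mediawikipage.py uses that to prepend the site's base url, the page url, or nothing. A quick check of the three cases, with illustrative URLs:

    assert is_relative_url('/wiki/Python') is True             # relative path
    assert is_relative_url('https://example.com/page') is False
    assert is_relative_url('//cdn.example.com/a.js') is False  # scheme-relative
    assert is_relative_url('#History') is None                 # fragment link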