0.3.15 (#34)
* add base url property
* parse links from sections
barrust committed Sep 18, 2017
1 parent ba8bbe1 commit bf38c20
Showing 9 changed files with 6,063 additions and 1,501 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@

## Current

### Version 0.3.15

* Add parsing of all links within a section [issue #33](https://github.com/barrust/mediawiki/issues/33)
* Add base url property to the MediaWiki site

### Version 0.3.14

* Add refresh interval to cached responses (defaults to no refresh)
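For context, a minimal sketch of how the two additions in this release fit together, assuming the published mediawiki package; the page title and section name below are illustrative:

    from mediawiki import MediaWiki

    site = MediaWiki()    # defaults to Wikipedia
    print(site.base_url)  # e.g. 'https://en.wikipedia.org'

    page = site.page('Python (programming language)')
    # list of (title, url) tuples, or None if the section is not found
    links = page.parse_section_links('External links')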
8 changes: 4 additions & 4 deletions README.rst
@@ -15,10 +15,10 @@ MediaWiki
:target: https://opensource.org/licenses/MIT/
:alt: License

-**mediawiki** is a python wrapper for the MediaWiki API. The goal is to allow
-users to quickly and efficiently pull data from the MediaWiki site of their
-choice instead of worrying about dealing directly with the API. As such,
-it does not force the use of a particular MediaWiki site. It defaults to
+**mediawiki** is a python wrapper and parser for the MediaWiki API. The goal
+is to allow users to quickly and efficiently pull data from the MediaWiki site
+of their choice instead of worrying about dealing directly with the API. As
+such, it does not force the use of a particular MediaWiki site. It defaults to
`Wikipedia <http://www.wikipedia.org>`__ but other MediaWiki sites can
also be used.

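Because the wrapper is not tied to Wikipedia, pointing it at another MediaWiki installation only requires passing the api.php endpoint to the constructor; the URL below is an assumed example of a third-party wiki:

    from mediawiki import MediaWiki

    # hypothetical third-party wiki exposing the standard api.php endpoint
    asoiaf = MediaWiki(url='https://awoiaf.westeros.org/api.php')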
33 changes: 25 additions & 8 deletions mediawiki/mediawiki.py
@@ -16,7 +16,7 @@
from .utilities import (memoize)

URL = 'https://github.com/barrust/mediawiki'
-VERSION = '0.3.14'
+VERSION = '0.3.15'


class MediaWiki(object):
@@ -51,6 +51,7 @@ def __init__(self, url='http://{lang}.wikipedia.org/w/api.php', lang='en',
self._min_wait = rate_limit_wait
self._extensions = None
self._api_version = None
self._base_url = None
self.__supported_languages = None

# for memoized results
@@ -85,6 +86,16 @@ def api_version(self):
'''
return '.'.join([str(x) for x in self._api_version])

@property
def base_url(self):
''' Base URL for the MediaWiki site
:getter: Returns the base url of the site
:setter: Not settable
:type: string
'''
return self._base_url

@property
def extensions(self):
'''Extensions installed on the MediaWiki site
@@ -755,25 +766,31 @@ def wiki_request(self, params):

# Protected functions
def _get_site_info(self):
-'''
-Parse out the Wikimedia site information including
-API Version and Extensions
-'''
+''' Parse out the Wikimedia site information including
+API Version and Extensions '''
response = self.wiki_request({
'meta': 'siteinfo',
'siprop': 'extensions|general'
})

# shouldn't a check for success be done here?

-gen = response['query']['general']['generator']
-api_version = gen.split(' ')[1].split('-')[0]
+gen = response['query']['general']
+api_version = gen['generator'].split(' ')[1].split('-')[0]

major_minor = api_version.split('.')
for i, item in enumerate(major_minor):
major_minor[i] = int(item)
self._api_version = tuple(major_minor)

# parse the base url out
tmp = gen['server']
if tmp.startswith('http://') or tmp.startswith('https://'):
self._base_url = tmp
elif gen['base'].startswith('https:'):
self._base_url = 'https:{}'.format(tmp)
else:
self._base_url = 'http:{}'.format(tmp)

self._extensions = set()
for ext in response['query']['extensions']:
self._extensions.add(ext['name'])
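The base url parsing above has to cope with siteinfo server values that are protocol-relative (e.g. '//en.wikipedia.org'), borrowing the scheme from the base article URL in that case. A standalone sketch of the same rule, with sample siteinfo values assumed for illustration:

    def derive_base_url(general):
        ''' mirror of the logic above: keep an absolute server value,
            otherwise borrow the scheme from the base page URL '''
        server = general['server']
        if server.startswith('http://') or server.startswith('https://'):
            return server
        elif general['base'].startswith('https:'):
            return 'https:{}'.format(server)
        return 'http:{}'.format(server)

    # sample values, assumed for illustration
    print(derive_base_url({'server': '//en.wikipedia.org',
                           'base': 'https://en.wikipedia.org/wiki/Main_Page'}))
    # prints: https://en.wikipedia.org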
84 changes: 79 additions & 5 deletions mediawiki/mediawikipage.py
@@ -6,8 +6,8 @@

from __future__ import (unicode_literals, absolute_import)
from decimal import (Decimal)
-from bs4 import (BeautifulSoup)
-from .utilities import (str_or_unicode)
+from bs4 import (BeautifulSoup, Tag)
+from .utilities import (str_or_unicode, is_relative_url)
from .exceptions import (MediaWikiException, PageError, RedirectError,
DisambiguationError, ODD_ERROR_MESSAGE)

@@ -210,13 +210,15 @@ def images(self):

@property
def logos(self):
-''' Images within the infobox signifying either the main image or logo
+''' Parse images within the infobox signifying either the main image \
+or logo
:getter: Returns the list of all images in the information box
:setter: Not settable
:type: list
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
if self._logos is False:
self._logos = list()
@@ -230,13 +232,14 @@ def hatnotes(self):

@property
def hatnotes(self):
-''' Pull hatnotes from the page
+''' Parse hatnotes from the html
:getter: Returns the list of all hatnotes from the page
:setter: Not settable
:type: list
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
if self._hatnotes is False:
self._hatnotes = list()
@@ -255,7 +258,7 @@ def hatnotes(self):

@property
def references(self):
-''' External links, or references, listed on the page
+''' External links, or references, listed anywhere on the MediaWiki page
:getter: Returns the list of all external links
:setter: Not settable
@@ -457,6 +460,8 @@ def section(self, section_title):
.. note:: Returns **None** if section title is not found; \
only text between title and next section or sub-section title \
is returned.
.. note:: Side effect is to also pull the content which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
section = '== {0} =='.format(section_title)
try:
@@ -472,6 +477,32 @@ def section(self, section_title):

return self.content[index:next_index].lstrip('=').strip()

def parse_section_links(self, section_title):
''' Parse all links within a section
:param section_title: Name of the section to pull
:type section_title: string
:return: list of (title, url) tuples
.. note:: Returns **None** if section title is not found
.. note:: Side effect is to also pull the html which can be slow
.. note:: This is a parsing operation and not part of the standard API
'''
soup = BeautifulSoup(self.html, 'html.parser')
headlines = soup.find_all('span', {'class': 'mw-headline'})
tmp_soup = BeautifulSoup(section_title, 'html.parser')
tmp_sec_title = tmp_soup.get_text().lower()
id_tag = None
for headline in headlines:
tmp_id = headline.text
if tmp_id.lower() == tmp_sec_title:
id_tag = headline.get('id')
break

if id_tag is not None:
return self._parse_section_links(id_tag)
return None

# Protected Methods
def __load(self, redirect=True, preload=False):
''' load the basic page information '''
@@ -605,6 +636,49 @@ def _continued_query(self, query_params, key='pages'):
last_cont = request['continue']
# end _continued_query

def _parse_section_links(self, id_tag):
''' given a section id, parse the links in the unordered list '''
soup = BeautifulSoup(self.html, 'html.parser')
info = soup.find('span', {'id': id_tag})
all_links = list()

if info is None:
return all_links

for node in soup.find(id=id_tag).parent.next_siblings:
if not isinstance(node, Tag):
continue
elif node.get('role', '') == 'navigation':
continue
elif 'infobox' in node.get('class', []):
continue

# this is actually the child node's class...
is_headline = node.find('span', {'class': 'mw-headline'})
if is_headline is not None:
break
elif node.name == 'a':
all_links.append(self.__parse_link_info(node))
else:
for link in node.find_all('a'):
all_links.append(self.__parse_link_info(link))
return all_links
# end _parse_section_links

def __parse_link_info(self, link):
''' parse the <a> tag for the link '''
href = link.get('href', '')
txt = link.string or href
is_rel = is_relative_url(href)
if is_rel is True:
tmp = '{0}{1}'.format(self.mediawiki.base_url, href)
elif is_rel is None:
tmp = '{0}{1}'.format(self.url, href)
else:
tmp = href
return txt, tmp
# end __parse_link_info

def __title_query_param(self):
''' util function to determine which parameter method to use '''
if getattr(self, 'title', None) is not None:
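The section lookup in parse_section_links matches on the visible text of mw-headline spans rather than on raw element ids, then _parse_section_links walks the siblings of the matched heading. A self-contained sketch of the matching step against a toy fragment (the markup mimics MediaWiki output and is not taken from a live page):

    from bs4 import BeautifulSoup

    html = ('<h2><span class="mw-headline" id="External_links">'
            'External links</span></h2>'
            '<ul><li><a href="/wiki/Python">Python</a></li></ul>')

    soup = BeautifulSoup(html, 'html.parser')
    for headline in soup.find_all('span', {'class': 'mw-headline'}):
        # compare the rendered headline text, case-insensitively
        if headline.text.lower() == 'external links':
            print(headline.get('id'))  # prints: External_links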
10 changes: 10 additions & 0 deletions mediawiki/utilities.py
@@ -70,3 +70,13 @@ def str_or_unicode(text):
if sys.version_info > (3, 0):
return text.encode(encoding).decode(encoding)
return text.encode(encoding)


def is_relative_url(url):
''' simple method to determine if a url is relative or absolute '''
if url.startswith('#'):
return None
if url.find('://') > 0 or url.startswith('//'):
# either 'http(s)://...' or '//cdn...' and therefore absolute
return False
return True

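is_relative_url is deliberately tri-state: True for relative paths, False for absolute (including scheme-relative) URLs, and None for same-page fragments. __parse_link_info in mediawikipage.py uses that to prepend the site's base url, the page url, or nothing. A quick check of the three cases, with illustrative URLs:

    assert is_relative_url('/wiki/Python') is True             # relative path
    assert is_relative_url('https://example.com/page') is False
    assert is_relative_url('//cdn.example.com/a.js') is False  # scheme-relative
    assert is_relative_url('#History') is None                 # fragment link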