Complete API rewrite \o/ 🎉
Completely rewrote the API backend. Before, it fetched and parsed the full page HTML of entire Wikipedia articles. That was slow and inefficient, and, more importantly, it violated the Wikimedia bot policy (https://meta.wikimedia.org/wiki/Bot_policy#Unacceptable_usage). Now this project uses the official MediaWiki API instead. This has several advantages:

1. It complies with Wikimedia's bot policy
2. It's faster. Fetching a page title no longer means loading the entire page HTML into a BeautifulSoup object; it's now a single API call that parses under 100 characters of JSON instead of thousands of characters of HTML. The HTML parsing that *does* still happen is also much lighter, because the MediaWiki API can return just the HTML of the first section, which speeds up everything built on it (a rough sketch of such an API call follows this list).
3. It opens up new possibilities. Faster calls to get_page_name could (finally) make a realistically fast solution to issue #2 possible.
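
As a rough sketch of what one of these calls looks like (using the requests library and the en.wikipedia.org endpoint that the new wikipedia_parse.py below uses; "Cats" is just an example title):

import json

import requests

endpoint = "https://en.wikipedia.org/w/api.php"

# Resolve a page title through the MediaWiki API. With "redirects": 1 the
# API follows redirects for us (e.g. "Cats" -> "Cat"), so no page HTML
# ever has to be downloaded or parsed just to get a canonical title.
payload = {
    "format": "json",
    "action": "query",
    "titles": "Cats",
    "redirects": 1,
}
resp = json.loads(requests.get(endpoint, params=payload).text)

# The whole response is a small JSON object keyed by page id, roughly:
# {"query": {"pages": {"6678": {"pageid": 6678, "ns": 0, "title": "Cat"}}}}
print(list(resp["query"]["pages"].values())[0]["title"])  # -> "Cat"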

I'm excited about this. It's probably not actually that exciting. But oh well :)
controversial committed Mar 5, 2016
1 parent bccac32 commit 30e0c64
Showing 3 changed files with 138 additions and 56 deletions.
14 changes: 9 additions & 5 deletions api/api.py
@@ -1,27 +1,31 @@
# coding: utf-8

from __future__ import unicode_literals
from flask import Flask,request
import json

#My wikipedia API
from flask import Flask, request

# My wikipedia API
from wikipedia_parse import *

app = Flask(__name__)


@app.route('/links')
def getSubPages():
page=request.args.get("page")
page = request.args.get("page")
return json.dumps(first_paragraph_links(page))


@app.route('/pagename')
def getPageName():
page=request.args.get("page")
page = request.args.get("page")
return json.dumps(get_page_name(page))


@app.route('/random')
def randomArticle():
return get_random_article()
return get_random_article()

if __name__ == "__main__":
app.run()
3 changes: 2 additions & 1 deletion api/api.wsgi
@@ -1,3 +1,4 @@
import sys
sys.path.insert(0,"/var/www/deentaylor.com/luke/public_html/wikipedia/api")
sys.path.insert(0, "/var/www/deentaylor.com/luke/public_html/wikipedia/api")

from api import app as application
177 changes: 127 additions & 50 deletions api/wikipedia_parse.py
@@ -1,65 +1,142 @@
#!/usr/local/bin/python
# coding: utf-8
from __future__ import unicode_literals
# -*- coding: utf-8 -*-

"""Functions for getting information about wikipedia pages. This contains the
code for all of the functions used by the backend of Wikipedia Map"""

import json
from urllib2 import quote, unquote

import bs4
import urllib2
import requests

# Base URL for API
_endpoint = "https://en.wikipedia.org/w/api.php"

#Pretend not to be a bot
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

def get_url(pagename):
return "https://en.wikipedia.org/wiki/"+urllib2.quote(pagename.encode("utf-8"))
# --- HELPER FUNCTIONS --- #


def get_page_title(url):
#The last element of the URL is always the title. Allow for both URLs that
#end with a slash and for URLs that don't.
"""Get the title of a page quickly, but inaccurately from a URL. Allows
both for URLs with a trailing slash and URLs without.
This is considered inaccurate because this does not handle redirects. E.g.
one page might link to /wiki/Cats, while another might link to /wiki/Cat.
These are both the same page, but will be recognized as different."""
# The last element of the URL is always the title.
return url.rstrip('/').split('/')[-1]


def get_page_name(page):
#The title of the page before the hyphen.
return get_wiki_soup(get_url(page)).title.string.split("-")[0].strip()

def get_wiki_soup(url):
#Open the URL
f=opener.open(url)
#Return the data, ascii decoded.
data=str(f.read().decode("ascii",errors="ignore"))
f.close()
#Specify parser to hide error message
return bs4.BeautifulSoup(data,"html.parser")
"""Get the title of a page accurately, but more slowly. See get_page_title
for notes on accuracy"""

payload = {
"format": "json",
"action": "query",
"titles": page,
"redirects": 1
}

req = requests.get(_endpoint, params=payload)
resp = json.loads(req.text)
return resp["query"]["pages"].values()[0]["title"]


def is_article(name):
"""Decide whether the name of a wikipedia page is an article, or belongs to
another namespace. See https://en.wikipedia.org/wiki/Wikipedia:Namespace"""
# Pages outside of main namespace have colons in the middle, e.g. 'WP:UA'
return ":" not in name.strip(":")


# --- MAIN FUNCTIONS --- #


def get_page_html(pagename):
"""Get a BeautifulSoup object representing the HTML for the first section
of the Wikipedia article named <pagename>"""

payload = {
"format": "json",
"action": "parse",
"page": pagename,
"prop": "text",
"section": 0,
"redirects": 1
}

req = requests.get(_endpoint, params=payload)
resp = json.loads(req.text)

if "error" in resp.keys():
return None
else:
html = resp["parse"]["text"]["*"]
return bs4.BeautifulSoup(html, "html.parser")


def get_first_paragraph(pagename):
"""Get a BeautifulSoup object representing the HTML for the first paragraph
of the Wikipedia article named <pagename>"""
html = get_page_html(pagename)
if html is None:
return None
else:
return html.find("p", recursive=False)


def first_paragraph_links(pagename):
"""Get the name of each Wikipedia article linked to from the first
paragraph of the Wikipedia article named <pagename>"""
p1 = get_first_paragraph(pagename)
if p1 is None:
return []
else:
links = [link.get("href") for link in p1.find_all("a")]
links = [link for link in links if link.startswith("/wiki/")]
links = [get_page_title(link) for link in links]
links = [link.split("#")[0] for link in links]
links = [link for link in links if is_article(link)]
links = [link.replace("_", " ") for link in links]
links = list(set(links))
return links


def get_random_article():
randomurl="https://en.wikipedia.org/wiki/Special:Random"
o = opener.open(randomurl)
pageurl = o.geturl()
return pageurl.split("/")[-1]

def first_paragraph_links(page):
soup=get_wiki_soup(get_url(page))
#Div with content in it
content=soup.find("div",id="mw-content-text")
#First p tag directly under the content div
paragraphs=content.find_all("p",recursive=False)
paragraph1=paragraphs[0]

#If the first paragraph is coordinate info, use the second paragraph.
firstlink = paragraph1.find("a")
if "id" in firstlink.parent.attrs and firstlink.parent["id"]=="coordinates":
paragraph1=paragraphs[1]

#Find all links from the first paragraph (no duplicates)
links = list(set([link.get("href") for link in paragraph1.find_all("a")]))
#Exclude links that tag points later in the article, and return the page title.
pagenames = [str(l.split("/")[-1]) for l in links if l.startswith("/wiki/")]
#Remove files
pagenames = [pn for pn in pagenames if not pn.startswith(("File:","Wikipedia:","Help:"))]
#Remove underscores
pagenames = [pn.replace("_"," ") for pn in pagenames]
#Remove fragment identifiers
return [pn.rsplit("#")[0] for pn in pagenames]
"""Get the name of a random Wikipedia article"""

payload = {
"format": "json",
"action": "query",
"list": "random",
"rnlimit": 1,
"rnnamespace": 0 # Limits results to articles
}

req = requests.get(_endpoint, payload)
resp = json.loads(req.text)
return resp["query"]["random"][0]["title"]


if __name__ == "__main__":
print first_paragraph_links("Zürich")
import time

print is_article(":Cows"), is_article("WP:UA") # Test if it's an article

start = time.time()
print get_page_name("Cats"), # Test accurate page name fetch
print "({} seconds)\n".format(time.time()-start)

start = time.time()
print get_random_article(), # Test random article fetching
print "({} seconds)\n".format(time.time()-start)

start = time.time()
print first_paragraph_links("Penguins"), # Test link fetching
print "({} seconds)\n".format(time.time()-start)

start = time.time()
print first_paragraph_links("Zürich"),  # Test unicode
print "({} seconds)\n".format(time.time()-start)
