Changes of joacmue #14

Status: Open. Wants to merge 139 commits into base: master.

Commits (139)
4177126
python3-ified some scripts
Dec 5, 2020
e7ad0c7
minor clean-up
Dec 5, 2020
ba7cc1c
made the --name only variant work
Dec 6, 2020
b1b935a
corrected fault with heading indentation
Dec 6, 2020
9487f75
Made tables work... sort of.
Dec 6, 2020
662fa16
made BR (line break?) work
Dec 7, 2020
fee8cde
fixed multi-row headers and lists
Dec 7, 2020
d13c8c8
skipping line breaks in "kommentar"
Dec 10, 2020
a8aaf26
Made lists render inside table (no indentation)
Dec 11, 2020
e761381
prettyfying alphanumeric list indices
Dec 11, 2020
69f3a71
clean-up of todos, tables should work now
Dec 17, 2020
a4609e0
python3-ed the print statements in lawgit.py, made .jsons better read…
Dec 17, 2020
5af2990
prettifying .json outputs (indents, utf-8 umlauts)
joacmue Dec 22, 2020
d1e84e8
made the banz scraper work again
joacmue Dec 23, 2020
4dca275
Added some notes on what this actually does
joacmue Mar 28, 2021
5d64c00
Merge remote-tracking branch 'gesetze-tools-upstream/master'
joacmue Mar 28, 2021
9c1aa53
Running lawdown in python3 helps -.-
joacmue Mar 28, 2021
f7ebdf1
Should have re-added most of the f' strings instead of the u' ones
joacmue Mar 28, 2021
f91a8cf
Some suggested changes from the PR
joacmue Apr 3, 2021
5bdfd5e
Suggested Changes from the PR
joacmue Apr 3, 2021
4da5c5c
re-adding bgbl folder to .gitignore
joacmue Apr 3, 2021
70f8035
python3-ified some scripts
Dec 5, 2020
6dc6e7a
minor clean-up
Dec 5, 2020
7dd3f8c
made the --name only variant work
Dec 6, 2020
ab9b64f
Made tables work... sort of.
Dec 6, 2020
4005d7e
made BR (line break?) work
Dec 7, 2020
52b80c8
fixed multi-row headers and lists
Dec 7, 2020
b324727
skipping line breaks in "kommentar"
Dec 10, 2020
8e49ec8
Made lists render inside table (no indentation)
Dec 11, 2020
f64454f
prettyfying alphanumeric list indices
Dec 11, 2020
f9cb296
clean-up of todos, tables should work now
Dec 17, 2020
b25e56a
python3-ed the print statements in lawgit.py, made .jsons better read…
Dec 17, 2020
937b86f
prettifying .json outputs (indents, utf-8 umlauts)
joacmue Dec 22, 2020
b29530a
made the banz scraper work again
joacmue Dec 23, 2020
d3d84ba
Added some notes on what this actually does
joacmue Mar 28, 2021
2bd7a75
Running lawdown in python3 helps -.-
joacmue Mar 28, 2021
ee9bf6b
Should have re-added most of the f' strings instead of the u' ones
joacmue Mar 28, 2021
eb3ac97
Some suggested changes from the PR
joacmue Apr 3, 2021
f36e726
Suggested Changes from the PR
joacmue Apr 3, 2021
09c5004
re-adding bgbl folder to .gitignore
joacmue Apr 3, 2021
35452ff
Merge branch 'master' of https://github.com/joacmue/gesetze-tools
joacmue Apr 6, 2021
17b5ac5
Corrected a copy typo of double brackets
joacmue Apr 6, 2021
d4ced4d
Removing two causes of linter errors
joacmue Apr 6, 2021
aac5558
removing banz_scraper python 2.x leftovers
joacmue Apr 6, 2021
f70a0ed
Removing some linter warnings
joacmue Apr 6, 2021
53b96fc
Minor clean-up
joacmue Apr 6, 2021
fb39d0a
minor clean-up
joacmue Apr 6, 2021
92b2bf7
Continuing to please the linter
joacmue Apr 6, 2021
823d076
Minor modifications.
darkdragon-001 Apr 6, 2021
2209244
Update data in separate commits/branches.
darkdragon-001 Apr 6, 2021
885061a
Some fixes
darkdragon-001 Apr 17, 2021
31f948a
Merge remote-tracking branch 'origin/master' into joacmue
darkdragon-001 Apr 18, 2021
246cc82
Removing regex qualifiers from non-regex strings
joacmue Apr 18, 2021
c41acef
Merge branch 'master' of https://github.com/joacmue/gesetze-tools
joacmue Apr 18, 2021
b065734
Re-adding the default flush when outside tables
joacmue Apr 18, 2021
49ef8a6
Removing special handling of lettered list indices
joacmue Apr 18, 2021
532da90
Cleaning up the backspaces in tables & lists
joacmue Apr 18, 2021
3a01c61
not printing leading line break on table headers
joacmue Apr 18, 2021
d48adf3
Removing mess around handling breaks
joacmue Apr 18, 2021
2264d43
Cleaning up custombreaks
joacmue Apr 25, 2021
279e19e
Adding empty cells for colspans
joacmue Apr 25, 2021
73aaf7b
Something was strange with the round function
joacmue Apr 25, 2021
a7b7e82
Making breaks on encounters of <BR> again
joacmue Apr 25, 2021
e4cf4bb
Removing special case for begin of <br>
joacmue Apr 25, 2021
2d13b97
Explicitly parsing colnames for colspans now
joacmue May 9, 2021
17bf66b
Making lawdown go over all laws without errors
joacmue May 13, 2021
fab87e6
Making multiline headers with colspan render nicer
joacmue May 14, 2021
b117e25
Cleaning up column list handling
joacmue May 14, 2021
358fd09
python3-ified some scripts
Dec 5, 2020
ac34cd9
minor clean-up
Dec 5, 2020
b1152ed
made the --name only variant work
Dec 6, 2020
ed7c5a0
Made tables work... sort of.
Dec 6, 2020
84c986b
made BR (line break?) work
Dec 7, 2020
923c6ec
fixed multi-row headers and lists
Dec 7, 2020
389e419
skipping line breaks in "kommentar"
Dec 10, 2020
abaefbf
Made lists render inside table (no indentation)
Dec 11, 2020
2264272
prettyfying alphanumeric list indices
Dec 11, 2020
c5b2048
clean-up of todos, tables should work now
Dec 17, 2020
1e21aaf
python3-ed the print statements in lawgit.py, made .jsons better read…
Dec 17, 2020
bd9763a
prettifying .json outputs (indents, utf-8 umlauts)
joacmue Dec 22, 2020
fe61b75
made the banz scraper work again
joacmue Dec 23, 2020
59eea7e
Added some notes on what this actually does
joacmue Mar 28, 2021
7128419
Running lawdown in python3 helps -.-
joacmue Mar 28, 2021
33e3ee6
Should have re-added most of the f' strings instead of the u' ones
joacmue Mar 28, 2021
5058f20
Some suggested changes from the PR
joacmue Apr 3, 2021
b0fa94d
Suggested Changes from the PR
joacmue Apr 3, 2021
7361631
re-adding bgbl folder to .gitignore
joacmue Apr 3, 2021
6ec2842
python3-ified some scripts
Dec 5, 2020
a93f191
minor clean-up
Dec 5, 2020
4042407
made the --name only variant work
Dec 6, 2020
0c916fd
corrected fault with heading indentation
Dec 6, 2020
0cd86c3
Made tables work... sort of.
Dec 6, 2020
6319695
made BR (line break?) work
Dec 7, 2020
c3262fa
fixed multi-row headers and lists
Dec 7, 2020
6ce7e2e
skipping line breaks in "kommentar"
Dec 10, 2020
ff0ec27
Made lists render inside table (no indentation)
Dec 11, 2020
c408faf
prettyfying alphanumeric list indices
Dec 11, 2020
7e042b1
clean-up of todos, tables should work now
Dec 17, 2020
355d84b
python3-ed the print statements in lawgit.py, made .jsons better read…
Dec 17, 2020
b5ef45c
prettifying .json outputs (indents, utf-8 umlauts)
joacmue Dec 22, 2020
699ab06
made the banz scraper work again
joacmue Dec 23, 2020
dce5570
Running lawdown in python3 helps -.-
joacmue Mar 28, 2021
4ec214e
Should have re-added most of the f' strings instead of the u' ones
joacmue Mar 28, 2021
0836f20
Some suggested changes from the PR
joacmue Apr 3, 2021
a6ad1e6
Suggested Changes from the PR
joacmue Apr 3, 2021
3716a55
re-adding bgbl folder to .gitignore
joacmue Apr 3, 2021
fe153d3
Corrected a copy typo of double brackets
joacmue Apr 6, 2021
1689312
Removing two causes of linter errors
joacmue Apr 6, 2021
aa9ad98
removing banz_scraper python 2.x leftovers
joacmue Apr 6, 2021
52e0641
Removing some linter warnings
joacmue Apr 6, 2021
170c0ff
Minor clean-up
joacmue Apr 6, 2021
f21b36f
minor clean-up
joacmue Apr 6, 2021
6227fcb
Continuing to please the linter
joacmue Apr 6, 2021
2d34d37
Removing regex qualifiers from non-regex strings
joacmue Apr 18, 2021
91bf7ea
Minor modifications.
darkdragon-001 Apr 6, 2021
f737049
Update data in separate commits/branches.
darkdragon-001 Apr 6, 2021
f1ec414
Some fixes
darkdragon-001 Apr 17, 2021
8e22485
Improve issue templates.
darkdragon-001 Apr 17, 2021
d2fcb9d
Try to fix formatting template.
darkdragon-001 Apr 18, 2021
f820c53
Enable CI also for PRs.
darkdragon-001 Apr 18, 2021
f8026bf
Re-adding the default flush when outside tables
joacmue Apr 18, 2021
06fe657
Removing special handling of lettered list indices
joacmue Apr 18, 2021
5dae428
Cleaning up the backspaces in tables & lists
joacmue Apr 18, 2021
4905e9a
not printing leading line break on table headers
joacmue Apr 18, 2021
1d2026a
Removing mess around handling breaks
joacmue Apr 18, 2021
45bb01b
Cleaning up custombreaks
joacmue Apr 25, 2021
99dc3c0
Adding empty cells for colspans
joacmue Apr 25, 2021
5f41808
Something was strange with the round function
joacmue Apr 25, 2021
2f61890
Making breaks on encounters of <BR> again
joacmue Apr 25, 2021
625bd5e
Removing special case for begin of <br>
joacmue Apr 25, 2021
83bcc5c
Explicitly parsing colnames for colspans now
joacmue May 9, 2021
724c33b
Making lawdown go over all laws without errors
joacmue May 13, 2021
074e674
Making multiline headers with colspan render nicer
joacmue May 14, 2021
bd96212
Cleaning up column list handling
joacmue May 14, 2021
0707c01
Rebased to master
joacmue May 15, 2021
10cff58
Re-rean bgbl_scraper, updated readme.md
joacmue May 15, 2021
c66ed0c
Merge branch 'master' of https://github.com/joacmue/gesetze-tools
joacmue May 15, 2021
978b871
aligned vkbl.json formatting with other files
joacmue May 15, 2021
4c76092
Minor fixes
darkdragon-001 May 15, 2021
Files changed
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,3 +1,7 @@
test.zip
test.py
bgbl

Collaborator comment:
Who creates these files?

Reply:
That would be me. I did not want to push my notes and test scripts to the repo. I should probably store those outside the working folder and remove them from .gitignore.

laws
laws-md
.vscode
__pycache__
38 changes: 33 additions & 5 deletions README.md
@@ -4,7 +4,6 @@ BundesGit Gesetze Tools
These scripts are used to keep the law repository up to date.

Install requirements:

```bash
pip install -r requirements.txt
```
@@ -17,28 +16,57 @@ Downloads all laws as XML files from
[www.gesetze-im-internet.de](http://www.gesetze-im-internet.de/)
and extracts them to a directory.

Last tested: 2017-01-14 SUCCESS
### Usage
Update your list of laws first:
```bash
python lawde.py updatelist
python lawde.py loadall
```

Collaborator suggested change: drop `python lawde.py loadall` from this block ("This is still stated below and after the note.").

You can then download all laws by calling (**not recommended!**)
```bash
python lawde.py loadall
```
This will take approximately 2-3 hours.

Alternatively, you can look up the individual law you're interested in in [./data/laws.json](./data/laws.json), which is essentially a list of entries of this form:
```json
{"slug": "<shortname>", "name": "<longname>", "abbreviation": "<abbreviation>"}
```
You can download individual laws by calling (**recommended**)
```bash
python lawde.py load <shortname>
```
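For reference, a minimal sketch of how you might look up a slug in `data/laws.json` before calling `lawde.py load`. This assumes the file parses as a single JSON array of objects with the keys shown above; the abbreviation "BGB" is only an example search term:

```python
import json

# Hedged sketch: find the download slug for a law by its abbreviation.
# Assumes data/laws.json is a JSON array of objects with "slug", "name"
# and "abbreviation" keys, as in the example entry above.
with open('data/laws.json', encoding='utf-8') as f:
    laws = json.load(f)

for law in laws:
    if law.get('abbreviation') == 'BGB':  # example search term
        print(law['slug'], '-', law['name'])
        # Pass the printed slug to: python lawde.py load <shortname>
```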

Last tested: 2020-12-05 SUCCESS

## lawdown.py

Converts all XML laws to Markdown and copies them with other files related
to the law into specified working directory.

Last tested: 2017-01-14 SUCCESS
### Usage
```bash
python lawdown.py convert <inpath> <outpath>
python lawdown.py convert ./laws ./laws-md
```

Last tested: 2020-12-05 SUCCESS

## bgbl_scraper.py

Scrapes the table of contents of all issues of the Bundesgesetzblatt and dumps
the result to JSON.

Last tested: 2017-01-14 FAILED ("KeyError: xaversid")
Last tested: 2020-12-05 FAILED ("KeyError: xaversid")
The issue seems to be a restructuring of the Bundesanzeiger webpage; the original bgbl links now return a 404 error.

## banz_scraper.py

Scrapes the table of contents of all available issues of the Bundesanzeiger and
dumps the result to JSON.

Last tested: 2017-01-14 SUCCESS
Last tested: 2020-12-23 SUCCESS

## vkbl_scraper.py

48 changes: 37 additions & 11 deletions banz_scraper.py
@@ -17,6 +17,8 @@
banz_scaper.py data/banz.json

"""
import os
import sys
from pathlib import Path
import re
import json
@@ -34,6 +36,11 @@ class BAnzScraper:
LIST = ('genericsearch_param.edition=%s&genericsearch_param.sort_type='
'&%%28page.navid%%3Dofficial_starttoofficial_start_update%%29='
'Veröffentlichungen+anzeigen')
# Website changed, so I am changing the links here
BASE_URL = 'https://www.bundesanzeiger.de/pub/de/amtlicher-teil?'
BASE = ''
YEAR = '&year=%s'
LIST = '&edition=BAnz+AT+%s'

MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli',
'August', 'September', 'Oktober', 'November', 'Dezember']
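To illustrate how the rewritten URL constants above compose into request URLs, here is a standalone sketch. The year and edition values are made-up examples, and the exact edition string format is an assumption; in the scraper it comes from the date dropdown parsed in get_dates():

```python
# Illustrative only: how the new URL constants compose (values are examples).
BASE_URL = 'https://www.bundesanzeiger.de/pub/de/amtlicher-teil?'
BASE = ''
YEAR = '&year=%s'
LIST = '&edition=BAnz+AT+%s'

landing_url = BASE_URL + BASE                 # fetched by get_years() for the year dropdown
year_url = BASE_URL + YEAR % 2020             # fetched by get_dates() for one year's editions
items_url = BASE_URL + YEAR % 2020 + LIST % '22.12.2020'  # fetched by get_items(); date is a made-up example
print(items_url)
# https://www.bundesanzeiger.de/pub/de/amtlicher-teil?&year=2020&edition=BAnz+AT+22.12.2020
```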
@@ -55,30 +62,38 @@ def scrape(self, low=0, high=10000):

def get_years(self):
url = self.BASE_URL + self.BASE
# this is the landing page of the Bundesanzeiger
# https://www.bundesanzeiger.de/ebanzwww/wexsservlet?page.navid=to_official_part&global_data.designmode=eb
# which resolves to: https://www.bundesanzeiger.de/pub/de/amtlicher-teil
response = self.get(url)
years = []
root = lxml.html.fromstring(response.text)
selector = '#td_sub_menu_v li'
selector = '#id3' # was: selector = '#td_sub_menu_v li'
# This is the YEAR dropdown selector on top of the table (checked 2020/12/22)
for li in root.cssselect(selector):
try:
year = int(li.text_content())
years += [int(x) for x in li.text_content().split('\n') if x]
#was: year = int(li.text_content())
except ValueError:
continue
years.append(year)
#was: years.append(year)
return years
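As a standalone illustration of the dropdown-parsing approach used in get_years() above: take text_content() of the selected element, split on newlines, and keep the non-empty pieces. The markup below is a toy stand-in; the real Bundesanzeiger page and the '#id3' selector may well differ:

```python
import lxml.html

# Toy markup standing in for the year dropdown on the real page.
html = """
<html><body>
  <select id="id3">
    <option>2019</option>
    <option>2020</option>
    <option>2021</option>
  </select>
</body></html>
"""

root = lxml.html.fromstring(html)
years = []
for el in root.cssselect('#id3'):
    # text_content() concatenates the option texts; split on newlines and
    # keep only the non-empty pieces, as get_years() does above.
    years += [int(x) for x in el.text_content().split('\n') if x.strip()]
print(years)  # [2019, 2020, 2021]
```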

def get_dates(self, year):
url = self.BASE_URL + self.YEAR % year
response = self.get(url)
dates = []
root = lxml.html.fromstring(response.text)
selector = 'select[name="genericsearch_param.edition"] option'
selector = '#id4' # was: selector = 'select[name="genericsearch_param.edition"] option'
# This is the DATE dropdown selector on top of the table (checked 2020/12/22)
for option in root.cssselect(selector):
dates.append((option.attrib['value'], option.text_content().strip()))
#was: dates.append((option.attrib['value'], option.text_content().strip()))
dates += [x for x in option.text_content().split('\n') if x]
return dates

def get_items(self, year, date):
url = self.BASE_URL + self.LIST % date[0]
#url = self.BASE_URL + self.LIST % date[0]
url = self.BASE_URL + self.YEAR % year + self.LIST % date
response = self.get(url)
items = {}
root = lxml.html.fromstring(response.text)
@@ -121,14 +136,25 @@ def main(arguments):
maxyear = arguments['<maxyear>'] or 10000
minyear = int(minyear)
maxyear = int(maxyear)
print('This will scrape information from the Bundesanzeiger between ' + str(minyear) + ' and ' + str(maxyear) + '.')
print('Results will be stored in ' + arguments['<outputfile>'])
print('You will see all dates with publications appear below as they are parsed.')
banz = BAnzScraper()
data = {}
if Path(arguments['<outputfile>']).exists():
with open(arguments['<outputfile>']) as f:
data = json.load(f)
if os.path.exists(arguments['<outputfile>']):
if (sys.version_info > (3, 0)):
with open(arguments['<outputfile>']) as f:
data = json.load(f)
else:
with file(arguments['<outputfile>']) as f:
data = json.load(f)
data.update(banz.scrape(minyear, maxyear))
with open(arguments['<outputfile>'], 'w') as f:
json.dump(data, f)
if (sys.version_info > (3, 0)):
with open(arguments['<outputfile>'], 'w+', encoding='utf8') as f:
json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)
else:
with file(arguments['<outputfile>'], 'w', encoding='utf8') as f:
json.dump(data, f)
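Since the rest of this PR targets Python 3 (per the "python3-ified" commits), the version-check branches above could likely be collapsed. A minimal Python-3-only sketch of the same load/dump behaviour, using the same pretty-printing options; the same pattern recurs in bgbl_scraper.py below:

```python
import json
from pathlib import Path

def load_existing(outputfile: str) -> dict:
    """Return previously scraped data if the output file already exists, else {}."""
    path = Path(outputfile)
    if path.exists():
        with open(path, encoding='utf8') as f:
            return json.load(f)
    return {}

def dump_pretty(data: dict, outputfile: str) -> None:
    """Write the merged data with the pretty-printing options used above."""
    with open(outputfile, 'w', encoding='utf8') as f:
        json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)
```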

if __name__ == '__main__':
from docopt import docopt
24 changes: 18 additions & 6 deletions bgbl_scraper.py
@@ -16,12 +16,15 @@
from pathlib import Path
import re
import json
import sys
from collections import defaultdict

import lxml.html
import requests


# Landing page might be this one:
# https://www.bgbl.de/xaver/bgbl/start.xav#__bgbl__%2F%2F*[%40attr_id%3D'I_2020_57_inhaltsverz']__1607176275258
# https://www.bgbl.de/xaver/bgbl/start.xav?start=//*[@attr_id=%27%27]#__bgbl__%2F%2F*%5B%40attr_id%3D%27I_2020_62_inhaltsverz%27%5D__1608231069168
class BGBLScraper:
BASE_URL = 'http://www.bgbl.de/Xaver/'
START = 'start.xav?startbk=Bundesanzeiger_BGBl'
@@ -85,6 +88,7 @@ def parse(self, response):

def get_base_toc(self):
url = self.BASE_URL + self.BASE_TOC
print(url)
response = self.get(url)
root = self.parse(response)
selector = 'a.tocEntry'
@@ -214,12 +218,20 @@ def main(arguments):
maxyear = int(maxyear)
bgbl = BGBLScraper()
data = {}
if Path(arguments['<outputfile>']).exists():
with open(arguments['<outputfile>']) as f:
data = json.load(f)
if os.path.exists(arguments['<outputfile>']):
if (sys.version_info > (3, 0)):
with open(arguments['<outputfile>'], 'r') as f:
data = json.load(f)
else:
with file(arguments['<outputfile>']) as f:
data = json.load(f)
data.update(bgbl.scrape(minyear, maxyear))
with open(arguments['<outputfile>'], 'w') as f:
json.dump(data, f)
if (sys.version_info > (3, 0)):
with open(arguments['<outputfile>'], 'w+', encoding='utf8') as f:
json.dump(data, f, indent=2, sort_keys=True, ensure_ascii=False)
else:
with file(arguments['<outputfile>'], 'w') as f:
json.dump(data, f)

if __name__ == '__main__':
from docopt import docopt
289,504 changes: 289,503 additions & 1 deletion data/banz.json

Large diffs are not rendered by default.