Merge pull request #81 from openeventdata/retrain

Update dependencies, gazetteer, and models
openeventdata · Jul 14, 2020 · 05fa31d · 05fa31d
2 parents a3ccef3 + 97edbee
commit 05fa31d
Show file tree

Hide file tree

Showing 14 changed files with 653 additions and 88 deletions.
diff --git a/README.md b/README.md
@@ -51,6 +51,17 @@ Installation and Requirements
 pip install mordecai
 ```
 
+**Note**: It's *strongly* recommended that you run Mordecai in a virtual
+environment. The libraries that Mordecai depends on are not always the most
+recent versions, and a virtual environment keeps your existing libraries
+from being downgraded and avoids other conflicts:
+
+```
+python -m venv mordecai-env
+source mordecai-env/bin/activate
+pip install mordecai
+```
+
 2. You should then download the required spaCy NLP model:
 
 ```
@@ -64,7 +75,7 @@ installed first).
 
 ```
 docker pull elasticsearch:5.5.2
-wget https://s3.amazonaws.com/ahalterman-geo/geonames_index.tar.gz --output-file=wget_log.txt
+wget https://andrewhalterman.com/files/geonames_index.tar.gz --output-file=wget_log.txt
 tar -xzf geonames_index.tar.gz
 docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2
 ```
@@ -111,7 +122,7 @@ from it.
   placename. 
 
 The training data for the two models includes copyrighted text so cannot be
-shared freely, but get in touch with me if you're interested in it.
+shared freely.
 
 API and Configuration
 ---------------------
@@ -136,9 +147,10 @@ When instantiating the `Geoparser()` module, the following options can be change
     Elasticsearch database. Defaults to `True`, which gives a ~6x speedup.
 
 `geoparse` is the primary endpoint and the only one that most users will need.
-Other methods are primarily internal to Mordecai but may be directly useful in
-some cases:
+Other, mostly internal, methods may be useful in some cases:
 
+- `lookup_city` takes a city name, country, and (optionally) ADM1/state/governorate and 
+    does a rule-based lookup for the city.
 - `infer_country` take a document and attempts to infer the most probable
     country for each.
 - `query_geonames` and `query_geonames_country` can be used for performing a
@@ -149,7 +161,7 @@ some cases:
 `batch_geoparse` takes in a list of documents and uses spaCy's `nlp.pipe`
 method to process them more efficiently in the NLP step. 
 
-Advanced users on large machines can modify the `lru_cache` parameter from 250
+Advanced users on large machines can increase the `lru_cache` parameter from 250
 to 1000. This will use more memory but will increase parsing speed.
 
 Tests

diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1,17 @@
+# Mordecai examples
+
+## Geocoding cities
+
+This script is an example use of `geo.lookup_city()`. It takes a CSV with
+columns for city names, (optionally) state/ADM1 names, and three-letter
+country codes. If the columns are named (respectively) `city`, `adm1`, and
+`country`, you can run it like this:
+
+```
+python geocode_cities.py geocode_cities.csv out.csv
+```
+
+Otherwise, you'll have to specify the column names as part of the call. The
+geocoder returns lat/lon and Geonames information, along with the reason it
+selected a particular location and a caution when the results were
+ambiguous.
diff --git a/examples/geocode_cities.csv b/examples/geocode_cities.csv
@@ -0,0 +1,7 @@
+city,adm1,country
+Norman,OK,USA
+College Park,MD,USA
+Cambridge,MA,USA
+Whaugbggoan,OK,USA
+Columbia Heights,DC,USA
+Aleppo,Aleppo,SYR
diff --git a/examples/geocode_cities.py b/examples/geocode_cities.py
@@ -0,0 +1,62 @@
+import plac
+import pandas as pd
+from mordecai import Geoparser
+from tqdm import tqdm
+
+
+def main(in_file: ("input CSV file"), 
+        out_file: ("filename to write ouput to"), 
+        city_col: ("column in CSV with city col") = "city",
+         adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1", 
+         country_col: ("column in CSV with country name") = "country"):
+    """Geocode a csv with a city, ADM1, and country columns."""
+    print("Loading Mordecai...")
+    geo = Geoparser() 
+    df = pd.read_csv(in_file)
+    geocoded = []
+    print("Geocoding...")
+    for i in tqdm(df.iterrows()):
+        row = i[1]
+        if pd.isnull(row[adm1_col]):
+            # Elasticsearch doesn't like NaN, change to None
+            adm1 = None
+        else:
+            adm1 = row[adm1_col] 
+        res = geo.lookup_city(city = row[city_col], 
+                              adm1 = adm1, 
+                              country = row[country_col])
+        try:
+            gc = {"admin1_code" : res['geo']['admin1_code'],
+                  "admin2_code": res['geo']['admin2_code'],
+                  "asciiname": res['geo']['asciiname'],
+                  "name": res['geo']['name'],
+                  "geonameid": res['geo']['geonameid'],
+                  "feature_class": res['geo']['feature_class'],
+                  "feature_code": res['geo']['feature_code'],
+                  "country_code3": res['geo']['country_code3'],
+                  "lat": float(res['geo']['coordinates'].split(",")[0]),
+                  "lon": float(res['geo']['coordinates'].split(",")[1])}
+        except TypeError:
+            gc = {"admin1_code" : "",
+                  "admin2_code": "",
+                  "asciiname": "",
+                  "name": "",
+                  "geonameid": "",
+                  "feature_class": "",
+                  "feature_code": "", 
+                  "country_code3": "",
+                  "lat": "",
+                  "lon": ""}
+        gc['search_city'] = row[city_col]
+        gc['search_adm1'] = row[adm1_col]
+        gc['search_country'] = row[country_col]
+        gc["info"] = res['info']
+        gc["reason"] = res['reason']
+        geocoded.append(gc)
+    geo_df = pd.DataFrame(geocoded)
+    geo_df.to_csv(out_file)
+    print("Wrote file out to ", out_file)
+
+
+if __name__ == '__main__':
+    plac.call(main)  # plac maps CLI arguments onto main()'s parameters
diff --git a/examples/out.csv b/examples/out.csv
@@ -0,0 +1,7 @@
+,admin1_code,admin2_code,asciiname,name,geonameid,feature_class,feature_code,country_code3,lat,lon,search_city,search_adm1,search_country,info,reason
+0,OK,027,Norman,Norman,4543762,P,PPLA2,USA,35.22257,-97.43948,Norman,OK,USA,50 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."
+1,MD,033,College Park,College Park,4351977,P,PPL,USA,38.98067,-76.93692,College Park,MD,USA,2 elasticsearch matches for cities out of 37 total results of all types,Exact name match for city.
+2,ID,005,Cambridge,Cambridge,5587778,P,PPL,USA,42.45047,-112.11663,Cambridge,MA,USA,33 entries within minimum edit distance. Picking closest average distance: 2.25.,CAUTION: Best of several edit distance matches.
+3,,,,,,,,,,,Whaugbggoan,OK,USA,0 total results of all types.,FAILURE: No fuzzy match for city or neighborhood.
+4,DC,001,Columbia Heights,Columbia Heights,4138102,P,PPL,USA,38.92567,-77.02942,Columbia Heights,DC,USA,6 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."
+5,09,,Aleppo,Aleppo,170063,P,PPLA,SYR,36.20124,37.16117,Aleppo,Aleppo,SYR,9 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."