fix init, mix model folder, improve readme, fix setup.py

appeler · Aug 16, 2023 · 96908f9 · 96908f9
1 parent c9c4748
commit 96908f9
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 31 deletions.
diff --git a/README.rst b/README.rst
@@ -8,15 +8,12 @@ ethnicolr2: Predict Race and Ethnicity From Name
 .. image:: https://pepy.tech/badge/ethnicolr2
     :target: https://pepy.tech/project/ethnicolr2
 
-We exploit the US census data, and the Florida voting registration data, to predict race
-and ethnicity based on first and last name or just the last name. The granularity 
-at which we predict the race: Non-Hispanic Whites, Non-Hispanic Blacks, Asians, Hispanics, and 
-Other (Native Americans, Bi-Racial, Other).
+A PyTorch implementation of `ethnicolr <https://github.com/appeler/ethnicolr>`__ with new models that make different assumptions than ethnicolr (for instance, this package has models trained on unique names). The package uses the US census data and the Florida voting registration data to build models that predict race and ethnicity (Non-Hispanic Whites, Non-Hispanic Blacks, Asians, Hispanics, and Other) based on first and last name or just the last name. For the notebooks underlying the package, see `here <https://github.com/appeler/ethnicolr_v2>`__.
 
 Caveats and Notes
 -----------------------
 
-If you picked a person at random with the last name 'Smith' in the US in 2010 and asked us to guess this person's race (as measured by the census), the best guess would be based on what is available from the aggregated Census file. It is the Bayes Optimal Solution. So what good are last-name-only predictive models for? A few things---if you want to impute race and ethnicity for last names that are not in the census file, infer the race and ethnicity in different years than when the census was conducted (if some assumptions hold), infer the race of people in different countries (if some assumptions hold), etc. The biggest benefit comes in cases where both the first name and last name are known.
+If you picked a random person with the last name 'Smith' in the US in 2010 and asked us to guess this person's race (as measured by the census), the best guess is the modal race of people named Smith (which you can get from the census popular last name data file). It is the Bayes Optimal Solution. So what are predictive models good for? A few things---if you want to impute race and ethnicity for last names that are not in the census file (which can be because of errors), infer the race and ethnicity in different years than when the census was conducted (if some assumptions hold), infer the race of people in different countries (if some assumptions hold), etc. The biggest benefit comes in cases where both the first and last name are known.
 
 Install
 ----------
@@ -56,18 +53,18 @@ To see the available command line options for any function, please type in
 Examples
 ----------
 
-To append census data from 2010 to a `file with column header in the first row <ethnicolr/data/input-with-header.csv>`__, specify the column name carrying last names using the ``-l`` option, keeping the rest the same:
+To append census data from 2010 to a `file with column header in the first row <ethnicolr2/data/input-with-header.csv>`__, specify the column name carrying last names using the ``-l`` option, keeping the rest the same:
 
 ::
 
    census_ln -y 2010 -o output-census2010.csv -l last_name input-with-header.csv   
 
 
-To predict race/ethnicity using `Wikipedia full name model <ethnicolr/models/ethnicolr_keras_lstm_wiki_name.ipynb>`__, specify the column name of last name and first name by using ``-l`` and ``-f`` flags respectively.
+To predict race/ethnicity using the Florida Full Name Model, specify the column names of the last name and first name using the ``-l`` and ``-f`` flags respectively.
 
 ::
 
-   pred_wiki_name -o output-wiki-pred-race.csv -l last_name -f first_name input-with-header.csv
+   pred_fl_full_name -o output-wiki-pred-race.csv -l last_name -f first_name input-with-header.csv
 
 
 Functions

diff --git a/ethnicolr2/__init__.py b/ethnicolr2/__init__.py
@@ -1,8 +1,8 @@
 
-from ethnicolr.census_ln import census_ln
-from ethnicolr.pred_fl_fn_lstm import pred_fl_full_name
-from ethnicolr.pred_fl_ln_lstm import pred_fl_last_name
-from ethnicolr.pred_cen_ln_lstm import pred_census_last_name
+from ethnicolr2.census_ln import census_ln
+from ethnicolr2.pred_fl_fn_lstm import pred_fl_full_name
+from ethnicolr2.pred_fl_ln_lstm import pred_fl_last_name
+from ethnicolr2.pred_cen_ln_lstm import pred_census_last_name
 
 
 __all__ = ['census_ln', 'pred_census_ln', 'pred_wiki_ln',

diff --git a/ethnicolr2/pred_fl_fn_lstm.py b/ethnicolr2/pred_fl_fn_lstm.py
@@ -9,10 +9,10 @@
 
 
 class FullNameLstmModel(EthnicolrModelClass):
-    """ To predict ethnicity based on fullname """
+    """Predict ethnicity based on fullname """
 
-    MODEL_FN = "models/pytorch/lstm_fullname.pt"
-    VOCAB_FN = "models/pytorch/pt_vec_fullname.joblib"
+    MODEL_FN = "models/lstm_fullname.pt"
+    VOCAB_FN = "models/pt_vec_fullname.joblib"
 
     @classmethod
     def pred_fl_full_name(cls,
@@ -33,8 +33,8 @@ def pred_fl_full_name(cls,
 
         Returns:
             DataFrame: Pandas DataFrame with additional columns:
-                - `race` the predict result
-                - Additional columns for probability of each classes.
+                - `race` the prediction result
+                - Additional columns for the probability of each of the classes.
 
         """
 

diff --git a/ethnicolr2/pred_fl_ln_lstm.py b/ethnicolr2/pred_fl_ln_lstm.py
@@ -9,8 +9,8 @@
 
 
 class LastNameLstmModel(EthnicolrModelClass):
-    MODEL_FN = "models/pytorch/lstm_lastname_gen.pt"
-    VOCAB_FN = "models/pytorch/pt_vec_lastname.joblib"
+    MODEL_FN = "models/lstm_lastname_gen.pt"
+    VOCAB_FN = "models/pt_vec_lastname.joblib"
 
     @classmethod
     def pred_fl_last_name(cls,

diff --git a/setup.py b/setup.py
@@ -20,6 +20,7 @@
 with open(path.join(here, "README.rst"), encoding="utf-8") as f:
     long_description = f.read()
 
+
 class PostDevelopCommand(develop):
     """Post-installation for development mode."""
     def run(self):
@@ -29,7 +30,7 @@ class PostInstallCommand(install):
     """Post-installation for installation mode."""
     def run(self):
         install.run(self)
-
+        
 class Tox(TestCommand):
     user_options = [("tox-args=", "a", "Arguments to pass to tox")]
 
@@ -53,11 +54,11 @@ def run_tests(self):
         tox.cmdline(args=args)
 
 setup(
-    name="ethnicolr",
+    name="ethnicolr2",
     # Versions should comply with PEP440.  For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version="0.9.6",
+    version="0.1.0",
     description=("Predict Race/Ethnicity Based on Sequence of Characters"
                  " in the Name"),
     long_description=long_description,
@@ -76,8 +77,7 @@ def run_tests(self):
         #   5 - Production/Stable
         "Development Status :: 4 - Beta",
         "License :: OSI Approved :: MIT License",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
+        "Programming Language :: Python :: 3.11",
         "Topic :: Scientific/Engineering :: Information Analysis",
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Topic :: Utilities",
@@ -105,7 +105,7 @@ def run_tests(self):
     # installed, specify them here.  If using Python 2.6 or less, then these
     # have to be included in MANIFEST.in as well.
     package_data={
-        "ethnicolr": [
+        "ethnicolr2": [
             "data/census/census_2000.csv",
             "data/census/census_2010.csv",
             "data/census/readme.md",
@@ -121,17 +121,17 @@ def run_tests(self):
     # Although 'package_data' is the preferred approach, in some case you may
     # need to place data files outside of your packages. See:
     # http://docs.python.org/3.4/distutils/setupscript.html#installing-additional-files # noqa
-    # In this case, 'ethnicolr' will be installed into '<sys.prefix>/ethnicolr'
-    # data_files=[('ethnicolr', ['ethnicolr/data/test.txt'])],
+    # In this case, 'ethnicolr2' will be installed into '<sys.prefix>/ethnicolr2'
+    # data_files=[('ethnicolr2', ['ethnicolr2/data/test.txt'])],
     # To provide executable scripts, use entry points in preference to the
     # "scripts" keyword. Entry points provide cross-platform support and allow
     # pip to create the appropriate form of executable for the target platform.
     entry_points={
         "console_scripts": [
-            "census_ln=ethnicolr.census_ln:main",
-            "pred_fl_full_name=ethnicolr.pred_fl_full_name:main",
-            "pred_fl_last_name=ethnicolr.pred_fl_last_name:main",
-            "pred_census_last_name=ethnicolr.pred_census_last_name:main",
+            "census_ln=ethnicolr2.census_ln:main",
+            "pred_fl_full_name=ethnicolr2.pred_fl_full_name:main",
+            "pred_fl_last_name=ethnicolr2.pred_fl_last_name:main",
+            "pred_census_last_name=ethnicolr2.pred_census_last_name:main",
         ],
     },
     cmdclass={