Merge pull request specify#9 from calacademy-research/picturae_import

Added derived test classes for the base Picturae import classes. Created new testing modules for picturae_importer.
calacademy-research · Sep 20, 2023 · 92e5cbf · 92e5cbf
2 parents e6608ae + a21f0fa
commit 92e5cbf
Show file tree

Hide file tree

Showing 17 changed files with 282 additions and 70 deletions.
diff --git a/image_client/casbotany_sql_lite.py b/image_client/casbotany_sql_lite.py
@@ -382,6 +382,7 @@ def table_sql_list():
                         );''']
 
     return sql_bot_list
+
 def casbotany_lite_creator():
     connect = sqlite3.connect('cas_botanylite.db')
     sql_list = table_sql_list()
@@ -394,6 +395,28 @@ def casbotany_lite_creator():
     curs.close()
     connect.close()
 
+
+def sql_lite_connection(db_name):
+    """Open and return a sqlite3 connection to the database `db_name`."""
+    return sqlite3.connect(db_name)
+
+def sql_lite_insert(sql, db_name):
+    """Execute and commit one sql statement against a sqlite database.
+
+    Facsimile of insert_table_record in sql_csv_utils.py, for sqlite.
+
+    args:
+        sql: the sql statement to execute.
+        db_name: name/path of the sqlite database to connect to.
+    raises:
+        ValueError: if executing or committing the statement fails;
+                    the original exception is chained as the cause.
+    """
+    connect = sqlite3.connect(db_name)
+    curs = connect.cursor()
+    try:
+        try:
+            curs.execute(sql)
+        except Exception as e:
+            raise ValueError(f"Exception thrown while processing sql: {sql}\n{e}\n") from e
+        try:
+            connect.commit()
+        except Exception as e:
+            raise ValueError(f"sql debug: {e}") from e
+    finally:
+        # release the cursor and connection even when the statement fails,
+        # so a bad query does not leak an open handle on the database file
+        curs.close()
+        connect.close()
+
 def casbotany_lite_getrecord(sql_query: str):
     """modified get one record function for sql lite"""
     connect = sqlite3.connect('cas_botanylite.db')

diff --git a/image_client/picturae_csv_create.py b/image_client/picturae_csv_create.py
@@ -22,12 +22,12 @@
 
 
 class CsvCreatePicturae(Importer):
-    def __init__(self, date_string, istest = False):
+    def __init__(self, date_string):
         super().__init__(picturae_config, "Botany")
         self.init_all_vars(date_string)
 
-        if istest is False:
-            self.run_all()
+
+        self.run_all()
 
 
     def init_all_vars(self, date_string):

diff --git a/image_client/picturae_importer.py b/image_client/picturae_importer.py
@@ -24,10 +24,10 @@ class PicturaeImporter(Importer):
            along with attached images
     """
 
-    def __init__(self, paths, date_string=None, istest=False):
+    def __init__(self, paths, date_string=None):
         super().__init__(picturae_config, "Botany")
 
-        self.setting_init_variables(date_string=date_string, paths=paths)
+        self.init_all_vars(date_string=date_string, paths=paths)
 
         # running csv create
         CsvCreatePicturae(date_string=self.date_use)
@@ -40,11 +40,10 @@ def __init__(self, paths, date_string=None, istest=False):
 
         self.batch_md5 = generate_token(starting_time_stamp, self.file_path)
 
-        if istest is False:
-            self.run_all_methods()
+        self.run_all_methods()
 
 
-    def setting_init_variables(self, date_string, paths):
+    def init_all_vars(self, date_string, paths):
         """setting init variables:
             a list of variables and data structures to be initialized at the beginning of the class.
             args:
@@ -265,14 +264,14 @@ def create_agent_list(self, row):
             # need a way to keep ones with nas, but only split titles from real names
             if not pd.isna(first) or not pd.isna(middle) or not pd.isna(last):
                 # first name title taking priority over last
-                first_name, title = assign_titles(first_last='first', name=f"{first}")
-                last_name, title = assign_titles(first_last='last', name=f"{last}")
+                first_name, title = assign_collector_titles(first_last='first', name=f"{first}")
+                last_name, title = assign_collector_titles(first_last='last', name=f"{last}")
 
                 middle = middle
                 elements = [first_name, last_name, title, middle]
 
                 for index in range(len(elements)):
-                    if elements[index] == '':
+                    if pd.isna(elements[index]) or elements[index] == '':
                         elements[index] = pd.NA
 
                 first_name, last_name, title, middle = elements
@@ -309,7 +308,7 @@ def populate_fields(self, row):
         column_list = ['CatalogNumber', 'verbatim_date', 'start_date',
                        'end_date', 'collector_number', 'locality', 'fullname', 'taxname',
                        'gen_spec', 'qualifier', 'name_matched', 'Genus', 'Family', 'Hybrid', 'accepted_author',
-                       'name_matched', 'first_intra', 'county', 'state', 'country']
+                       'first_intra', 'county', 'state', 'country']
         # print(self.full_name)
         index_list = []
         for column in column_list:
@@ -330,15 +329,14 @@ def populate_fields(self, row):
         self.family_name = row[index_list[12]]
         self.is_hybrid = row[index_list[13]]
         self.author = row[index_list[14]]
-        self.name_matched = row[index_list[15]]
-        self.first_intra = row[index_list[16]]
+        self.first_intra = row[index_list[15]]
 
         guid_list = ['collecting_event_guid', 'collection_ob_guid', 'locality_guid', 'determination_guid']
         for guid_string in guid_list:
             setattr(self, guid_string, uuid4())
 
-        self.geography_string = str(row[index_list[17]]) + ", " + \
-                                str(row[index_list[18]]) + ", " + str(row[index_list[19]])
+        self.geography_string = str(row[index_list[16]]) + ", " + \
+                                str(row[index_list[17]]) + ", " + str(row[index_list[18]])
 
         self.GeographyID = get_one_match(self.specify_db_connection, tab_name='geography', id_col='GeographyID',
                                          key_col='FullName', match=self.geography_string)

diff --git a/image_client/sql_csv_utils.py b/image_client/sql_csv_utils.py
@@ -103,10 +103,12 @@ def insert_table_record(connection, logger_int , sql):
            connection: the connection parameter in the case of specify self.specify_db_connection
            logger: the logger instance of your class self.logger
     """
+
     cursor = connection.get_cursor()
 
     logger_int.info(f'running query: {sql}')
     logger_int.debug(sql)
+
     try:
         cursor.execute(sql)
     except Exception as e:
@@ -121,6 +123,7 @@ def insert_table_record(connection, logger_int , sql):
 
     cursor.close()
 
+
 def create_batch_record(start_time: datetime, end_time: datetime,
                       batch_size: int, batch_md5: str):
     """create_timestamps:

diff --git a/image_client/string_utils.py b/image_client/string_utils.py
@@ -52,7 +52,7 @@ def move_first_substring(string: str, n_char: int):
         return string[n_char+1:] + string[0:n_char+1]
 
 
-def assign_titles(first_last, name: str):
+def assign_collector_titles(first_last, name: str):
     """assign_titles:
             function designed to separate out titles in names into a new title column
         args:
@@ -123,7 +123,7 @@ def roman_to_int(string):
     return output
 
 
-def string_converter(df: pd.DataFrame, column: str, option: str):
+def string_to_int_converter(df: pd.DataFrame, column: str, option: str):
     """function to turn string with decimal points into string or int with no decimals
        args:
             df: dataframe to modify

diff --git a/tests/pic_csv_test_class.py b/tests/pic_csv_test_class.py
@@ -0,0 +1,10 @@
+"""test case of the CsvCreatePicturae class which runs a reduced init method to use in unittests"""
+from image_client.importer import Importer
+from image_client.picturae_csv_create import CsvCreatePicturae
+from image_client import picturae_config
+
+class TestCsvCreatePicturae(CsvCreatePicturae):
+    """CsvCreatePicturae variant with a reduced init for use in unittests.
+
+    Only runs the Importer initializer and init_all_vars, so constructing
+    an instance does not call run_all() the way CsvCreatePicturae does.
+    """
+
+    # This is a helper, not a test case: the "Test" name prefix would
+    # otherwise make pytest try (and fail, with a warning) to collect it.
+    __test__ = False
+
+    def __init__(self, date_string):
+        Importer.__init__(self, db_config_class=picturae_config, collection_name="Botany")
+        self.init_all_vars(date_string)
+
diff --git a/tests/pic_importer_test_class.py b/tests/pic_importer_test_class.py
@@ -0,0 +1,8 @@
+"""test case of the PicturaeImporter class which runs a reduced init method to use in unittests"""
+from image_client.importer import Importer
+from image_client.picturae_importer import PicturaeImporter
+from image_client import picturae_config
+class TestPicturaeImporter(PicturaeImporter):
+    """PicturaeImporter variant with a reduced init for use in unittests.
+
+    Only runs the Importer initializer and init_all_vars; the remaining
+    steps of PicturaeImporter.__init__ (which include creating the csv via
+    CsvCreatePicturae and calling run_all_methods()) are not executed.
+    """
+
+    # This is a helper, not a test case: the "Test" name prefix would
+    # otherwise make pytest try (and fail, with a warning) to collect it.
+    __test__ = False
+
+    def __init__(self, date_string, paths):
+        Importer.__init__(self, db_config_class=picturae_config, collection_name="Botany")
+        self.init_all_vars(date_string=date_string, paths=paths)
diff --git a/tests/test_agent_list.py b/tests/test_agent_list.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from tests.pic_importer_test_class import TestPicturaeImporter
+import unittest
+from tests.testing_tools import TestingTools
+
+class TestAgentList(unittest.TestCase, TestingTools):
+    """Tests that create_agent_list collects only new (unmatched) collectors."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # random md5 used as a placeholder date_string / paths value
+        self.md5_hash = self.generate_random_md5()
+
+    def setUp(self):
+        """creating instance of TestPicturaeImporter and
+           a dummy dataset of real and fake names"""
+
+        self.test_picturae_importer = TestPicturaeImporter(date_string=self.md5_hash,
+                                                         paths=self.md5_hash)
+
+        # Jose Gonzalez is a real agent,
+        # to make sure true matches are not added to list.
+        data = {'collector_first_name1': ['Bob', 'Joe'],
+                'collector_last_name1': ['Fakeson jr.', 'DiMaggio'],
+                'collector_middle_name1': ['J', 'S'],
+                'collector_first_name2': ['Enrique', pd.NA],
+                'collector_last_name2': ['de la fake', pd.NA],
+                'collector_middle_name2': ['X', pd.NA],
+                'collector_first_name3': ['Jose', pd.NA],
+                'collector_last_name3': ['Gonzalez', pd.NA],
+                'collector_middle_name3': [pd.NA, pd.NA]
+                }
+
+        self.test_picturae_importer.record_full = pd.DataFrame(data)
+
+        self.test_picturae_importer.collector_list = []
+
+    def test_agent_list(self):
+        """makes sure the correct list of dictionaries is produced of collectors,
+           where new agents are included, and old agents are excluded from new_collector_list"""
+        temp_agent_list = []
+        for _, row in self.test_picturae_importer.record_full.iterrows():
+            self.test_picturae_importer.create_agent_list(row)
+            temp_agent_list.extend(self.test_picturae_importer.new_collector_list)
+
+        # check the length before indexing, so a short list is reported as
+        # an assertion failure instead of an IndexError
+        self.assertEqual(len(temp_agent_list), 3)
+
+        first_dict, second_dict, third_dict = temp_agent_list
+
+        # pairs of (actual, expected)
+        collectors = [(first_dict['collector_first_name'], 'Bob'),
+                      (first_dict['collector_last_name'], 'Fakeson'),
+                      (first_dict['collector_title'], 'jr.'),
+                      (second_dict['collector_first_name'], 'Enrique'),
+                      (second_dict['collector_middle_initial'], 'X'),
+                      (third_dict['collector_first_name'], 'Joe'),
+                      (third_dict['collector_last_name'], 'DiMaggio'),
+                      ]
+
+        for actual, expected in collectors:
+            self.assertEqual(actual, expected)
+
+    def tearDown(self):
+        """deleting instance of self.test_picturae_importer"""
+        del self.test_picturae_importer
+
+
diff --git a/tests/test_check_record.py b/tests/test_check_record.py
@@ -1,6 +1,6 @@
 """tests to test the record_present, barcode_present and image_has_record functions."""
 import unittest
-import picturae_csv_create as pcc
+from tests.pic_csv_test_class import TestCsvCreatePicturae
 import pandas as pd
 from tests.testing_tools import TestingTools
 
@@ -13,7 +13,7 @@ def setUp(self):
           that have a small subset of representative real column names,
         """
         # initializing
-        self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istesting=True)
+        self.test_csv_create_picturae = TestCsvCreatePicturae(date_string=self.md5_hash)
 
         # creating dummy dataset, one mistake 530923 != 530924 inserted on purpose
         # the test barcode that is set to return a false is 58719322,
@@ -24,34 +24,34 @@ def setUp(self):
                                'picturae_img/cas0008708.jpg'],
                 'folder_barcode': ['2310_2', '2310_2', '2312_2']}
 
-        self.CsvCreatePicturae.record_full = pd.DataFrame(data)
+        self.test_csv_create_picturae.record_full = pd.DataFrame(data)
 
     def test_barcode_present(self):
         """checks whether boolean column added for record present"""
-        self.CsvCreatePicturae.barcode_has_record()
+        self.test_csv_create_picturae.barcode_has_record()
         # checks whether boolean column correctly added
-        self.assertEqual(len(self.CsvCreatePicturae.record_full.columns), 4)
+        self.assertEqual(len(self.test_csv_create_picturae.record_full.columns), 4)
         # checks that no NAs were dropped
-        self.assertEqual(len(self.CsvCreatePicturae.record_full), 3)
+        self.assertEqual(len(self.test_csv_create_picturae.record_full), 3)
         # checks that the correct boolean order is returned
-        test_list = list(self.CsvCreatePicturae.record_full['barcode_present'])
+        test_list = list(self.test_csv_create_picturae.record_full['barcode_present'])
         self.assertEqual(test_list, [True, False, True])
 
     def test_if_barcode_match(self):
         """tests if there is a barcode in the barcode
            column that does not match the barcode in the img file name,
            the correct boolean is returned"""
-        self.CsvCreatePicturae.check_barcode_match()
-        test_list = list(self.CsvCreatePicturae.record_full['is_barcode_match'])
+        self.test_csv_create_picturae.check_barcode_match()
+        test_list = list(self.test_csv_create_picturae.record_full['is_barcode_match'])
         self.assertEqual([False, True, True], test_list)
 
     def test_image_has_record(self):
         """tests if image_has_record returns true for
            one real attachment in test df"""
-        self.CsvCreatePicturae.image_has_record()
-        test_list = list(self.CsvCreatePicturae.record_full['image_present'])
+        self.test_csv_create_picturae.image_has_record()
+        test_list = list(self.test_csv_create_picturae.record_full['image_present'])
         self.assertEqual([True, False, False], test_list)
 
 
     def tearDown(self):
-        del self.CsvCreatePicturae
+        del self.test_csv_create_picturae
diff --git a/tests/test_col_clean.py b/tests/test_col_clean.py
@@ -2,7 +2,7 @@
 import unittest
 import os
 import pandas as pd
-import picturae_csv_create as pcc
+from tests.pic_csv_test_class import TestCsvCreatePicturae
 from tests.testing_tools import TestingTools
 
 os.chdir("./image_client")
@@ -14,7 +14,7 @@ def __init__(self, *args, **kwargs):
     def setUp(self):
         """creates dummy dataset with representative column names"""
         # initializing class
-        self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istest=True)
+        self.test_csv_create_picturae = TestCsvCreatePicturae(date_string=self.md5_hash)
         # creating dummy dataset
         numb_range = list(range(1, 101))
         column_names = ['application_batch', 'csv_batch', 'object_type', 'folder_barcode',
@@ -39,25 +39,25 @@ def setUp(self):
         new_df['Notes'] = pd.NA
         new_df['Feedback'] = pd.NA
 
-        self.CsvCreatePicturae.record_full = pd.DataFrame(new_df)
+        self.test_csv_create_picturae.record_full = pd.DataFrame(new_df)
 
     def test_if_id_cols(self):
         """test_if_id_col: tests whether certain essential
            ID columns present. Also tests, wether name columns correctly
            reformated
         """
-        self.CsvCreatePicturae.csv_colnames()
-        csv_columns = self.CsvCreatePicturae.record_full.columns
+        self.test_csv_create_picturae.csv_colnames()
+        csv_columns = self.test_csv_create_picturae.record_full.columns
         column_names = ['collector_number', 'RankID',
                         'CatalogNumber', 'collector_last_name1',
                         'collector_first_name5']
         self.assertTrue(all(column in csv_columns for column in column_names))
 
     def test_if_nas(self):
         """test_if_nas: test if any left-over columns contain only NAs"""
-        self.CsvCreatePicturae.csv_colnames()
-        self.record_full = self.CsvCreatePicturae.record_full.dropna(axis=1, how='all')
+        self.test_csv_create_picturae.csv_colnames()
+        self.record_full = self.test_csv_create_picturae.record_full.dropna(axis=1, how='all')
         self.assertEqual(len(self.record_full.columns), len(self.record_full.columns))
 
     def tearDown(self):
-        del self.CsvCreatePicturae
+        del self.test_csv_create_picturae