Skip to content

Commit

Permalink
Merge pull request specify#9 from calacademy-research/picturae_import
Browse files Browse the repository at this point in the history
added derived test classes of base picturae import classes.  Created new testing modules for picturae_importer
  • Loading branch information
foozleface committed Sep 20, 2023
2 parents e6608ae + a21f0fa commit 92e5cbf
Show file tree
Hide file tree
Showing 17 changed files with 282 additions and 70 deletions.
23 changes: 23 additions & 0 deletions image_client/casbotany_sql_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ def table_sql_list():
);''']

return sql_bot_list

def casbotany_lite_creator():
connect = sqlite3.connect('cas_botanylite.db')
sql_list = table_sql_list()
Expand All @@ -394,6 +395,28 @@ def casbotany_lite_creator():
curs.close()
connect.close()


def sql_lite_connection(db_name):
    """Open the SQLite database `db_name` and hand back the live connection.

    args:
        db_name: name/path of the sqlite database, passed unchanged to sqlite3.connect
    returns:
        the sqlite3 connection object; the caller is responsible for closing it
    """
    return sqlite3.connect(db_name)

def sql_lite_insert(sql, db_name):
    """Facsimile of insert_table_record in sql_csv_utils.py for a SQLite database.

    Opens its own connection to `db_name`, executes `sql`, commits the change and
    closes the cursor and connection again.

    args:
        sql: the complete SQL statement to execute
        db_name: name/path of the sqlite database to connect to
    raises:
        ValueError: if executing the statement or committing fails; the underlying
                    exception is chained as the cause.
    """
    connect = sqlite3.connect(db_name)
    curs = connect.cursor()
    try:
        try:
            curs.execute(sql)
        except Exception as e:
            raise ValueError(f"Exception thrown while processing sql: {sql}\n{e}\n") from e
        try:
            connect.commit()
        except Exception as e:
            raise ValueError(f"sql debug: {e}") from e
    finally:
        # runs on success and on failure, so a bad statement no longer
        # leaves the cursor and connection open
        curs.close()
        connect.close()

def casbotany_lite_getrecord(sql_query: str):
"""modified get one record function for sql lite"""
connect = sqlite3.connect('cas_botanylite.db')
Expand Down
6 changes: 3 additions & 3 deletions image_client/picturae_csv_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@


class CsvCreatePicturae(Importer):
def __init__(self, date_string, istest = False):
def __init__(self, date_string):
super().__init__(picturae_config, "Botany")
self.init_all_vars(date_string)

if istest is False:
self.run_all()

self.run_all()


def init_all_vars(self, date_string):
Expand Down
24 changes: 11 additions & 13 deletions image_client/picturae_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ class PicturaeImporter(Importer):
along with attached images
"""

def __init__(self, paths, date_string=None, istest=False):
def __init__(self, paths, date_string=None):
super().__init__(picturae_config, "Botany")

self.setting_init_variables(date_string=date_string, paths=paths)
self.init_all_vars(date_string=date_string, paths=paths)

# running csv create
CsvCreatePicturae(date_string=self.date_use)
Expand All @@ -40,11 +40,10 @@ def __init__(self, paths, date_string=None, istest=False):

self.batch_md5 = generate_token(starting_time_stamp, self.file_path)

if istest is False:
self.run_all_methods()
self.run_all_methods()


def setting_init_variables(self, date_string, paths):
def init_all_vars(self, date_string, paths):
"""setting init variables:
a list of variables and data structures to be initialized at the beginning of the class.
args:
Expand Down Expand Up @@ -265,14 +264,14 @@ def create_agent_list(self, row):
# need a way to keep ones with nas, but only split titles from real names
if not pd.isna(first) or not pd.isna(middle) or not pd.isna(last):
# first name title taking priority over last
first_name, title = assign_titles(first_last='first', name=f"{first}")
last_name, title = assign_titles(first_last='last', name=f"{last}")
first_name, title = assign_collector_titles(first_last='first', name=f"{first}")
last_name, title = assign_collector_titles(first_last='last', name=f"{last}")

middle = middle
elements = [first_name, last_name, title, middle]

for index in range(len(elements)):
if elements[index] == '':
if pd.isna(elements[index]) or elements[index] == '':
elements[index] = pd.NA

first_name, last_name, title, middle = elements
Expand Down Expand Up @@ -309,7 +308,7 @@ def populate_fields(self, row):
column_list = ['CatalogNumber', 'verbatim_date', 'start_date',
'end_date', 'collector_number', 'locality', 'fullname', 'taxname',
'gen_spec', 'qualifier', 'name_matched', 'Genus', 'Family', 'Hybrid', 'accepted_author',
'name_matched', 'first_intra', 'county', 'state', 'country']
'first_intra', 'county', 'state', 'country']
# print(self.full_name)
index_list = []
for column in column_list:
Expand All @@ -330,15 +329,14 @@ def populate_fields(self, row):
self.family_name = row[index_list[12]]
self.is_hybrid = row[index_list[13]]
self.author = row[index_list[14]]
self.name_matched = row[index_list[15]]
self.first_intra = row[index_list[16]]
self.first_intra = row[index_list[15]]

guid_list = ['collecting_event_guid', 'collection_ob_guid', 'locality_guid', 'determination_guid']
for guid_string in guid_list:
setattr(self, guid_string, uuid4())

self.geography_string = str(row[index_list[17]]) + ", " + \
str(row[index_list[18]]) + ", " + str(row[index_list[19]])
self.geography_string = str(row[index_list[16]]) + ", " + \
str(row[index_list[17]]) + ", " + str(row[index_list[18]])

self.GeographyID = get_one_match(self.specify_db_connection, tab_name='geography', id_col='GeographyID',
key_col='FullName', match=self.geography_string)
Expand Down
3 changes: 3 additions & 0 deletions image_client/sql_csv_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,10 +103,12 @@ def insert_table_record(connection, logger_int , sql):
connection: the connection parameter in the case of specify self.specify_db_connection
logger: the logger instance of your class self.logger
"""

cursor = connection.get_cursor()

logger_int.info(f'running query: {sql}')
logger_int.debug(sql)

try:
cursor.execute(sql)
except Exception as e:
Expand All @@ -121,6 +123,7 @@ def insert_table_record(connection, logger_int , sql):

cursor.close()


def create_batch_record(start_time: datetime, end_time: datetime,
batch_size: int, batch_md5: str):
"""create_timestamps:
Expand Down
4 changes: 2 additions & 2 deletions image_client/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def move_first_substring(string: str, n_char: int):
return string[n_char+1:] + string[0:n_char+1]


def assign_titles(first_last, name: str):
def assign_collector_titles(first_last, name: str):
"""assign_titles:
function designed to separate out titles in names into a new title column
args:
Expand Down Expand Up @@ -123,7 +123,7 @@ def roman_to_int(string):
return output


def string_converter(df: pd.DataFrame, column: str, option: str):
def string_to_int_converter(df: pd.DataFrame, column: str, option: str):
"""function to turn string with decimal points into string or int with no decimals
args:
df: dataframe to modify
Expand Down
10 changes: 10 additions & 0 deletions tests/pic_csv_test_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""test case of the CsvCreatePicturae class which runs a reduced init method to use in unittests"""
from image_client.importer import Importer
from image_client.picturae_csv_create import CsvCreatePicturae
from image_client import picturae_config

class TestCsvCreatePicturae(CsvCreatePicturae):
    """CsvCreatePicturae variant for unit tests with a reduced constructor.

    The constructor calls Importer.__init__ and init_all_vars directly instead of
    going through CsvCreatePicturae.__init__, so tests can call individual
    methods on the instance.
    """

    # The "Test" prefix makes pytest treat this helper as a test class when it is
    # imported into a test module; it has an __init__ and holds no tests, so opt out.
    __test__ = False

    def __init__(self, date_string):
        Importer.__init__(self, db_config_class=picturae_config, collection_name="Botany")
        self.init_all_vars(date_string)

8 changes: 8 additions & 0 deletions tests/pic_importer_test_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""test case of the PicturaeImporter class which runs a reduced init method to use in unittests"""
from image_client.importer import Importer
from image_client.picturae_importer import PicturaeImporter
from image_client import picturae_config
class TestPicturaeImporter(PicturaeImporter):
    """PicturaeImporter variant for unit tests with a reduced constructor.

    The constructor calls Importer.__init__ and init_all_vars directly instead of
    going through PicturaeImporter.__init__, so tests can call individual
    methods on the instance.
    """

    # The "Test" prefix makes pytest treat this helper as a test class when it is
    # imported into a test module; it has an __init__ and holds no tests, so opt out.
    __test__ = False

    def __init__(self, date_string, paths):
        Importer.__init__(self, db_config_class=picturae_config, collection_name="Botany")
        self.init_all_vars(date_string=date_string, paths=paths)
61 changes: 61 additions & 0 deletions tests/test_agent_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
from tests.pic_importer_test_class import TestPicturaeImporter
import unittest
from tests.testing_tools import TestingTools

class TestAgentList(unittest.TestCase, TestingTools):
    """Tests for create_agent_list on the reduced-init TestPicturaeImporter."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # random md5 string, used below as a stand-in for both date_string and paths
        self.md5_hash = self.generate_random_md5()

    def setUp(self):
        """creating instance of PicturaeImporter, +
           creating dummy dataset of real and fake names"""

        self.test_picturae_importer = TestPicturaeImporter(date_string=self.md5_hash,
                                                           paths=self.md5_hash)

        # Jose Gonzalez is a real agent, included
        # to make sure true matches are not added to the list.
        data = {'collector_first_name1': ['Bob', 'Joe'],
                'collector_last_name1': ['Fakeson jr.', 'DiMaggio'],
                'collector_middle_name1': ['J', 'S'],
                'collector_first_name2': ['Enrique', pd.NA],
                'collector_last_name2': ['de la fake', pd.NA],
                'collector_middle_name2': ['X', pd.NA],
                'collector_first_name3': ['Jose', pd.NA],
                'collector_last_name3': ['Gonzalez', pd.NA],
                'collector_middle_name3': [pd.NA, pd.NA]
                }

        self.test_picturae_importer.record_full = pd.DataFrame(data)

        self.test_picturae_importer.collector_list = []

    def test_agent_list(self):
        """makes sure the correct list of dictionaries is produced of collectors,
           where new agents are included, and old agents are excluded from new_collector_list"""
        temp_agent_list = []
        for _, row in self.test_picturae_importer.record_full.iterrows():
            self.test_picturae_importer.create_agent_list(row)
            temp_agent_list.extend(self.test_picturae_importer.new_collector_list)

        # check the size before unpacking, so a short or long list is reported
        # as a failed assertion rather than an IndexError/ValueError
        self.assertEqual(len(temp_agent_list), 3)

        first_dict, second_dict, third_dict = temp_agent_list

        # (agent dictionary, key, expected value)
        collectors = [(first_dict, 'collector_first_name', 'Bob'),
                      (first_dict, 'collector_last_name', 'Fakeson'),
                      (first_dict, 'collector_title', 'jr.'),
                      (second_dict, 'collector_first_name', 'Enrique'),
                      (second_dict, 'collector_middle_initial', 'X'),
                      (third_dict, 'collector_first_name', 'Joe'),
                      (third_dict, 'collector_last_name', 'DiMaggio'),
                      ]

        for agent_dict, key, expected in collectors:
            # subTest lets every mismatch be reported, not only the first one
            with self.subTest(key=key, expected=expected):
                self.assertEqual(agent_dict[key], expected)

    def tearDown(self):
        """deleting instance of self.PicturaeImporter"""
        del self.test_picturae_importer


24 changes: 12 additions & 12 deletions tests/test_check_record.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""tests to test the record_present, barcode_present and image_has_record functions."""
import unittest
import picturae_csv_create as pcc
from tests.pic_csv_test_class import TestCsvCreatePicturae
import pandas as pd
from tests.testing_tools import TestingTools

Expand All @@ -13,7 +13,7 @@ def setUp(self):
that have a small subset of representative real column names,
"""
# initializing
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istesting=True)
self.test_csv_create_picturae = TestCsvCreatePicturae(date_string=self.md5_hash)

# creating dummy dataset, one mistake 530923 != 530924 inserted on purpose
# the test barcode that is set to return a false is 58719322,
Expand All @@ -24,34 +24,34 @@ def setUp(self):
'picturae_img/cas0008708.jpg'],
'folder_barcode': ['2310_2', '2310_2', '2312_2']}

self.CsvCreatePicturae.record_full = pd.DataFrame(data)
self.test_csv_create_picturae.record_full = pd.DataFrame(data)

def test_barcode_present(self):
"""checks whether boolean column added for record present"""
self.CsvCreatePicturae.barcode_has_record()
self.test_csv_create_picturae.barcode_has_record()
# checks whether boolean column correctly added
self.assertEqual(len(self.CsvCreatePicturae.record_full.columns), 4)
self.assertEqual(len(self.test_csv_create_picturae.record_full.columns), 4)
# checks that no NAs were dropped
self.assertEqual(len(self.CsvCreatePicturae.record_full), 3)
self.assertEqual(len(self.test_csv_create_picturae.record_full), 3)
# checks that the correct boolean order is returned
test_list = list(self.CsvCreatePicturae.record_full['barcode_present'])
test_list = list(self.test_csv_create_picturae.record_full['barcode_present'])
self.assertEqual(test_list, [True, False, True])

def test_if_barcode_match(self):
"""tests if there is a barcode in the barcode
column that does not match the barcode in the img file name,
the correct boolean is returned"""
self.CsvCreatePicturae.check_barcode_match()
test_list = list(self.CsvCreatePicturae.record_full['is_barcode_match'])
self.test_csv_create_picturae.check_barcode_match()
test_list = list(self.test_csv_create_picturae.record_full['is_barcode_match'])
self.assertEqual([False, True, True], test_list)

def test_image_has_record(self):
"""tests if image_has_record returns true for
one real attachment in test df"""
self.CsvCreatePicturae.image_has_record()
test_list = list(self.CsvCreatePicturae.record_full['image_present'])
self.test_csv_create_picturae.image_has_record()
test_list = list(self.test_csv_create_picturae.record_full['image_present'])
self.assertEqual([True, False, False], test_list)


def tearDown(self):
del self.CsvCreatePicturae
del self.test_csv_create_picturae
16 changes: 8 additions & 8 deletions tests/test_col_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest
import os
import pandas as pd
import picturae_csv_create as pcc
from tests.pic_csv_test_class import TestCsvCreatePicturae
from tests.testing_tools import TestingTools

os.chdir("./image_client")
Expand All @@ -14,7 +14,7 @@ def __init__(self, *args, **kwargs):
def setUp(self):
"""creates dummy dataset with representative column names"""
# initializing class
self.CsvCreatePicturae = pcc.CsvCreatePicturae(date_string=self.md5_hash, istest=True)
self.test_csv_create_picturae = TestCsvCreatePicturae(date_string=self.md5_hash)
# creating dummy dataset
numb_range = list(range(1, 101))
column_names = ['application_batch', 'csv_batch', 'object_type', 'folder_barcode',
Expand All @@ -39,25 +39,25 @@ def setUp(self):
new_df['Notes'] = pd.NA
new_df['Feedback'] = pd.NA

self.CsvCreatePicturae.record_full = pd.DataFrame(new_df)
self.test_csv_create_picturae.record_full = pd.DataFrame(new_df)

def test_if_id_cols(self):
"""test_if_id_col: tests whether certain essential
ID columns are present. Also tests whether name columns are correctly
reformatted
"""
self.CsvCreatePicturae.csv_colnames()
csv_columns = self.CsvCreatePicturae.record_full.columns
self.test_csv_create_picturae.csv_colnames()
csv_columns = self.test_csv_create_picturae.record_full.columns
column_names = ['collector_number', 'RankID',
'CatalogNumber', 'collector_last_name1',
'collector_first_name5']
self.assertTrue(all(column in csv_columns for column in column_names))

def test_if_nas(self):
    """test_if_nas: test if any left-over columns contain only NAs"""
    self.test_csv_create_picturae.csv_colnames()
    record_full = self.test_csv_create_picturae.record_full
    # Compare against the frame as it was before dropping all-NA columns;
    # comparing the dropped frame with itself could never fail.
    self.record_full = record_full.dropna(axis=1, how='all')
    self.assertEqual(len(self.record_full.columns), len(record_full.columns))

def tearDown(self):
del self.CsvCreatePicturae
del self.test_csv_create_picturae

0 comments on commit 92e5cbf

Please sign in to comment.