-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
108 lines (84 loc) · 4.68 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import argparse
import os
import re
import json
import pandas as pd
import numpy as np
from preprocessing._utils import get_usda_urls, dict_to_json, fillna_and_define_dtype, postprocess_stacked_df, fillna_and_set_dtypes
from preprocessing.process_foundation import process_foundation
from preprocessing.process_srlegacy import process_srlegacy
from preprocessing.process_branded import process_branded
# ---------------------------------------------------------------------
# ---- Parse command-line arguments ----
# ---------------------------------------------------------------------
# CLI: output directory, output filename, and whether to keep intermediates.
parser = argparse.ArgumentParser(description='download and process USDA Food Data Central datasets.')
parser.add_argument('--output_dir', default='fdc_data', help='specify output directory path (default: fdc_data)')
# Help text fixed: state the actual default and close the paren.
parser.add_argument('--filename', default='usda_food_nutrition_data.parquet', help='specify output filename (default: usda_food_nutrition_data.parquet)')
parser.add_argument('--keep_files', action='store_true', help='keep raw/indv files post-processing (default: delete raw/indv files)')
args = parser.parse_args()

# Ensure the output filename carries an extension; if the user supplied a
# bare name, append '.csv' (the final write below is CSV) rather than
# discarding their chosen name.
filename = args.filename
file_ext = filename.split('.')[-1] if '.' in filename else None
if not file_ext:
    filename = f"{filename}.csv"

# Check keep_files flag
keep_files = args.keep_files
if keep_files:
    print(f"\n'keep_files' flag specified:\n> Raw and individual files will be kept after processing.")
# ---------------------------------------------------------------------
# ---- Setup raw and output directories ----
# ---------------------------------------------------------------------
# Define directories and ensure they exist (raw_dir existence will be checked
# in individual processing files).
OUTPUT_DIR = args.output_dir
RAW_DIR = os.path.join(OUTPUT_DIR, 'FoodData_Central_raw')
print(f'\nInitializing processing of USDA FDC data. Output directory set to:\n> {OUTPUT_DIR}\n')
if not os.path.exists(OUTPUT_DIR):
    # exist_ok=True guards against the race where another process creates the
    # directory between the exists() check and makedirs(); the "created" print
    # still only fires when the directory was absent at check time.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f'Directory created:\n> {OUTPUT_DIR}\n')
# ---------------------------------------------------------------------
# ---- Process USDA datasets ----
# ---------------------------------------------------------------------
# Fetch the published USDA FDC download urls, then hand each dataset's
# url(s) to its dedicated processing routine.
all_urls = get_usda_urls()

foundation_csv_urls = [
    link for link in all_urls
    if 'foundation' in link or 'FoodData_Central_csv' in link
]
legacy_csv_url = [link for link in all_urls if 'sr_legacy' in link][0]
branded_csv_url = [link for link in all_urls if 'branded' in link][0]

process_foundation(foundation_csv_urls, OUTPUT_DIR, RAW_DIR, keep_files)
process_srlegacy(legacy_csv_url, OUTPUT_DIR, RAW_DIR, keep_files)
process_branded(branded_csv_url, OUTPUT_DIR, RAW_DIR, keep_files)
# ---------------------------------------------------------------------
# ---- Postprocess processed USDA datasets ----
# ---------------------------------------------------------------------
# List the individually processed parquet files found under OUTPUT_DIR.
print(f'Initializing stacking of individually processed data:')
for walk_root, walk_dirs, walk_files in os.walk(OUTPUT_DIR):
    for walked_name in walk_files:
        if '.parquet' in walked_name:
            print(f'> {walked_name}')

# Stack processed data, reorder columns, and save csv
part_files = (
    'processed_foundation.parquet',
    'processed_srlegacy.parquet',
    'processed_branded.parquet',
)
stacked_data = pd.concat(
    pd.read_parquet(os.path.join(OUTPUT_DIR, part)) for part in part_files
)

# apply some post-processing
# (select columns, fillna, set dtypes, fill in missing food_common_name and food_common_category values by food_description groups)
stacked_data = postprocess_stacked_df(stacked_data)
# ---------------------------------------------------------------------
# ---- Save final output data and cleanup directories ----
# ---------------------------------------------------------------------
# NOTE(review): output is written as CSV regardless of the extension embedded
# in `filename` (the default is '.parquet') — confirm this is intentional.
stacked_data.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

# Delete raw dir/individual processed files unless --keep_files was given.
# The flag check is hoisted so the tree is not walked at all when keeping,
# and the three duplicated if-blocks are collapsed into one membership test —
# this also prevents a double os.remove() (FileNotFoundError) if a name were
# ever to match more than one marker.
if not keep_files:
    intermediate_markers = ('foundation', 'srlegacy', 'branded')
    for root, dirs, files in os.walk(OUTPUT_DIR):
        for file in files:
            if any(marker in file for marker in intermediate_markers):
                os.remove(os.path.join(root, file))

# Report the actual output filename (was a hard-coded placeholder).
print(f"\nProcessing of USDA FDC data is complete. The processed data file ('{filename}') is now available in:\n> {OUTPUT_DIR}\n")