# cms_api_INA.py
# To be run weekly, on Tues. Pulls data from CMS api.
import pandas as pd
import requests
import json
from pandas.tseries.offsets import Week
from datetime import datetime, timedelta, date
from dateutil.relativedelta import relativedelta, MO, SU
import time
import boto3
# Locate the "latest" API distribution for the COVID-19 Nursing Home dataset
# in the CMS data catalog, page through the last ~8 weeks of rows (filtered by
# week_ending Sunday), and save the combined result as a parquet file.
url = "https://data.cms.gov/data.json"
title = "COVID-19 Nursing Home Data"

response = requests.request("GET", url)

# Walk the catalog for the target dataset's "latest" API distribution.
latest_distro = None
if response.ok:
    catalog = response.json()
    for ds in catalog['dataset']:  # renamed from `set` — shadowed the builtin
        if title == ds['title']:
            for distro in ds['distribution']:
                if 'format' in distro and 'description' in distro:
                    if distro['format'] == "API" and distro['description'] == "latest":
                        latest_distro = distro['accessURL']
                        print(f"The latest data for {title} can be found at {latest_distro} or {ds['identifier']}")

# Fail fast with a clear message instead of a NameError further down when the
# catalog request failed or the expected distribution is missing.
if latest_distro is None:
    raise RuntimeError(f"Could not find the latest API distribution for '{title}' at {url}")

# Total row count for the distribution, used to bound the pagination loop.
stats_endpoint = latest_distro + "/stats"
stats_response = requests.request("GET", stats_endpoint).json()
total_rows = stats_response['total_rows']
print(f"total rows: {total_rows}")

# Week-ending window: the most recent completed Sunday back through the Sunday
# 8 weeks before it (weekday=SU rolls forward to Sunday after the subtraction).
date_today = date.today()
end_wk_end_date = date_today - relativedelta(weeks=1, weekday=SU)
start_wk_end_date = end_wk_end_date - relativedelta(weeks=8, weekday=SU)
week = timedelta(days=7)

latest_data = []
size = 5000  # page size per request

# One pass per week_ending value; within each week, page by offset until an
# empty page signals no more rows for that week.
wk_end_date = start_wk_end_date
while wk_end_date <= end_wk_end_date:
    for offset in range(0, total_rows, size):
        offset_url = f"{latest_distro}?filter[week_ending]={wk_end_date}&offset={offset}&size={size}"
        print("Requesting", offset_url)  # log the URL actually requested
        offset_response = requests.request("GET", offset_url)
        data = offset_response.json()
        print(f"Made request for {size} results at offset {offset}")
        if len(data) == 0:
            break
        latest_data.extend(data)
        time.sleep(3)  # be polite to the API between pages
    print("---")
    wk_end_date = wk_end_date + week

df_latest_data = pd.DataFrame(latest_data)
print(df_latest_data)

# Save the raw pull as parquet for downstream pre-processing.
df_latest_data.to_parquet("data_pre_proc/nh_pre_proc_raw.parquet", engine='auto', compression='snappy', index=None, partition_cols=None)