feat: Onboard Chicago Crime dataset (#199)
dipannitab2392 committed Oct 21, 2021
1 parent d4b946b commit d766547
Showing 10 changed files with 698 additions and 0 deletions.
37 changes: 37 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,37 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The base image for this build
FROM python:3.8

# Allow statements and log messages to appear in Cloud logs
ENV PYTHONUNBUFFERED True

# Copy the requirements file into the image
COPY requirements.txt ./

# Install the packages specified in the requirements file
RUN python3 -m pip install --no-cache-dir -r requirements.txt

# The WORKDIR instruction sets the working directory for any RUN, CMD,
# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
# If the WORKDIR doesn’t exist, it will be created even if it’s not used in
# any subsequent Dockerfile instruction
WORKDIR /custom

# Copy the data processing script(s) into the image under /custom/
COPY ./csv_transform.py .

# Command to run the data processing script when the container is run
CMD ["python3", "csv_transform.py"]
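For local testing, the image can be built and run with the environment variables that csv_transform.py reads. A minimal sketch, expressed as Python subprocess calls; the image tag, source URL, bucket, and chunk size below are placeholder values, not part of this commit, and credentials for the GCS upload step are left out:

# Hypothetical local build-and-run of the transform image; all values are placeholders.
import subprocess

subprocess.run(["docker", "build", "-t", "chicago-crime-transform", "."], check=True)
subprocess.run(
    [
        "docker", "run", "--rm",
        "-e", "SOURCE_URL=https://example.com/chicago_crime.csv",
        "-e", "SOURCE_FILE=files/data.csv",
        "-e", "TARGET_FILE=files/data_output.csv",
        "-e", "TARGET_GCS_BUCKET=my-scratch-bucket",
        "-e", "TARGET_GCS_PATH=data/chicago_crime/crime/data_output.csv",
        "-e", "CHUNK_SIZE=500000",
        "chicago-crime-transform",
    ],
    check=True,
)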
248 changes: 248 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,248 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import datetime
import logging
import math
import os
import pathlib
import subprocess
import typing

import pandas as pd
import requests
from google.cloud import storage


def main(
source_url: str,
source_file: pathlib.Path,
target_file: pathlib.Path,
target_gcs_bucket: str,
target_gcs_path: str,
chunk_size: str,
) -> None:

logging.info(
"Chicago Crime process started at "
+ str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
)

logging.info("Creating 'files' folder")
pathlib.Path("./files").mkdir(parents=True, exist_ok=True)

logging.info(f"Downloading file {source_url}")
download_file(source_url, source_file)

with pd.read_csv(
source_file,
chunksize=int(chunk_size),
) as reader:
for chunk_number, chunk in enumerate(reader):
logging.info(f"Processing batch {chunk_number}")
target_file_batch = str(target_file).replace(
".csv", "-" + str(chunk_number) + ".csv"
)
            # Work on the current chunk as a standalone DataFrame
            df = chunk.copy()

logging.info(f"Transforming {source_file} ...")

            logging.info("Transform: Renaming columns..")
            rename_headers(df)

            logging.info("Transform: Converting date formats..")
            convert_values(df)

            logging.info("Transform: Removing rows with null keys..")
            filter_null_rows(df)

            logging.info("Transform: Converting id columns to integer strings..")
            convert_values_to_integer_string(df)

            logging.info("Transform: Removing NaN values from coordinate columns..")
            removing_nan_values(df)

logging.info("Transform: Reordering headers..")
df = df[
[
"unique_key",
"case_number",
"date",
"block",
"iucr",
"primary_type",
"description",
"location_description",
"arrest",
"domestic",
"beat",
"district",
"ward",
"community_area",
"fbi_code",
"x_coordinate",
"y_coordinate",
"year",
"updated_on",
"latitude",
"longitude",
"location",
]
]

process_chunk(df, target_file_batch)

logging.info(f"Appending batch {chunk_number} to {target_file}")
if chunk_number == 0:
subprocess.run(["cp", target_file_batch, target_file])
else:
subprocess.check_call(f"sed -i '1d' {target_file_batch}", shell=True)
subprocess.check_call(
f"cat {target_file_batch} >> {target_file}", shell=True
)
subprocess.run(["rm", target_file_batch])

logging.info(
f"Uploading output file to.. gs://{target_gcs_bucket}/{target_gcs_path}"
)
upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path)

    logging.info(
        "Chicago Crime process completed at "
        + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    )


def process_chunk(df: pd.DataFrame, target_file_batch: str) -> None:

logging.info(f"Saving to output file.. {target_file_batch}")
try:
save_to_new_file(df, file_path=str(target_file_batch))
except Exception as e:
logging.error(f"Error saving output file: {e}.")
logging.info("..Done!")


def resolve_nan(value: typing.Union[str, float]) -> str:
    # Blank out empty values and NaNs; strip stray "None" markers
    if not value or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(value).replace("None", "")


def removing_nan_values(df: pd.DataFrame) -> None:
    cols = ["x_coordinate", "y_coordinate", "latitude", "longitude"]
    for col in cols:
        df[col] = df[col].apply(resolve_nan)


def convert_to_integer_string(value: typing.Union[str, float]) -> str:
    # Render numeric codes as integer strings; blank out empty values and NaNs
    if not value or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(int(round(float(value))))


def convert_values_to_integer_string(df: pd.DataFrame) -> None:
    cols = ["unique_key", "beat", "district", "ward", "community_area", "year"]
    for col in cols:
        df[col] = df[col].apply(convert_to_integer_string)


def rename_headers(df: pd.DataFrame) -> None:
header_names = {
"ID": "unique_key",
"Case Number": "case_number",
"Date": "date",
"Block": "block",
"IUCR": "iucr",
"Primary Type": "primary_type",
"Description": "description",
"Location Description": "location_description",
"Arrest": "arrest",
"Domestic": "domestic",
"Beat": "beat",
"District": "district",
"Ward": "ward",
"Community Area": "community_area",
"FBI Code": "fbi_code",
"X Coordinate": "x_coordinate",
"Y Coordinate": "y_coordinate",
"Year": "year",
"Updated On": "updated_on",
"Latitude": "latitude",
"Longitude": "longitude",
"Location": "location",
}

df.rename(columns=header_names, inplace=True)


def convert_dt_format(dt_str: str) -> str:
    # Source format: MM/dd/yyyy hh:mm:ss AM/PM -> target format: yyyy-MM-dd HH:mm:ss
    if not dt_str:
        return dt_str
    return datetime.datetime.strptime(dt_str, "%m/%d/%Y %I:%M:%S %p").strftime(
        "%Y-%m-%d %H:%M:%S"
    )


def convert_values(df: pd.DataFrame) -> None:
dt_cols = ["date", "updated_on"]

for dt_col in dt_cols:
df[dt_col] = df[dt_col].apply(convert_dt_format)


def filter_null_rows(df: pd.DataFrame) -> None:
    # Drop rows with an empty unique_key in place so the caller sees the result
    df.drop(df[df.unique_key == ""].index, inplace=True)


def save_to_new_file(df: pd.DataFrame, file_path: pathlib.Path) -> None:
df.to_csv(file_path, index=False)


def download_file(source_url: str, source_file: pathlib.Path) -> None:
    logging.info(f"Downloading {source_url} into {source_file}")
    r = requests.get(source_url, stream=True)
    if r.status_code == 200:
        with open(source_file, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        logging.error(f"Couldn't download {source_url}: {r.text}")


def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
storage_client = storage.Client()
bucket = storage_client.bucket(gcs_bucket)
blob = bucket.blob(gcs_path)
blob.upload_from_filename(file_path)


if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)

main(
source_url=os.environ["SOURCE_URL"],
source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
target_gcs_path=os.environ["TARGET_GCS_PATH"],
chunk_size=os.environ["CHUNK_SIZE"],
)
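As a standalone illustration of the timestamp rewrite that convert_dt_format performs (the input value below is made up for the example):

# Illustrative snippet, independent of the pipeline: 12-hour source timestamps
# are rewritten as 24-hour ISO-style strings.
import datetime

dt_str = "01/15/2021 03:30:00 PM"
converted = datetime.datetime.strptime(dt_str, "%m/%d/%Y %I:%M:%S %p").strftime(
    "%Y-%m-%d %H:%M:%S"
)
print(converted)  # 2021-01-15 15:30:00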
3 changes: 3 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,3 @@
requests
pandas
google-cloud-storage
26 changes: 26 additions & 0 deletions datasets/chicago_crime/_terraform/chicago_crime_dataset.tf
@@ -0,0 +1,26 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


resource "google_bigquery_dataset" "chicago_crime" {
dataset_id = "chicago_crime"
project = var.project_id
description = "This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days. Data is extracted from the Chicago Police Department\u0027s CLEAR (Citizen Law Enforcement Analysis and Reporting) system. In order to protect the privacy of crime victims, addresses are shown at the block level only and specific locations are not identified. This data includes unverified reports supplied to the Police Department. The preliminary crime classifications may be changed at a later date based upon additional investigation and there is always the possibility of mechanical or human error. Therefore, the Chicago Police Department does not guarantee (either expressed or implied) the accuracy, completeness, timeliness, or correct sequencing of the information and the information should not be used for comparison purposes over time.\n\nDataset Source: City of Chicago\n\nCategory: Chicago, Public Safety\n\nUse: This dataset is publicly available for anyone to use under the following terms provided by the Dataset Source \u2014https://data.cityofchicago.org \u2014 and is provided \"AS IS\" without any warranty, express or implied, from Google. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.\n\nUpdate Frequency: Daily"
}

output "bigquery_dataset-chicago_crime-dataset_id" {
value = google_bigquery_dataset.chicago_crime.dataset_id
}
39 changes: 39 additions & 0 deletions datasets/chicago_crime/_terraform/crime_pipeline.tf
@@ -0,0 +1,39 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


resource "google_bigquery_table" "crime" {
project = var.project_id
dataset_id = "chicago_crime"
table_id = "crime"

  description = "Chicago Crime dataset"

  depends_on = [
google_bigquery_dataset.chicago_crime
]
}

output "bigquery_table-crime-table_id" {
value = google_bigquery_table.crime.table_id
}

output "bigquery_table-crime-id" {
value = google_bigquery_table.crime.id
}
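Once terraform apply has run, the provisioned table can be sanity-checked from Python with the google-cloud-bigquery client; a sketch assuming a placeholder project ID:

# Hypothetical verification that the Terraform-provisioned table exists.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project ID
table = client.get_table("my-project.chicago_crime.crime")
print(table.full_table_id, table.num_rows)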
28 changes: 28 additions & 0 deletions datasets/chicago_crime/_terraform/provider.tf
@@ -0,0 +1,28 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


provider "google" {
project = var.project_id
impersonate_service_account = var.impersonating_acct
region = var.region
}

data "google_client_openid_userinfo" "me" {}

output "impersonating-account" {
value = data.google_client_openid_userinfo.me.email
}
