feat: Onboard Chicago Crime dataset (#199)
dipannitab2392 committed Oct 21, 2021
1 parent d4b946b commit d766547
Showing 10 changed files with 698 additions and 0 deletions.
37 changes: 37 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/Dockerfile
@@ -0,0 +1,37 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# The base image for this build
FROM python:3.8

# Allow statements and log messages to appear in Cloud logs
ENV PYTHONUNBUFFERED True

# Copy the requirements file into the image
COPY requirements.txt ./

# Install the packages specified in the requirements file
RUN python3 -m pip install --no-cache-dir -r requirements.txt

# The WORKDIR instruction sets the working directory for any RUN, CMD,
# ENTRYPOINT, COPY and ADD instructions that follow it in the Dockerfile.
# If the WORKDIR doesn’t exist, it will be created even if it’s not used in
# any subsequent Dockerfile instruction
WORKDIR /custom

# Copy the data processing script(s) into the image under /custom/
COPY ./csv_transform.py .

# Command to run the data processing script when the container is run
CMD ["python3", "csv_transform.py"]
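For local testing, the image can be built and run with the environment variables that csv_transform.py reads. A minimal sketch, expressed as Python subprocess calls; the image tag, source URL, bucket, and chunk size below are placeholder values, not part of this commit, and credentials for the GCS upload step are left out:

# Hypothetical local build-and-run of the transform image; all values are placeholders.
import subprocess

subprocess.run(["docker", "build", "-t", "chicago-crime-transform", "."], check=True)
subprocess.run(
    [
        "docker", "run", "--rm",
        "-e", "SOURCE_URL=https://example.com/chicago_crime.csv",
        "-e", "SOURCE_FILE=files/data.csv",
        "-e", "TARGET_FILE=files/data_output.csv",
        "-e", "TARGET_GCS_BUCKET=my-scratch-bucket",
        "-e", "TARGET_GCS_PATH=data/chicago_crime/crime/data_output.csv",
        "-e", "CHUNK_SIZE=500000",
        "chicago-crime-transform",
    ],
    check=True,
)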
248 changes: 248 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/csv_transform.py
@@ -0,0 +1,248 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import datetime
import logging
import math
import os
import pathlib
import subprocess
import typing

import pandas as pd
import requests
from google.cloud import storage


def main(
source_url: str,
source_file: pathlib.Path,
target_file: pathlib.Path,
target_gcs_bucket: str,
target_gcs_path: str,
chunk_size: str,
) -> None:

logging.info(
"Chicago Crime process started at "
+ str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
)

logging.info("Creating 'files' folder")
pathlib.Path("./files").mkdir(parents=True, exist_ok=True)

logging.info(f"Downloading file {source_url}")
download_file(source_url, source_file)

with pd.read_csv(
source_file,
chunksize=int(chunk_size),
) as reader:
for chunk_number, chunk in enumerate(reader):
logging.info(f"Processing batch {chunk_number}")
target_file_batch = str(target_file).replace(
".csv", "-" + str(chunk_number) + ".csv"
)
            # Work on the current chunk as a standalone DataFrame
            df = chunk.copy()

logging.info(f"Transforming {source_file} ...")

            logging.info("Transform: Renaming columns..")
            rename_headers(df)

            logging.info("Transform: Converting date formats..")
            convert_values(df)

            logging.info("Transform: Removing rows with null keys..")
            filter_null_rows(df)

            logging.info("Transform: Converting id columns to integer strings..")
            convert_values_to_integer_string(df)

            logging.info("Transform: Removing NaN values from coordinate columns..")
            removing_nan_values(df)

logging.info("Transform: Reordering headers..")
df = df[
[
"unique_key",
"case_number",
"date",
"block",
"iucr",
"primary_type",
"description",
"location_description",
"arrest",
"domestic",
"beat",
"district",
"ward",
"community_area",
"fbi_code",
"x_coordinate",
"y_coordinate",
"year",
"updated_on",
"latitude",
"longitude",
"location",
]
]

process_chunk(df, target_file_batch)

logging.info(f"Appending batch {chunk_number} to {target_file}")
if chunk_number == 0:
subprocess.run(["cp", target_file_batch, target_file])
else:
subprocess.check_call(f"sed -i '1d' {target_file_batch}", shell=True)
subprocess.check_call(
f"cat {target_file_batch} >> {target_file}", shell=True
)
subprocess.run(["rm", target_file_batch])

logging.info(
f"Uploading output file to.. gs://{target_gcs_bucket}/{target_gcs_path}"
)
upload_file_to_gcs(target_file, target_gcs_bucket, target_gcs_path)

    logging.info(
        "Chicago Crime process completed at "
        + str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    )


def process_chunk(df: pd.DataFrame, target_file_batch: str) -> None:

logging.info(f"Saving to output file.. {target_file_batch}")
try:
save_to_new_file(df, file_path=str(target_file_batch))
except Exception as e:
logging.error(f"Error saving output file: {e}.")
logging.info("..Done!")


def resolve_nan(value: typing.Union[str, float]) -> str:
    # Blank out empty values and NaNs; strip stray "None" markers
    if not value or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(value).replace("None", "")


def removing_nan_values(df: pd.DataFrame) -> None:
    cols = ["x_coordinate", "y_coordinate", "latitude", "longitude"]
    for col in cols:
        df[col] = df[col].apply(resolve_nan)


def convert_to_integer_string(value: typing.Union[str, float]) -> str:
    # Render numeric codes as integer strings; blank out empty values and NaNs
    if not value or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(int(round(float(value))))


def convert_values_to_integer_string(df: pd.DataFrame) -> None:
    cols = ["unique_key", "beat", "district", "ward", "community_area", "year"]
    for col in cols:
        df[col] = df[col].apply(convert_to_integer_string)


def rename_headers(df: pd.DataFrame) -> None:
header_names = {
"ID": "unique_key",
"Case Number": "case_number",
"Date": "date",
"Block": "block",
"IUCR": "iucr",
"Primary Type": "primary_type",
"Description": "description",
"Location Description": "location_description",
"Arrest": "arrest",
"Domestic": "domestic",
"Beat": "beat",
"District": "district",
"Ward": "ward",
"Community Area": "community_area",
"FBI Code": "fbi_code",
"X Coordinate": "x_coordinate",
"Y Coordinate": "y_coordinate",
"Year": "year",
"Updated On": "updated_on",
"Latitude": "latitude",
"Longitude": "longitude",
"Location": "location",
}

df.rename(columns=header_names, inplace=True)


def convert_dt_format(dt_str: str) -> str:
    # Source format: MM/dd/yyyy hh:mm:ss AM/PM -> target format: yyyy-MM-dd HH:mm:ss
    if not dt_str:
        return dt_str
    return datetime.datetime.strptime(dt_str, "%m/%d/%Y %I:%M:%S %p").strftime(
        "%Y-%m-%d %H:%M:%S"
    )


def convert_values(df: pd.DataFrame) -> None:
dt_cols = ["date", "updated_on"]

for dt_col in dt_cols:
df[dt_col] = df[dt_col].apply(convert_dt_format)


def filter_null_rows(df: pd.DataFrame) -> None:
    # Drop rows with an empty unique_key in place so the caller sees the result
    df.drop(df[df.unique_key == ""].index, inplace=True)


def save_to_new_file(df: pd.DataFrame, file_path: pathlib.Path) -> None:
df.to_csv(file_path, index=False)


def download_file(source_url: str, source_file: pathlib.Path) -> None:
    logging.info(f"Downloading {source_url} into {source_file}")
    r = requests.get(source_url, stream=True)
    if r.status_code == 200:
        with open(source_file, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    else:
        logging.error(f"Couldn't download {source_url}: {r.text}")


def upload_file_to_gcs(file_path: pathlib.Path, gcs_bucket: str, gcs_path: str) -> None:
storage_client = storage.Client()
bucket = storage_client.bucket(gcs_bucket)
blob = bucket.blob(gcs_path)
blob.upload_from_filename(file_path)


if __name__ == "__main__":
logging.getLogger().setLevel(logging.INFO)

main(
source_url=os.environ["SOURCE_URL"],
source_file=pathlib.Path(os.environ["SOURCE_FILE"]).expanduser(),
target_file=pathlib.Path(os.environ["TARGET_FILE"]).expanduser(),
target_gcs_bucket=os.environ["TARGET_GCS_BUCKET"],
target_gcs_path=os.environ["TARGET_GCS_PATH"],
chunk_size=os.environ["CHUNK_SIZE"],
)
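As a standalone illustration of the timestamp rewrite that convert_dt_format performs (the input value below is made up for the example):

# Illustrative snippet, independent of the pipeline: 12-hour source timestamps
# are rewritten as 24-hour ISO-style strings.
import datetime

dt_str = "01/15/2021 03:30:00 PM"
converted = datetime.datetime.strptime(dt_str, "%m/%d/%Y %I:%M:%S %p").strftime(
    "%Y-%m-%d %H:%M:%S"
)
print(converted)  # 2021-01-15 15:30:00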
3 changes: 3 additions & 0 deletions datasets/chicago_crime/_images/run_csv_transform_kub/requirements.txt
@@ -0,0 +1,3 @@
requests
pandas
google-cloud-storage
26 changes: 26 additions & 0 deletions datasets/chicago_crime/_terraform/chicago_crime_dataset.tf
@@ -0,0 +1,26 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


resource "google_bigquery_dataset" "chicago_crime" {
dataset_id = "chicago_crime"
project = var.project_id
description = "This dataset reflects reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present, minus the most recent seven days. Data is extracted from the Chicago Police Department\u0027s CLEAR (Citizen Law Enforcement Analysis and Reporting) system. In order to protect the privacy of crime victims, addresses are shown at the block level only and specific locations are not identified. This data includes unverified reports supplied to the Police Department. The preliminary crime classifications may be changed at a later date based upon additional investigation and there is always the possibility of mechanical or human error. Therefore, the Chicago Police Department does not guarantee (either expressed or implied) the accuracy, completeness, timeliness, or correct sequencing of the information and the information should not be used for comparison purposes over time.\n\nDataset Source: City of Chicago\n\nCategory: Chicago, Public Safety\n\nUse: This dataset is publicly available for anyone to use under the following terms provided by the Dataset Source \u2014https://data.cityofchicago.org \u2014 and is provided \"AS IS\" without any warranty, express or implied, from Google. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset.\n\nUpdate Frequency: Daily"
}

output "bigquery_dataset-chicago_crime-dataset_id" {
value = google_bigquery_dataset.chicago_crime.dataset_id
}
39 changes: 39 additions & 0 deletions datasets/chicago_crime/_terraform/crime_pipeline.tf
@@ -0,0 +1,39 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


resource "google_bigquery_table" "crime" {
project = var.project_id
dataset_id = "chicago_crime"
table_id = "crime"

  description = "Chicago Crime dataset"

  depends_on = [
google_bigquery_dataset.chicago_crime
]
}

output "bigquery_table-crime-table_id" {
value = google_bigquery_table.crime.table_id
}

output "bigquery_table-crime-id" {
value = google_bigquery_table.crime.id
}
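Once terraform apply has run, the provisioned table can be sanity-checked from Python with the google-cloud-bigquery client; a sketch assuming a placeholder project ID:

# Hypothetical verification that the Terraform-provisioned table exists.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")  # placeholder project ID
table = client.get_table("my-project.chicago_crime.crime")
print(table.full_table_id, table.num_rows)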
28 changes: 28 additions & 0 deletions datasets/chicago_crime/_terraform/provider.tf
@@ -0,0 +1,28 @@
/**
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


provider "google" {
project = var.project_id
impersonate_service_account = var.impersonating_acct
region = var.region
}

data "google_client_openid_userinfo" "me" {}

output "impersonating-account" {
value = data.google_client_openid_userinfo.me.email
}
