Skip to content

Commit

Permalink
fix: Delete temp GCS objects generated by gsutil's parallel composite…
Browse files Browse the repository at this point in the history
… upload for `geos_fp` dataset (#195)

* fix: Delete temporary files generated by gsutil's parallel composite uploads

* fix: fix type errors

* fix: Add node pool affinities for the DAG tasks
  • Loading branch information
adlersantos committed Sep 24, 2021
1 parent 1ea15a0 commit f307cce
Show file tree
Hide file tree
Showing 3 changed files with 230 additions and 2 deletions.
24 changes: 22 additions & 2 deletions datasets/geos_fp/_images/rolling_copy/script.py
Expand Up @@ -52,7 +52,7 @@ def main(
)


def _date_prefix(dt: date) -> typing.List[str]:
def _date_prefix(dt: date) -> str:
# Generates URL paths to folders containing the .nc4 files, for example
# https://portal.nccs.nasa.gov/datashare/gmao/geos-fp/das/Y2021/M01/D01/
# => Y2021/M01/D01
Expand Down Expand Up @@ -154,6 +154,7 @@ def move_dir_contents_to_gcs(
f"gs://{target_bucket}/{date_prefix}",
]
)
delete_temp_pcu_objects(target_bucket)
delete_dir_contents(dir_ / date_prefix)


Expand All @@ -164,8 +165,27 @@ def delete_dir_contents(dir_to_delete: pathlib.Path) -> None:
[f.unlink() for f in dir_to_delete.glob("*") if f.is_file()]


def delete_temp_pcu_objects(target_bucket: str) -> None:
    """Delete temp GCS objects created by gsutil's parallel composite uploads.

    Lists the top level of ``gs://{target_bucket}`` and recursively removes
    every entry whose name does not start with ``"Y"``. Entries starting with
    ``"Y"`` are kept (uploads in this script go under date prefixes such as
    ``Y2021/M01/D01``).

    See https://cloud.google.com/storage/docs/uploads-downloads#gsutil-pcu

    Args:
        target_bucket: Name of the bucket, without the ``gs://`` scheme.

    Raises:
        subprocess.CalledProcessError: If the ``gsutil ls`` listing or any
            ``gsutil rm`` call exits with a non-zero status.
    """
    bucket_prefix = f"gs://{target_bucket}/"
    # check=True: a failed listing must not be mistaken for "nothing to clean".
    res = subprocess.run(
        ["gsutil", "ls", f"gs://{target_bucket}"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        check=True,
    )
    # One URI per line; splitting on arbitrary whitespace would break object
    # names that contain spaces.
    for line in res.stdout.splitlines():
        uri = line.strip()
        if not uri.startswith(bucket_prefix):
            continue
        # Strip only the leading "gs://<bucket>/" so that a bucket name that
        # reappears later in the path cannot truncate the object name.
        object_name = uri[len(bucket_prefix):]
        # Never issue `rm -r` against the bare bucket root.
        if not object_name or object_name.startswith("Y"):
            continue
        subprocess.check_call(
            ["gsutil", "rm", "-r", f"gs://{target_bucket}/{object_name}"],
        )


def update_manifest_file(
paths: typing.Set[str],
paths: typing.List[str],
download_dir: pathlib.Path,
target_bucket: str,
date_prefix: str,
Expand Down
Expand Up @@ -37,6 +37,23 @@
task_id="copy_files_dated_today",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -58,6 +75,23 @@
task_id="copy_files_dated_today_minus_1_day",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -79,6 +113,23 @@
task_id="copy_files_dated_today_minus_2_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -100,6 +151,23 @@
task_id="copy_files_dated_today_minus_3_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -121,6 +189,23 @@
task_id="copy_files_dated_today_minus_4_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -142,6 +227,23 @@
task_id="copy_files_dated_today_minus_5_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -163,6 +265,23 @@
task_id="copy_files_dated_today_minus_6_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand All @@ -184,6 +303,23 @@
task_id="copy_files_dated_today_minus_7_days",
name="geosfp",
namespace="default",
affinity={
"nodeAffinity": {
"requiredDuringSchedulingIgnoredDuringExecution": {
"nodeSelectorTerms": [
{
"matchExpressions": [
{
"key": "cloud.google.com/gke-nodepool",
"operator": "In",
"values": ["pool-e2-standard-4"],
}
]
}
]
}
}
},
image="{{ var.json.geos_fp.container_registry.rolling_copy }}",
image_pull_policy="Always",
env_vars={
Expand Down
72 changes: 72 additions & 0 deletions datasets/geos_fp/copy_files_rolling_basis/pipeline.yaml
Expand Up @@ -35,6 +35,15 @@ dag:
task_id: "copy_files_dated_today"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -57,6 +66,15 @@ dag:
task_id: "copy_files_dated_today_minus_1_day"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -79,6 +97,15 @@ dag:
task_id: "copy_files_dated_today_minus_2_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -101,6 +128,15 @@ dag:
task_id: "copy_files_dated_today_minus_3_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -123,6 +159,15 @@ dag:
task_id: "copy_files_dated_today_minus_4_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -145,6 +190,15 @@ dag:
task_id: "copy_files_dated_today_minus_5_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -167,6 +221,15 @@ dag:
task_id: "copy_files_dated_today_minus_6_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand All @@ -189,6 +252,15 @@ dag:
task_id: "copy_files_dated_today_minus_7_days"
name: "geosfp"
namespace: "default"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: cloud.google.com/gke-nodepool
operator: In
values:
- "pool-e2-standard-4"
image: "{{ var.json.geos_fp.container_registry.rolling_copy }}"
image_pull_policy: "Always"
env_vars:
Expand Down

0 comments on commit f307cce

Please sign in to comment.