/
uploadFiles.py
92 lines (73 loc) · 3.59 KB
/
uploadFiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import shutil
import tempfile
import zipfile
import subprocess
import requests
from prefect import flow
# Root of the geoBoundaries gbOpen release tree to harvest zip archives from.
GbOpen = "/sciclone/geounder/dev/geoBoundaries/database/geoBoundaries/releaseData/gbOpen/"
# Path to the Dataverse DVUploader jar used by the upload step.
DVuploader = "/sciclone/geounder/dev/geoBoundaries/scripts/geoBoundaryBot/external/DataVerse/DVUploader-v1.1.0.jar"
# SECURITY: an API token was committed in source here. Prefer the environment
# variable; the committed value is kept only as a fallback so existing
# deployments keep working — it should be rotated and removed.
api_token = os.environ.get(
    "DATAVERSE_API_TOKEN", "2ed5ef8b-6f89-46bb-b140-a1c6914020ff"
)
# Harvard Dataverse numeric dataset id targeted by the upload.
dataset_id = 7242413  # previously tried: 7152306, 35614814
@flow(flow_run_name="gbOpenDV", log_prints=True)
def uploadFilesDV():
    """Collect gbOpen release zips, repackage them into size-capped chunk
    archives, and upload each chunk to the Harvard Dataverse.

    Side effects: stages copies in a temporary directory (removed afterwards),
    writes ``GbopenN.zip`` files into the current working directory, and shells
    out to the DVUploader jar once per chunk.

    Raises:
        requests.HTTPError: if the dataset endpoint is not reachable.
    """
    # Probe the dataset endpoint first so we fail fast before any heavy
    # copying/zipping work if credentials or the dataset id are wrong.
    files_url = f"https://dataverse.harvard.edu/api/datasets/{dataset_id}"
    headers = {
        "Content-Type": "application/json",
        "X-Dataverse-key": api_token,
    }
    response = requests.get(files_url, headers=headers)
    response.raise_for_status()  # original ignored HTTP errors entirely
    files_data = response.json()  # NOTE(review): unused beyond the connectivity check

    staging_dir = tempfile.mkdtemp()
    try:
        _stage_boundary_zips(staging_dir)
        chunk_count = _build_chunks(staging_dir)
    finally:
        # Original leaked the mkdtemp() directory; the staged copies are not
        # needed once the chunk archives exist.
        shutil.rmtree(staging_dir, ignore_errors=True)
    print(f"Created {chunk_count} zip files.")
    _upload_chunks(chunk_count)


def _stage_boundary_zips(staging_dir):
    """Copy the first .zip found in each ADM0..ADM5 subdirectory of every
    boundary directory under GbOpen into staging_dir."""
    for directory in os.listdir(GbOpen):
        directory_path = os.path.join(GbOpen, directory)
        if not os.path.isdir(directory_path):
            continue
        for subdir in ["ADM0", "ADM1", "ADM2", "ADM3", "ADM4", "ADM5"]:
            subdir_path = os.path.join(directory_path, subdir)
            if not os.path.isdir(subdir_path):
                continue
            for file in os.listdir(subdir_path):
                if file.endswith(".zip"):
                    zip_file_path = os.path.join(subdir_path, file)
                    shutil.copy2(zip_file_path, staging_dir)
                    print(f"Copied {zip_file_path} to {staging_dir}")
                    break  # only the first zip per ADM level, as before


def _build_chunks(staging_dir, max_size=2 * 1024 * 1024 * 1024):
    """Bundle every file under staging_dir into ``GbopenN.zip`` archives of at
    most ``max_size`` bytes each, written to the CWD.

    Returns:
        int: the number of chunk archives created.
    """
    chunk_num = 0
    zip_file = None
    zip_size = 0
    for root, dirs, files in os.walk(staging_dir):
        for file in sorted(files):  # deterministic chunk contents
            file_path = os.path.join(root, file)
            file_size = os.path.getsize(file_path)
            # Open a new chunk lazily (original could emit empty zips per walk
            # root) and whenever adding this file would overflow max_size.
            if zip_file is None or zip_size + file_size > max_size:
                if zip_file is not None:
                    zip_file.close()
                chunk_num += 1
                zip_file = zipfile.ZipFile(
                    f"Gbopen{chunk_num}.zip",
                    mode="w",
                    compression=zipfile.ZIP_DEFLATED,
                )
                zip_size = 0
            zip_file.write(file_path, arcname=file)
            zip_size += file_size
    if zip_file is not None:
        zip_file.close()
    return chunk_num


def _upload_chunks(chunk_count):
    """Push each ``GbopenN.zip`` chunk to Dataverse via the DVUploader jar."""
    base_dir = "/sciclone/geounder/dev/geoBoundaries/scripts/geoBoundaryBot/external/DataVerse"
    # Original hard-coded range(1, 6); use the actual chunk count instead.
    for i in range(1, chunk_count + 1):
        filename = f"Gbopen{i}.zip"
        # assumes the chunk zips are placed under base_dir before upload —
        # TODO confirm the working directory the flow runs from.
        chunk_path = os.path.join(base_dir, filename)
        # Argument list with shell=False replaces the original shell string
        # (which also duplicated the API key as a literal).
        command = [
            "java",
            "-jar",
            DVuploader,
            f"-key={api_token}",
            "-did=doi:10.7910/DVN/WEWGNR",
            "-server=https://dataverse.harvard.edu",
            chunk_path,
        ]
        subprocess.run(command)
uploadFilesDV()