
Commit

Merge pull request #386 from ror-community/staging
Merge staging to prod: CSV bulk update
lizkrznarich committed Apr 1, 2024
2 parents db2a6d0 + 8d14fc8 commit fb9c2a1
Showing 28 changed files with 1,676 additions and 98 deletions.
14 changes: 13 additions & 1 deletion .github/workflows/dev.yml
@@ -35,6 +35,18 @@ jobs:
uses: actions/checkout@v2
with:
path: ror-api
- name: Checkout ror-data-test
uses: actions/checkout@v2
with:
repository: ror-community/ror-data-test
token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
path: ror-data-test
- name: Get last data dump name
working-directory: ./ror-data-test
run: |
FILE="$(ls -Art *.zip | tail -n 1)"
echo ${FILE%.*}
echo "LATEST_DUMP_FILE=${FILE%.*}" >> $GITHUB_ENV
- name: Cache dependency
uses: actions/cache@v2
with:
@@ -57,7 +69,7 @@ jobs:
- name: Setup
working-directory: ./ror-api
run: |
-python manage.py setup v1.35-2023-10-26-ror-data -t
+python manage.py setup v1.42-2024-02-21-ror-data -t
# Dump file temp hard coded for v2 beta
# Pulled from ror-data-test per settings.py config
- name: Test
2 changes: 1 addition & 1 deletion Dockerfile
@@ -16,7 +16,7 @@ RUN mv /etc/apt/sources.list.d /etc/apt/sources.list.d.bak && \
mv /etc/apt/sources.list.d.bak /etc/apt/sources.list.d && \
apt-get upgrade -y -o Dpkg::Options::="--force-confold" && \
apt-get clean && \
-apt-get install ntp wget unzip tzdata python3-pip -y && \
+apt-get install ntp wget unzip tzdata python3-pip libmagic1 -y && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Enable Passenger and Nginx and remove the default site
168 changes: 166 additions & 2 deletions README.md

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions requirements.txt
@@ -20,5 +20,8 @@ boto3
pandas==1.4.1
numpy==1.22
titlecase==2.3
-update_address @ git+https://github.com/ror-community/update_address.git
-launchdarkly-server-sdk
+update_address @ git+https://github.com/ror-community/update_address.git@v2-locations
+launchdarkly-server-sdk==7.6.1
+jsonschema==3.2.0
+python-magic
+iso639-lang
96 changes: 96 additions & 0 deletions rorapi/common/create_update.py
@@ -0,0 +1,96 @@
import copy
from datetime import datetime
from rorapi.common.record_utils import *
import update_address as ua
from rorapi.v2.record_constants import *
from rorapi.v2.serializers import (
OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.management.commands.generaterorid import check_ror_id

V2_SCHEMA = get_file_from_url("https://raw.githubusercontent.com/ror-community/ror-schema/schema-v2/ror_schema_v2_0.json")


def update_record(json_input, existing_record):
record = copy.deepcopy(existing_record)
for k, v in json_input.items():
record[k] = copy.deepcopy(v)
return update_last_mod(record)

def update_last_mod(record):
record['admin']['last_modified'] = copy.deepcopy(V2_LAST_MOD)
record['admin']['last_modified']['date'] = datetime.now().strftime("%Y-%m-%d")
return record

def check_optional_fields(record):
for k in V2_OPTIONAL_FIELD_DEFAULTS:
if k not in record:
return True
return False

def add_missing_optional_fields(record):
for k, v in V2_OPTIONAL_FIELD_DEFAULTS.items():
if k not in record:
record[k] = v
return record

def add_created_last_mod(record):
today = datetime.now().strftime("%Y-%m-%d")
record['admin'] = copy.deepcopy(V2_ADMIN)
record['admin']['created']['date'] = today
record['admin']['last_modified']['date'] = today
return record

def update_locations(locations):
error = None
updated_locations = []
for location in locations:
if 'geonames_id' in location:
try:
print(location['geonames_id'])
updated_location = ua.new_geonames_v2(str(location['geonames_id']))
updated_locations.append(updated_location['location'])
except:
error = "Error retrieving Geonames data for ID {}. Please check that this is a valid Geonames ID".format(location['geonames_id'])
return error, updated_locations

def sort_list_fields(v2_record):
for field in v2_record:
if field in V2_SORT_KEYS:
if V2_SORT_KEYS[field] is not None:
sort_key = V2_SORT_KEYS[field]
sorted_vals = sorted(v2_record[field], key=lambda x: x[sort_key])
else:
sorted_vals = sorted(v2_record[field])
v2_record[field] = sorted_vals
return v2_record


def new_record_from_json(json_input, version):
error = None
valid_data = None
new_record = copy.deepcopy(json_input)
if check_optional_fields(new_record):
new_record = add_missing_optional_fields(new_record)
error, updated_locations = update_locations(new_record['locations'])
if not error:
new_record['locations'] = updated_locations
new_record = add_created_last_mod(new_record)
new_ror_id = check_ror_id(version)
print("new ror id: " + new_ror_id)
new_record['id'] = new_ror_id
error, valid_data = validate_record(sort_list_fields(new_record), V2_SCHEMA)
return error, valid_data


def update_record_from_json(new_json, existing_org):
error = None
valid_data = None
serializer = OrganizationSerializerV2(existing_org)
existing_record = serializer.data
updated_record = update_record(new_json, existing_record)
error, updated_locations = update_locations(updated_record['locations'])
if not error:
updated_record['locations'] = updated_locations
error, valid_data = validate_record(sort_list_fields(updated_record), V2_SCHEMA)
return error, valid_data
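
For orientation, a minimal sketch of how the update helpers above compose, assuming the rorapi app and the constants it imports from rorapi.v2.record_constants are importable (note that importing this module also fetches the v2 schema over the network). The record fragment below is hypothetical and far from a complete v2 record:

# Illustrative sketch only, not part of the commit.
from rorapi.common.create_update import update_record, add_missing_optional_fields

existing = {
    "admin": {
        "created": {"date": "2018-11-14", "schema_version": "2.0"},
        "last_modified": {"date": "2023-10-26", "schema_version": "2.0"},
    },
    "status": "active",
}
incoming = {"status": "inactive"}

updated = update_record(incoming, existing)      # overwrite top-level keys, then stamp last_modified with today's date
updated = add_missing_optional_fields(updated)   # backfill any missing optional fields with their v2 defaults
print(updated["status"], updated["admin"]["last_modified"]["date"])
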
118 changes: 118 additions & 0 deletions rorapi/common/csv_bulk.py
@@ -0,0 +1,118 @@
import csv
import json
import io
import os
import shutil
import urllib
from datetime import datetime
from rest_framework.renderers import JSONRenderer
from rorapi.settings import DATA
from rorapi.v2.serializers import (
OrganizationSerializer as OrganizationSerializerV2
)
from rorapi.common.csv_update import update_record_from_csv
from rorapi.common.csv_create import new_record_from_csv


def save_record_file(ror_id, updated, json_obj, dir_name):
dir_path = os.path.join(DATA['DIR'],dir_name)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
subdir = 'updates' if updated else 'new'
if not os.path.exists(os.path.join(dir_path, subdir)):
os.mkdir(os.path.join(dir_path, subdir))
full_path = os.path.join(dir_path, subdir, ror_id.split('https://ror.org/')[1] + '.json')
with open(full_path, "w") as outfile:
json.dump(json_obj, outfile, ensure_ascii=False, indent=2)

def save_report_file(report, report_fields, csv_file, dir_name, validate_only):
dir_path = os.path.join(DATA['DIR'],dir_name)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
filepath = os.path.join(dir_path, 'report.csv')
with open(filepath, 'w') as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=report_fields)
writer.writeheader()
writer.writerows(report)
if not validate_only:
# save copy of input file
filepath = os.path.join(dir_path, 'input.csv')
csv_file.seek(0)
with open(filepath, 'wb+') as f:
for chunk in csv_file.chunks():
f.write(chunk)

def process_csv(csv_file, version, validate_only):
print("Processing CSV")
dir_name = datetime.now().strftime("%Y-%m-%d_%H_%M_%S") + "-ror-records"
success_msg = None
error = None
report = []
report_fields = ['row', 'ror_id', 'action', 'errors']
skipped_count = 0
updated_count = 0
new_count = 0
read_file = csv_file.read().decode('utf-8')
print(read_file)
reader = csv.DictReader(io.StringIO(read_file))
row_num = 2
for row in reader:
ror_id = None
updated = False
print("Row data")
print(row)
if row['id']:
ror_id = row['id']
updated = True
row_errors, v2_record = update_record_from_csv(row, version)
else:
row_errors, v2_record = new_record_from_csv(row, version)
if not row_errors:
if updated:
action = 'updated'
updated_count += 1
else:
action = 'created'
new_count += 1
ror_id = v2_record['id']
serializer = OrganizationSerializerV2(v2_record)
json_obj = json.loads(JSONRenderer().render(serializer.data))
print(json_obj)
if not validate_only:
#create file
file = save_record_file(ror_id, updated, json_obj, dir_name)
else:
action = 'skipped'
skipped_count += 1
if validate_only and action == 'created':
ror_id = None
report.append({"row": row_num, "ror_id": ror_id if ror_id else '', "action": action, "errors": "; ".join(row_errors) if row_errors else ''})
row_num += 1
if new_count > 0 or updated_count > 0 or skipped_count > 0:
try:
if validate_only:
try:
save_report_file(report, report_fields, csv_file, dir_name, validate_only)
success_msg = os.path.join(DATA['DIR'], dir_name, 'report.csv')
except Exception as e:
error = f"Error creating validation report: {e}"
else:
#create report file
save_report_file(report, report_fields, csv_file, dir_name, validate_only)
# create zip file
zipfile = shutil.make_archive(os.path.join(DATA['DIR'], dir_name), 'zip', DATA['DIR'], dir_name)
# upload to S3
try:
DATA['CLIENT'].upload_file(zipfile, DATA['PUBLIC_STORE'], dir_name + '.zip')
zipfile = f"https://s3.eu-west-1.amazonaws.com/{DATA['PUBLIC_STORE']}/{urllib.parse.quote(dir_name)}.zip"
success_msg = {"file": zipfile,
"rows processed": new_count + updated_count + skipped_count,
"created": new_count,
"updated": updated_count,
"skipped": skipped_count}
except Exception as e:
error = f"Error uploading zipfile to S3: {e}"
except Exception as e:
error = f"Unexpected error generating records: {e}"

return error, success_msg
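
A minimal sketch of driving the bulk processor directly, assuming a configured Django environment; the "v2" version string and the template file name are assumptions, and the CSV must carry the full column set that the row handlers in csv_create.py and csv_update.py read:

# Illustrative sketch only, not part of the commit.
from django.core.files.uploadedfile import SimpleUploadedFile
from rorapi.common.csv_bulk import process_csv

# "bulk_template.csv" is a placeholder for a CSV that includes every column the
# row handlers read (id, domains, established, external_ids.*, links.*,
# locations.geonames_id, names.types.*, status, types).
with open("bulk_template.csv", "rb") as f:
    upload = SimpleUploadedFile("bulk_template.csv", f.read(), content_type="text/csv")

error, result = process_csv(upload, "v2", validate_only=True)
print(error or result)  # in validate-only mode, result is the path to report.csv
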
110 changes: 110 additions & 0 deletions rorapi/common/csv_create.py
@@ -0,0 +1,110 @@
import copy
from rorapi.common.record_utils import *
from rorapi.common.csv_utils import *
from rorapi.v2.record_constants import *
from rorapi.common.serializers import ErrorsSerializer
from rorapi.common.create_update import new_record_from_json


def new_record_from_csv(csv_data, version):
v2_data = copy.deepcopy(V2_TEMPLATE)
errors = []
#domains
if csv_data['domains']:
v2_data['domains'] = [d.strip() for d in csv_data['domains'].strip(';').split(';')]

#established
if csv_data['established']:
v2_data['established'] = int(csv_data['established'].strip())

#external ids
for k,v in V2_EXTERNAL_ID_TYPES.items():
if csv_data['external_ids.type.' + v + '.all']:
all_ids = [i.strip() for i in csv_data['external_ids.type.' + v + '.all'].strip(';').split(';')]
ext_id_obj = {
"type": v,
"all": all_ids,
"preferred": csv_data['external_ids.type.' + v + '.preferred'].strip() if csv_data['external_ids.type.' + v + '.preferred'] else all_ids[0]
}
v2_data['external_ids'].append(ext_id_obj)

#links
for k,v in V2_LINK_TYPES.items():
if csv_data['links.type.' + v]:
for l in csv_data['links.type.' + v].strip(';').split(';'):
link_obj = {
"type": v,
"value": l.strip()
}
v2_data['links'].append(link_obj)

#locations
if csv_data['locations.geonames_id']:
geonames_ids = [i.strip() for i in csv_data['locations.geonames_id'].strip(';').split(';')]
for geonames_id in geonames_ids:
location_obj = {
"geonames_id": geonames_id,
"geonames_details": {}
}
v2_data['locations'].append(location_obj)

#names
temp_names = []
for k,v in V2_NAME_TYPES.items():
if csv_data['names.types.' + v]:
for n in csv_data['names.types.' + v].strip(';').split(';'):
if LANG_DELIMITER in n:
name_val, lang = n.split("*")
if lang:
lang_errors, lang_code = get_lang_code(lang.strip())
if lang_errors:
errors.append("Could not convert language value to ISO code: {}".format(lang))
else:
name_val = n
lang_code = None

name_obj = {
"types": [v],
"value": name_val.strip(),
"lang": lang_code
}
temp_names.append(name_obj)
print("temp names 1:")
print(temp_names)
name_values = [n['value'] for n in temp_names]
dup_names = []
for n in name_values:
if name_values.count(n) > 1:
if n not in dup_names:
dup_names.append(n)
if dup_names:
dup_names_objs = []
for d in dup_names:
types = []
for t in temp_names:
if t['value'] == d:
types.extend(t['types'])
name_obj = {
"types": types,
"value": d,
"lang": None
}
dup_names_objs.append(name_obj)
temp_names = [t for t in temp_names if t['value'] not in dup_names]
temp_names.extend(dup_names_objs)
print("temp names 2:")
print(temp_names)
v2_data['names'] = temp_names

#status
if csv_data['status']:
v2_data['status'] = csv_data['status'].strip().lower()

#types
if csv_data['types']:
v2_data['types'] = [t.strip().lower() for t in csv_data['types'].strip(';').split(';')]

validation_error, new_record = new_record_from_json(v2_data, version)
if validation_error:
errors.append(validation_error)
return errors, new_record
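
And a sketch of calling the row handler on its own. The column names for the external ID, link, and name type variants come from constants not shown in this diff, so defaultdict(str) stands in for the full template header; the names.types.ror_display column and the "v2" version string are assumptions, and update_locations performs a live Geonames lookup via update_address:

# Illustrative sketch only, not part of the commit.
from collections import defaultdict
from rorapi.common.csv_create import new_record_from_csv

row = defaultdict(str)   # columns not listed below resolve to "" instead of raising KeyError
row.update({
    "domains": "example.org",
    "established": "1964",
    "locations.geonames_id": "2643743",
    "names.types.ror_display": "Example Organization*en",   # value*language, per the "*" split above
    "status": "active",
    "types": "other",
})

errors, record = new_record_from_csv(row, "v2")
print(errors or record["names"])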
