Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bug CORE-4089: OneDrive partitioning fails - datetime formatting error #2638

Merged
merged 15 commits into from Mar 15, 2024
Merged
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
## 0.12.7-dev0
## 0.12.7-dev1

### Enhancements

Expand All @@ -7,6 +7,7 @@
### Fixes

* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
* **Fix OneDrive dates with inconsistent formatting**. Adds logic to conditionally support dates returned by office365 that may vary in formatting or may be a datetime object rather than a string. See the previous fix for SharePoint.

## 0.12.6

Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/astra.txt
Expand Up @@ -8,7 +8,7 @@ anyio==3.7.1
# via
# -c ingest/../constraints.in
# httpx
astrapy==0.7.6
astrapy==0.7.7
# via -r ingest/astra.in
cassandra-driver==3.29.0
# via cassio
Expand Down
Expand Up @@ -3,8 +3,8 @@
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -24,8 +24,8 @@
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -45,8 +45,8 @@
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -66,8 +66,8 @@
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -87,8 +87,8 @@
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -108,8 +108,8 @@
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:09",
"date_modified": "2023-08-24T03:00:09",
"date_created": "2023-08-24T03:00:09+00:00",
"date_modified": "2023-08-24T03:00:09+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand Down
Expand Up @@ -3,8 +3,8 @@
"element_id": "1df8eeb8be847c3a1a7411e3be3e0396",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -24,8 +24,8 @@
"element_id": "a9d4657034aa3fdb5177f1325e912362",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -45,8 +45,8 @@
"element_id": "9c218520320f238595f1fde74bdd137d",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -66,8 +66,8 @@
"element_id": "39a3ae572581d0f1fe7511fd7b3aa414",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -87,8 +87,8 @@
"element_id": "fc1adcb8eaceac694e500a103f9f698f",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -108,8 +108,8 @@
"element_id": "0b61e826b1c4ab05750184da72b89f83",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:27",
"date_modified": "2023-08-24T03:00:27",
"date_created": "2023-08-24T03:00:27+00:00",
"date_modified": "2023-08-24T03:00:27+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/nested/fake-text.txt",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand Down
Expand Up @@ -3,8 +3,8 @@
"element_id": "a5c9668a6055bca2865ea5e6d16ea1e0",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -26,8 +26,8 @@
"element_id": "1d34c23ff08573afa07b42842b41277a",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -49,8 +49,8 @@
"element_id": "05440c6ca94cb55f6d185d8bd92ce9d6",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -72,8 +72,8 @@
"element_id": "e39c724f1b09a4c3286b6368538e05fc",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -95,8 +95,8 @@
"element_id": "1d34c23ff08573afa07b42842b41277a",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -118,8 +118,8 @@
"element_id": "85ada878f2345c23b8a74a931d2e20a4",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -141,8 +141,8 @@
"element_id": "0e570ca6fabe24f94e52c1833f3ffd25",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -164,8 +164,8 @@
"element_id": "4cf4ff5597274d0c1ce8ae5a17ead4df",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -187,8 +187,8 @@
"element_id": "dd167905de0defcaf72de673ee44c074",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -210,8 +210,8 @@
"element_id": "5f9d7b40d332fef76efdd0a97bcb8617",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -233,8 +233,8 @@
"element_id": "2b5c3d26721ae9c350cf3009318b626f",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -256,8 +256,8 @@
"element_id": "53d2273ac70fc31640cc45af840dbd42",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -279,8 +279,8 @@
"element_id": "4efca0d10c5feb8e9b35eb1d994f2905",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand All @@ -302,8 +302,8 @@
"element_id": "4c9720f1540cc84d33e30e09aca8c077",
"metadata": {
"data_source": {
"date_created": "2023-08-24T03:00:43",
"date_modified": "2023-08-24T03:00:43",
"date_created": "2023-08-24T03:00:43+00:00",
"date_modified": "2023-08-24T03:00:43+00:00",
"record_locator": {
"server_relative_path": "utic-test-ingest-fixtures/tests-example.xls",
"user_pname": "devops@unstructuredio.onmicrosoft.com"
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured_ingest/unit/test_utils.py
Expand Up @@ -4,7 +4,7 @@

from unstructured.ingest.cli.utils import extract_config
from unstructured.ingest.interfaces import BaseConfig
from unstructured.ingest.utils.string_utils import json_to_dict
from unstructured.ingest.utils.string_and_date_utils import json_to_dict


@dataclass
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.12.7-dev0" # pragma: no cover
__version__ = "0.12.7-dev1" # pragma: no cover
7 changes: 6 additions & 1 deletion unstructured/ingest/connector/astra.py
Expand Up @@ -2,6 +2,8 @@
import typing as t
from dataclasses import dataclass, field

from unstructured import __name__ as integration_name
from unstructured.__version__ import __version__ as integration_version
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.enhanced_dataclass.core import _asdict
from unstructured.ingest.error import DestinationConnectionError, SourceConnectionNetworkError
Expand Down Expand Up @@ -67,10 +69,13 @@ def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None:
from astrapy.db import AstraDB

# Build the Astra DB object
# Build the Astra DB object.
# caller_name/version for AstraDB tracking
self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token,
caller_name=integration_name,
caller_version=integration_version,
Comment on lines +77 to +78
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are Astra changes relevant to our PR?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If they got in unexpectedly, let's remove them; if they are intentional, let's add an explanation to the PR description and changelog.

)

# Create and connect to the newly created collection
Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/connector/fsspec/gcs.py
Expand Up @@ -13,7 +13,7 @@
from unstructured.ingest.enhanced_dataclass import enhanced_field
from unstructured.ingest.error import SourceConnectionError
from unstructured.ingest.interfaces import AccessConfig
from unstructured.ingest.utils.string_utils import json_to_dict
from unstructured.ingest.utils.string_and_date_utils import json_to_dict
from unstructured.utils import requires_dependencies


Expand Down
2 changes: 1 addition & 1 deletion unstructured/ingest/connector/google_drive.py
Expand Up @@ -24,7 +24,7 @@
SourceMetadata,
)
from unstructured.ingest.logger import logger
from unstructured.ingest.utils.string_utils import json_to_dict
from unstructured.ingest.utils.string_and_date_utils import json_to_dict
from unstructured.utils import requires_dependencies

if t.TYPE_CHECKING:
Expand Down
10 changes: 3 additions & 7 deletions unstructured/ingest/connector/onedrive.py
@@ -1,6 +1,5 @@
import typing as t
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path

from unstructured.file_utils.filetype import EXT_TO_FILETYPE
Expand All @@ -16,12 +15,12 @@
SourceMetadata,
)
from unstructured.ingest.logger import logger
from unstructured.ingest.utils.string_and_date_utils import ensure_isoformat_datetime
from unstructured.utils import requires_dependencies

if t.TYPE_CHECKING:
from office365.graph_client import GraphClient
from office365.onedrive.driveitems.driveItem import DriveItem

MAX_MB_SIZE = 512_000_000


Expand Down Expand Up @@ -144,11 +143,8 @@ def update_source_metadata(self, **kwargs):
version = file.versions[n_versions - 1].properties.get("id", None)

self.source_metadata = SourceMetadata(
date_created=datetime.strptime(file.created_datetime, "%Y-%m-%dT%H:%M:%SZ").isoformat(),
date_modified=datetime.strptime(
file.last_modified_datetime,
"%Y-%m-%dT%H:%M:%SZ",
).isoformat(),
date_created=ensure_isoformat_datetime(timestamp=file.created_datetime),
date_modified=ensure_isoformat_datetime(timestamp=file.last_modified_datetime),
version=version,
source_url=file.parent_reference.path + "/" + self.file_name,
exists=True,
Expand Down