
feat: add vertexai embeddings #2693

Merged
merged 38 commits into main from ahmet/vertex-embed on Mar 28, 2024
Changes from 11 commits
Commits
38 commits
381ccac
build(deps): bump version for security patches
MthwRobinson Mar 15, 2024
18d3817
Merge branch 'main' into deps/security-bump
MthwRobinson Mar 15, 2024
27c1cab
pin unstructured client
MthwRobinson Mar 15, 2024
04e2058
Merge branch 'deps/security-bump' of github.com:Unstructured-IO/unstr…
MthwRobinson Mar 15, 2024
a2a1649
add vertexai with testing on pinecone
ahmetmeleq Mar 26, 2024
ce53a12
Merge branch 'main' into ahmet/vertex-embed
ahmetmeleq Mar 26, 2024
d472223
make tidy
ahmetmeleq Mar 26, 2024
e706f54
Merge branch 'ahmet/vertex-embed' of https://github.com/Unstructured-…
ahmetmeleq Mar 26, 2024
0ac052c
fix import on unit test
ahmetmeleq Mar 26, 2024
f5a6cdc
Merge branch 'deps/security-bump' into ahmet/vertex-embed
ahmetmeleq Mar 26, 2024
e09156f
requirements update, pip compile for embed modules
ahmetmeleq Mar 26, 2024
43bcdfc
add vertexai integration test
ahmetmeleq Mar 26, 2024
66f1a2c
add octoai test
ahmetmeleq Mar 26, 2024
dbaaf6f
dependency updates for embedding modules
ahmetmeleq Mar 26, 2024
7b2368b
shellcheck
ahmetmeleq Mar 26, 2024
3f7c20c
shfmt
ahmetmeleq Mar 26, 2024
9208ea8
Revert "Merge branch 'deps/security-bump' into ahmet/vertex-embed"
ahmetmeleq Mar 26, 2024
5ec18de
fix typo, fix extra name
ahmetmeleq Mar 26, 2024
827eba3
update test-ingest-src
ahmetmeleq Mar 27, 2024
658de79
change extra name for octoai
ahmetmeleq Mar 27, 2024
95a619e
parametrized api-key
ahmetmeleq Mar 27, 2024
86cc766
Merge branch 'main' into ahmet/vertex-embed
ahmetmeleq Mar 27, 2024
21c08d8
version
ahmetmeleq Mar 27, 2024
c81d3b8
update docs based on parametrized api_key
ahmetmeleq Mar 27, 2024
dffde65
testing to invalidate github cache
ahmetmeleq Mar 27, 2024
975b8f0
debugging cache
ahmetmeleq Mar 27, 2024
cd1cb7a
try healing the cache via a save without load
ahmetmeleq Mar 28, 2024
6a007cc
re-enable loads
ahmetmeleq Mar 28, 2024
3805622
add api_key to mock test
ahmetmeleq Mar 28, 2024
78c69a1
add credentials cleanup
ahmetmeleq Mar 28, 2024
2866f42
save creds to tmp rather than manual cleanup
ahmetmeleq Mar 28, 2024
dd47e56
heal cache
ahmetmeleq Mar 28, 2024
3e489ff
Revert "heal cache"
ahmetmeleq Mar 28, 2024
5b278bf
working example in examples with comments updated
ahmetmeleq Mar 28, 2024
7587fd3
working example in docs with comments updated
ahmetmeleq Mar 28, 2024
4392700
vectara fix
ahmetmeleq Mar 28, 2024
0cc52d7
Merge branch 'main' into ahmet/vertex-embed
ahmetmeleq Mar 28, 2024
b639fc1
version
ahmetmeleq Mar 28, 2024
6 changes: 3 additions & 3 deletions CHANGELOG.md
@@ -1,10 +1,11 @@
-## 0.13.0-dev11
+## 0.13.0-dev12

 ### Enhancements

 * **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
 * **Add `compound_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element level row and column index and content accuracy scores
 * **Add `.metadata.orig_elements` to chunks.** `.metadata.orig_elements: list[Element]` is added to chunks during the chunking process (when requested) to allow access to information from the elements each chunk was formed from. This is useful for example to recover metadata fields that cannot be consolidated to a single value for a chunk, like `page_number`, `coordinates`, and `image_base64`.
+* **Add Google VertexAI embedder** Adds VertexAI embeddings to support embedding via Google Vertex AI.

 ### Features

@@ -19,7 +20,6 @@
 * **Fix OneDrive dates with inconsistent formatting** Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint
 * **Adds tracking for AstraDB** Adds tracking info so AstraDB can see what source called their api.
 * **Support AWS Bedrock Embeddings in ingest CLI** The configs required to instantiate the bedrock embedding class are now exposed in the api and the version of boto being used meets the minimum requirement to introduce the bedrock runtime required to hit the service.
->>>>>>> 6a63c941c (bump changelog)

 ## 0.12.6
7 changes: 4 additions & 3 deletions docs/requirements.txt
@@ -4,7 +4,7 @@
#
# pip-compile --output-file=build.txt build.in
#
alabaster==0.7.13
alabaster==0.7.16
# via sphinx
babel==2.14.0
# via sphinx
@@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==7.0.1
importlib-metadata==7.0.2
# via sphinx
jinja2==3.1.3
# via
@@ -57,6 +57,7 @@ myst-parser==2.0.0
packaging==23.2
# via
# -c base.txt
# -c constraints.in
# sphinx
pygments==2.17.2
# via
@@ -119,5 +120,5 @@ urllib3==1.26.18
# -c base.txt
# -c constraints.in
# requests
zipp==3.17.0
zipp==3.18.1
# via importlib-metadata
47 changes: 47 additions & 0 deletions docs/source/core/embedding.rst
@@ -171,6 +171,53 @@ To obtain an api key, visit: https://octo.ai/docs/getting-started/how-to-create-
    query = "This is the query"
    query_embedding = embedding_encoder.embed_query(query=query)

    [print(e.embeddings, e) for e in elements]
    print(query_embedding, query)
    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())

``VertexAIEmbeddingEncoder``
----------------------------

The ``VertexAIEmbeddingEncoder`` class connects to GCP Vertex AI to obtain embeddings for pieces of text.

``embed_documents`` will receive a list of Elements, and return an updated list which
includes the ``embeddings`` attribute for each Element.

``embed_query`` will receive a query as a string, and return a list of floats which is the
embedding vector for the given query string.

``num_of_dimensions`` is a metadata property that denotes the number of dimensions in any
embedding vector obtained via this class.

``is_unit_vector`` is a metadata property that denotes if embedding vectors obtained via
this class are unit vectors.

The following code block shows an example of how to use ``VertexAIEmbeddingEncoder``. You will
see the updated elements list (with the ``embeddings`` attribute included for each element),
the embedding vector for the query string, and some metadata properties about the embedding model.

You will need credentials configured for your environment (gcloud, workload identity,
etc.), or you will need to store the path to a service account JSON file in the
``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to run this example.
For more information, see: https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm

.. code:: python

    import os

    from unstructured.documents.elements import Text
    from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder

    embedding_encoder = VertexAIEmbeddingEncoder(
        config=VertexAIEmbeddingConfig()
    )
    elements = embedding_encoder.embed_documents(
        elements=[Text("This is sentence 1"), Text("This is sentence 2")],
    )

    query = "This is the query"
    query_embedding = embedding_encoder.embed_query(query=query)

    [print(e.embeddings, e) for e in elements]
    print(query_embedding, query)
    print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
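The ``is_unit_vector`` and ``num_of_dimensions`` metadata properties can be sanity-checked without calling Vertex AI at all; below is a minimal sketch, in plain Python with a made-up example vector (independent of ``unstructured``), of what each property asserts about a returned embedding:

```python
import math

def is_unit_vector(vec, tol=1e-6):
    """True when the vector's Euclidean norm is 1 within tolerance."""
    norm = math.sqrt(sum(x * x for x in vec))
    return abs(norm - 1.0) < tol

# Made-up 3-dimensional "embedding", then its normalized form.
raw = [3.0, 4.0, 12.0]          # Euclidean norm is exactly 13
unit = [x / 13.0 for x in raw]  # Euclidean norm is exactly 1

num_of_dimensions = len(unit)
print(num_of_dimensions)     # 3
print(is_unit_vector(raw))   # False
print(is_unit_vector(unit))  # True
```

A real encoder reports these values for its model's output vectors; the check above is only the definition they encode.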
19 changes: 19 additions & 0 deletions examples/embed/example_vertexai.py
@@ -0,0 +1,19 @@
from unstructured.documents.elements import Text
from unstructured.embed.vertexai import VertexAIEmbeddingConfig, VertexAIEmbeddingEncoder

# https://python.langchain.com/docs/integrations/text_embedding/google_vertex_ai_palm
# To use Vertex AI PaLM you must either have credentials configured for your environment (gcloud,
# workload identity, etc…), or store the path to a service account JSON file as the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.

embedding_encoder = VertexAIEmbeddingEncoder(config=VertexAIEmbeddingConfig())
elements = embedding_encoder.embed_documents(
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
)

query = "This is the query"
query_embedding = embedding_encoder.embed_query(query=query)

[print(e.embeddings, e) for e in elements]
print(query_embedding, query)
print(embedding_encoder.is_unit_vector(), embedding_encoder.num_of_dimensions())
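Running the example itself requires live GCP credentials; the encoder contract it exercises can be sketched offline with a stand-in (the ``FakeElement`` and ``FakeEncoder`` names below are hypothetical, not part of ``unstructured``):

```python
from dataclasses import dataclass

@dataclass
class FakeElement:
    """Minimal stand-in for unstructured's Text element."""
    text: str
    embeddings: list = None

class FakeEncoder:
    """Illustrative encoder following the embed_documents/embed_query contract."""

    def embed_query(self, query):
        # Toy embedding: character count and word count as a 2-d vector.
        return [float(len(query)), float(len(query.split()))]

    def embed_documents(self, elements):
        # Attach an embedding to each element and return the updated list.
        for e in elements:
            e.embeddings = self.embed_query(e.text)
        return elements

encoder = FakeEncoder()
elements = encoder.embed_documents(
    elements=[FakeElement("This is sentence 1"), FakeElement("This is sentence 2")],
)
print([e.embeddings for e in elements])          # [[18.0, 4.0], [18.0, 4.0]]
print(encoder.embed_query("This is the query"))  # [17.0, 4.0]
```

The real ``VertexAIEmbeddingEncoder`` follows the same shape: ``embed_documents`` mutates and returns the element list, while ``embed_query`` returns a bare vector.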
28 changes: 18 additions & 10 deletions requirements/base.txt
@@ -22,8 +22,10 @@ charset-normalizer==3.3.2
click==8.1.7
# via nltk
dataclasses-json==0.6.4
# via -r base.in
dataclasses-json-speakeasy==0.5.11
# via
# -r base.in
# unstructured-client
deepdiff==6.7.1
# via unstructured-client
emoji==2.10.1
# via -r base.in
@@ -41,10 +43,9 @@ langdetect==1.0.9
# via -r base.in
lxml==5.1.0
# via -r base.in
marshmallow==3.20.2
marshmallow==3.21.1
# via
# dataclasses-json
# dataclasses-json-speakeasy
# unstructured-client
mypy-extensions==1.0.0
# via
@@ -54,17 +55,22 @@ nltk==3.8.1
# via -r base.in
numpy==1.26.4
# via -r base.in
ordered-set==4.1.0
# via deepdiff
packaging==23.2
# via
# -c constraints.in
# marshmallow
# unstructured-client
python-dateutil==2.8.2
pypdf==4.1.0
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client
python-iso639==2024.2.7
# via -r base.in
python-magic==0.4.27
# via -r base.in
rapidfuzz==3.6.1
rapidfuzz==3.6.2
# via -r base.in
regex==2023.12.25
# via nltk
@@ -83,18 +89,20 @@ tabulate==0.9.0
# via -r base.in
tqdm==4.66.2
# via nltk
typing-extensions==4.9.0
typing-extensions==4.10.0
# via
# -r base.in
# pypdf
# typing-inspect
# unstructured-client
typing-inspect==0.9.0
# via
# dataclasses-json
# dataclasses-json-speakeasy
# unstructured-client
unstructured-client==0.18.0
# via -r base.in
unstructured-client==0.22.0
# via
# -c constraints.in
# -r base.in
urllib3==1.26.18
# via
# -c constraints.in
7 changes: 4 additions & 3 deletions requirements/build.txt
@@ -4,7 +4,7 @@
#
# pip-compile --output-file=build.txt build.in
#
alabaster==0.7.13
alabaster==0.7.16
# via sphinx
babel==2.14.0
# via sphinx
@@ -36,7 +36,7 @@ idna==3.6
# requests
imagesize==1.4.1
# via sphinx
importlib-metadata==7.0.1
importlib-metadata==7.0.2
# via sphinx
jinja2==3.1.3
# via
@@ -57,6 +57,7 @@ myst-parser==2.0.0
packaging==23.2
# via
# -c base.txt
# -c constraints.in
# sphinx
pygments==2.17.2
# via
@@ -119,5 +120,5 @@ urllib3==1.26.18
# -c base.txt
# -c constraints.in
# requests
zipp==3.17.0
zipp==3.18.1
# via importlib-metadata
5 changes: 5 additions & 0 deletions requirements/constraints.in
@@ -43,3 +43,8 @@ opencv-python==4.8.0.76
opencv-contrib-python==4.8.0.76
onnxruntime==1.15.1
platformdirs==3.10.0
# pinned in langchain-community
packaging<24.0
packaging>=23.2
# NOTE(robinson): pinning temporarily due to failing tests
unstructured-client==0.22.0