Skip to content

Commit

Permalink
[Improvements] Improve logging and fix insertion in data_sources table
Browse files (browse the repository at this point in the history)
  • Loading branch information
deshraj committed Apr 11, 2024
1 parent f861987 commit 536f85b
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 16 deletions.
14 changes: 5 additions & 9 deletions embedchain/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,9 @@
import yaml
from tqdm import tqdm

from embedchain.cache import (
Config,
ExactMatchEvaluation,
SearchDistanceEvaluation,
cache,
gptcache_data_manager,
gptcache_pre_function,
)
from embedchain.cache import (Config, ExactMatchEvaluation,
SearchDistanceEvaluation, cache,
gptcache_data_manager, gptcache_pre_function)
from embedchain.client import Client
from embedchain.config import AppConfig, CacheConfig, ChunkerConfig
from embedchain.core.db.database import get_session, init_db, setup_engine
Expand All @@ -25,7 +20,8 @@
from embedchain.embedder.base import BaseEmbedder
from embedchain.embedder.openai import OpenAIEmbedder
from embedchain.evaluation.base import BaseMetric
from embedchain.evaluation.metrics import AnswerRelevance, ContextRelevance, Groundedness
from embedchain.evaluation.metrics import (AnswerRelevance, ContextRelevance,
Groundedness)
from embedchain.factory import EmbedderFactory, LlmFactory, VectorDBFactory
from embedchain.helpers.json_serializable import register_deserializable
from embedchain.llm.base import BaseLlm
Expand Down
14 changes: 9 additions & 5 deletions embedchain/embedchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ def add(
if data_type in {DataType.DOCS_SITE}:
self.is_docs_site_instance = True

# Convert the source to a string if it is not already
if not isinstance(source, str):
source = str(source)

# Insert the data into the 'ec_data_sources' table
self.db_session.add(
DataSource(
Expand Down Expand Up @@ -310,12 +314,12 @@ def _load_and_embed(
new_doc_id = embeddings_data["doc_id"]

if existing_doc_id and existing_doc_id == new_doc_id:
print("Doc content has not changed. Skipping creating chunks and embeddings")
logger.info("Doc content has not changed. Skipping creating chunks and embeddings")
return [], [], [], 0

# this means that doc content has changed.
if existing_doc_id and existing_doc_id != new_doc_id:
print("Doc content has changed. Recomputing chunks and embeddings intelligently.")
logger.info("Doc content has changed. Recomputing chunks and embeddings intelligently.")
self.db.delete({"doc_id": existing_doc_id})

# get existing ids, and discard doc if any common id exist.
Expand All @@ -341,7 +345,7 @@ def _load_and_embed(
src_copy = src
if len(src_copy) > 50:
src_copy = src[:50] + "..."
print(f"All data from {src_copy} already exists in the database.")
logger.info(f"All data from {src_copy} already exists in the database.")
# Make sure to return a matching return type
return [], [], [], 0

Expand Down Expand Up @@ -388,12 +392,12 @@ def _load_and_embed(
if batch_docs:
self.db.add(documents=batch_docs, metadatas=batch_meta, ids=batch_ids, **kwargs)
except Exception as e:
print(f"Failed to add batch due to a bad request: {e}")
logger.info(f"Failed to add batch due to a bad request: {e}")
# Handle the error, e.g., by logging, retrying, or skipping
pass

count_new_chunks = self.db.count() - chunks_before_addition
print(f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}")
logger.info(f"Successfully saved {str(src)[:100]} ({chunker.data_type}). New chunks count: {count_new_chunks}")

return list(documents), metadatas, ids, count_new_chunks

Expand Down
1 change: 0 additions & 1 deletion embedchain/llm/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def get_llm_model_answer(self, prompt):

@staticmethod
def _get_answer(prompt: str, config: BaseLlmConfig) -> str:

chat = ChatAnthropic(
anthropic_api_key=os.environ["ANTHROPIC_API_KEY"], temperature=config.temperature, model_name=config.model
)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "embedchain"
version = "0.1.99"
version = "0.1.100"
description = "Simplest open source retrieval (RAG) framework"
authors = [
"Taranjeet Singh <taranjeet@embedchain.ai>",
Expand Down

0 comments on commit 536f85b

Please sign in to comment.