implement #18;
ellipsis-dev[bot] committed Mar 25, 2024
1 parent 74110c4 commit a8905e0
Showing 5 changed files with 183 additions and 46 deletions.
38 changes: 38 additions & 0 deletions sunholo/components/README.md
@@ -0,0 +1,38 @@
# Sunholo Components

This folder contains several Python files that define various functions used in the Sunholo project. Below is a brief overview of each file and the functions it contains; a combined usage sketch follows the file summaries.

## llm.py

This file contains functions related to Large Language Models (LLMs). The functions include:

- pick_llm: Picks an LLM, embeddings, and chat model based on the vector_name parameter.
- pick_streaming: Returns a boolean indicating whether the LLM configured for vector_name supports streaming.
- get_llm: Configures an LLM based on the vector_name and optional model parameters.
- get_llm_chat: Configures a chat LLM based on the vector_name and optional model parameters.
- get_embeddings: Picks an embedding based on the vector_name parameter.
- pick_embedding: Configures embeddings based on the llm_str parameter.

## prompt.py

This file contains functions related to prompts. The functions include:

- pick_prompt: Picks a custom prompt based on the vector_name parameter.
- pick_chat_buddy: Picks a chat buddy based on the vector_name parameter.
- pick_agent: Returns a boolean value indicating whether an agent is enabled for the vector_name parameter.
- pick_shared_vectorstore: Picks a shared vectorstore based on the vector_name and embeddings parameters.
- get_chat_history: Gets the chat history based on the inputs and vector_name parameters.

## retriever.py

This file contains functions related to retrievers. The functions include:

- load_memories: Loads memories based on the vector_name parameter.
- pick_retriever: Picks a retriever based on the vector_name and embeddings parameters.

## vectorstore.py

This file contains functions related to vectorstores. The functions include:

- pick_vectorstore: Picks a vectorstore based on the vs_str, vector_name, and embeddings parameters.
- load_memories: Loads memories based on the vector_name parameter.
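For orientation, here is a minimal sketch of how these components might be wired together. It is not part of this commit: the vector_name "my_docs" is hypothetical and would need an entry in config/llm_config.yaml, and the import paths assume the sunholo/components layout described above.

```python
from sunholo.components.llm import pick_llm
from sunholo.components.prompt import pick_prompt
from sunholo.components.retriever import pick_retriever

# "my_docs" is a hypothetical vector_name configured in config/llm_config.yaml
llm, embeddings, llm_chat = pick_llm("my_docs")
retriever = pick_retriever("my_docs", embeddings=embeddings)
prompt = pick_prompt("my_docs", chat_history=[])
```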
87 changes: 51 additions & 36 deletions sunholo/components/llm.py
@@ -16,6 +16,17 @@

logging = setup_logging()

"""
This function selects a Language Learning Model (LLM) based on the vector_name parameter.
It loads the llm_str parameter from a configuration file and then configures the LLM, embeddings, and llm_chat based on this parameter.
If the llm_str parameter matches one of the predefined strings ('openai', 'vertex', 'codey', 'model_garden'), the function configures the LLM, embeddings, and llm_chat accordingly. If the llm_str parameter does not match any of the predefined strings, the function raises a NotImplementedError.
:param vector_name: The name of the vector used to select the LLM.
:return: A tuple containing the configured LLM, embeddings, and llm_chat.
:raises NotImplementedError: If the llm_str parameter does not match any of the predefined strings.
"""
def pick_llm(vector_name):
logging.debug('Picking llm')

@@ -36,27 +47,27 @@ def pick_llm(vector_name):
        llm_chat = get_llm_chat(vector_name)
        embeddings = get_embeddings(vector_name)
        logging.debug("Chose VertexAI code-bison")
    elif llm_str == 'model_garden':
        llm = get_llm(vector_name)
        llm_chat = llm
        embeddings = None
        logging.debug("Chose VertexAIModelGarden")
    else:
        raise NotImplementedError(f'No llm implemented for {llm_str}')

    return llm, embeddings, llm_chat

"""
This function determines whether streaming should be used based on the llm_str parameter.

It loads the llm_str parameter from a configuration file and then checks if it matches one of the predefined strings ('openai', 'gemini', 'vertex'). If it does, the function returns True, indicating that streaming should be used. Otherwise, it returns False.

:param vector_name: The name of the vector used to select the LLM.
:return: A boolean value indicating whether streaming should be used.
"""
def pick_streaming(vector_name):

    llm_str = load_config_key("llm", vector_name, filename = "config/llm_config.yaml")

    if llm_str == 'openai' or llm_str == 'gemini' or llm_str == 'vertex':
        return True

    return False
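A hedged usage sketch of the two functions above (the vector_name is hypothetical and assumes a matching llm entry in config/llm_config.yaml):

```python
# Decide at request time whether to stream tokens back to the caller.
llm, embeddings, llm_chat = pick_llm("my_docs")
if pick_streaming("my_docs"):
    pass  # e.g. attach a streaming callback handler to the request
```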

"""
This function configures a Large Language Model (LLM) based on the vector_name and model parameters.

It loads the llm_str parameter from a configuration file and then configures the LLM based on this parameter. If the llm_str parameter matches one of the predefined strings ('openai', 'vertex', 'model_garden', 'anthropic'), the function configures the LLM accordingly. If the llm_str parameter does not match any of the predefined strings, the function raises a NotImplementedError.

:param vector_name: The name of the vector used to select the LLM.
:param model: The model to be used for the LLM. If not provided, a default model is selected based on the llm_str parameter.
:param config_file: The configuration file from which to load the llm_str parameter. Defaults to 'config/llm_config.yaml'.
:return: The configured LLM.
:raises NotImplementedError: If the llm_str parameter does not match any of the predefined strings.
"""
def get_llm(vector_name, model=None, config_file="config/llm_config.yaml"):
    llm_str = load_config_key("llm", vector_name, filename=config_file)
    model_lookup_filepath = get_module_filepath("lookup/model_lookup.yaml")
@@ -95,17 +106,17 @@ def get_llm(vector_name, model=None, config_file="config/llm_config.yaml"):
        return VertexAIModelGarden(project=model_garden_config['project_id'],
                                   endpoint_id=model_garden_config['endpoint_id'],
                                   location=model_garden_config['location'],
                                   allowed_model_args=["max_tokens"])
    elif llm_str == 'anthropic':
        from langchain_anthropic import AnthropicLLM
        if model is None:
            model = 'claude-2.1'
            logging.info(f"No 'model' value in config file - selecting default {model}")

        return AnthropicLLM(model_name = model, temperature=0)

    if llm_str is None:
        raise NotImplementedError(f'No llm implemented for {llm_str}')

"""
This function configures a Large Language Model (LLM) for chat based on the vector_name and model parameters.

It loads the llm_str parameter from a configuration file and then configures the LLM for chat based on this parameter. If the llm_str parameter matches one of the predefined strings ('openai', 'vertex', 'gemini', 'anthropic'), the function configures the LLM for chat accordingly. If the llm_str parameter does not match any of the predefined strings, the function raises a NotImplementedError.

:param vector_name: The name of the vector used to select the LLM.
:param model: The model to be used for the LLM. If not provided, a default model is selected based on the llm_str parameter.
:param config_file: The configuration file from which to load the llm_str parameter. Defaults to 'config/llm_config.yaml'.
:return: The configured LLM for chat.
:raises NotImplementedError: If the llm_str parameter does not match any of the predefined strings.
"""
def get_llm_chat(vector_name, model=None, config_file="config/llm_config.yaml"):
    llm_str = load_config_key("llm", vector_name, filename=config_file)
    if not model:
@@ -139,24 +150,28 @@ def get_llm_chat(vector_name, model=None, config_file="config/llm_config.yaml"):

        return ChatGoogleGenerativeAI(model_name = model, temperature=0)

    elif llm_str == 'anthropic':
        from langchain_anthropic import ChatAnthropic
        if model is None:
            model = 'claude-3-opus-20240229'
            logging.info(f"No 'model' value in config file - selecting default {model}")

        return ChatAnthropic(model_name = model, temperature=0)

    if llm_str is None:
        raise NotImplementedError(f'No llm implemented for {llm_str}')

"""
This function selects an embedding based on the vector_name parameter.

It loads the llm_str parameter from a configuration file and then calls the pick_embedding function with this parameter to select an embedding.

:param vector_name: The name of the vector used to select the embedding.
:return: The selected embedding.
"""
def get_embeddings(vector_name):
    llm_str = load_config_key("llm", vector_name, filename="config/llm_config.yaml")

    return pick_embedding(llm_str)

"""
This function selects an embedding based on the llm_str parameter.
If the llm_str parameter matches one of the predefined strings ('openai', 'vertex', 'codey', 'anthropic'), the function selects an embedding accordingly. If the llm_str parameter does not match any of the predefined strings, the function raises a NotImplementedError.
:param llm_str: The string used to select the embedding.
:return: The selected embedding.
:raises NotImplementedError: If the llm_str parameter does not match any of the predefined strings.
"""
def pick_embedding(llm_str: str):
# get embedding directly from llm_str
# Configure embeddings based on llm_str
@@ -175,4 +190,4 @@ def pick_embedding(llm_str: str):
        return GoogleGenerativeAIEmbeddings(model="models/embedding-001") #TODO add embedding type

    if llm_str is None:
        raise NotImplementedError(f'No embeddings implemented for {llm_str}')
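A hedged sketch of the two embedding entry points (values are hypothetical; get_embeddings resolves the llm key from config/llm_config.yaml, while pick_embedding takes an llm_str directly):

```python
embeddings = get_embeddings("my_docs")       # lookup via vector_name
same_embeddings = pick_embedding("openai")   # lookup via llm_str
```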
63 changes: 55 additions & 8 deletions sunholo/components/prompt.py
@@ -21,9 +21,19 @@
from ..utils import load_config_key
from .vectorstore import pick_vectorstore


def pick_prompt(vector_name, chat_history=[]):
    """
    This function picks a custom prompt based on the vector_name parameter and an optional chat_history parameter.

    It loads the prompt_str parameter from a configuration file and then configures the prompt based on this parameter. If the prompt_str parameter contains certain predefined strings, the function raises a ValueError.

    The function returns a PromptTemplate object that represents the configured prompt.

    :param vector_name: The name of the vector used to select the prompt.
    :param chat_history: A list of chat history items. Defaults to an empty list.
    :return: A PromptTemplate object that represents the configured prompt.
    :raises ValueError: If the prompt_str parameter contains certain predefined strings.
    """
    logging.debug('Picking prompt')

    prompt_str = load_config_key("prompt", vector_name, filename = "config/llm_config.yaml")
@@ -69,7 +79,8 @@ def pick_prompt(vector_name, chat_history=[]):
    else:
        follow_up += ".\n"

    memory_str = ("\n## Your Memory (ignore if not relevant to question)\n"
                  "{context}\n")

    current_conversation = ""
    if chat_summary != "":
@@ -90,30 +101,66 @@ def pick_prompt(vector_name, chat_history=[]):
    )

    return QA_PROMPT

def pick_chat_buddy(vector_name):
    """
    This function picks a chat buddy based on the vector_name parameter.

    It loads the chat_buddy parameter from a configuration file and then configures the chat buddy based on this parameter. If the chat_buddy parameter is not None, the function also loads the buddy_description parameter from the configuration file.

    The function returns a tuple containing the chat buddy and the buddy description.

    :param vector_name: The name of the vector used to select the chat buddy.
    :return: A tuple containing the chat buddy and the buddy description.
    """
    chat_buddy = load_config_key("chat_buddy", vector_name, filename = "config/llm_config.yaml")
    if chat_buddy is not None:
        logging.info(f"Got chat buddy {chat_buddy} for {vector_name}")
        buddy_description = load_config_key("chat_buddy_description", vector_name)
        return chat_buddy, buddy_description
    return None, None

def pick_agent(vector_name):
    """
    This function determines whether an agent should be picked based on the vector_name parameter.

    It loads the agent_str parameter from a configuration file and then checks if it is equal to 'yes'. If it is, the function returns True. Otherwise, it returns False.

    :param vector_name: The name of the vector used to determine whether an agent should be picked.
    :return: A boolean value indicating whether an agent should be picked.
    """
    agent_str = load_config_key("agent", vector_name, filename = "config/llm_config.yaml")
    if agent_str == "yes":
        return True

    return False
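A hedged sketch of these two config lookups (the vector_name is hypothetical; both keys are optional in config/llm_config.yaml):

```python
buddy, buddy_description = pick_chat_buddy("my_docs")  # (None, None) if unset
if pick_agent("my_docs"):
    pass  # route the question to an agent instead of plain retrieval QA
```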

def pick_shared_vectorstore(vector_name, embeddings):
    """
    This function picks a shared vectorstore based on the vector_name and embeddings parameters.

    It loads the shared_vectorstore parameter from a configuration file and then calls the pick_vectorstore function with this parameter and the embeddings parameter to pick a shared vectorstore.

    The function returns the picked shared vectorstore.

    :param vector_name: The name of the vector used to pick the shared vectorstore.
    :param embeddings: The embeddings used to pick the shared vectorstore.
    :return: The picked shared vectorstore.
    """
    shared_vectorstore = load_config_key("shared_vectorstore", vector_name, filename = "config/llm_config.yaml")
    vectorstore = pick_vectorstore(shared_vectorstore, embeddings)
    return vectorstore

def get_chat_history(inputs, vector_name, last_chars=1000, summary_chars=1500) -> str:
    """
    This function gets the chat history based on the inputs and vector_name parameters, and optional last_chars and summary_chars parameters.

    It prepares the full chat history, gets the last `last_chars` characters of the full chat history, summarizes the chat history, and then concatenates the summary and the last `last_chars` characters of the chat history.

    The function returns the concatenated summary and last `last_chars` characters of the chat history.

    :param inputs: A list of inputs used to get the chat history.
    :param vector_name: The name of the vector used to get the chat history.
    :param last_chars: The number of last characters of the chat history to get. Defaults to 1000.
    :param summary_chars: The number of characters of the summary to get. Defaults to 1500.
    :return: The concatenated summary and last `last_chars` characters of the chat history.
    """
    from langchain.schema import Document
    from ..summarise import summarise_docs
@@ -148,4 +195,4 @@ def get_chat_history(inputs, vector_name, last_chars=1000, summary_chars=1500) -> str:
    summary = text_sum[:summary_chars]

    # Concatenate the summary and the last `last_chars` characters of the chat history
    return summary + "\n### Recent Chat History\n..." + recent_history
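A hedged sketch of building the QA prompt with recent history folded in. The (human, ai) pair shape for chat_history is an assumption made for illustration; check get_chat_history for the exact structure it expects:

```python
# Hypothetical history; the pair shape is an assumption, not confirmed by this diff.
history = [("What does Sunholo do?", "It configures GenAI components from YAML.")]
QA_PROMPT = pick_prompt("my_docs", chat_history=history)
```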
18 changes: 17 additions & 1 deletion sunholo/components/retriever.py
@@ -27,6 +27,14 @@
logging = setup_logging()

def load_memories(vector_name):
    """
    This function loads memory settings for a given vector name from a configuration file.

    It loads the memory settings from a configuration file using the load_config_key function and logs the loaded memory settings. If no memory settings are found, it logs this information and returns None.

    :param vector_name: The name of the vector for which to load the memory settings.
    :return: The loaded memory settings, or None if no memory settings are found.
    """
    memories = load_config_key("memory", vector_name, filename="config/llm_config.yaml")
    logging.info(f"Found memory settings for {vector_name}: {memories}")
    if len(memories) == 0:
@@ -36,7 +44,15 @@ def load_memories(vector_name):
    return memories

def pick_retriever(vector_name, embeddings=None):
    """
    This function creates a list of retrievers based on the memory settings loaded by the load_memories function.

    It first calls the load_memories function to load the memory settings for the vector name. Then it iterates over the memory settings and, for each memory, checks whether a vectorstore is specified. If a vectorstore is specified, it picks the vectorstore and creates a retriever for it. If a provider is specified and it is 'GoogleCloudEnterpriseSearchRetriever', it creates a GoogleCloudEnterpriseSearchRetriever. Finally, it merges all the retrievers into a MergerRetriever and returns it.

    :param vector_name: The name of the vector for which to create the retrievers.
    :param embeddings: The embeddings used to pick the vectorstore. Defaults to None.
    :return: The created MergerRetriever, or None if no retrievers were created.
    """
    memories = load_memories(vector_name)

    retriever_list = []
@@ -78,4 +94,4 @@ def pick_retriever(vector_name, embeddings=None):
        base_compressor=pipeline, base_retriever=lotr,
        k=3)

    return retriever
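A hedged usage sketch (hypothetical vector_name; get_relevant_documents is the standard LangChain retriever call):

```python
from sunholo.components.llm import get_embeddings

retriever = pick_retriever("my_docs", embeddings=get_embeddings("my_docs"))
if retriever is not None:  # None when no memories are configured
    docs = retriever.get_relevant_documents("solar permit rules")
```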
23 changes: 22 additions & 1 deletion sunholo/components/vectorstore.py
@@ -18,6 +18,27 @@
logging = setup_logging()


"""
This function picks a vectorstore based on the vs_str, vector_name, and embeddings parameters.
It uses the vs_str parameter to determine which type of vectorstore to pick. It supports several types of vectorstores, including 'supabase', 'cloudsql', 'alloydb', and 'lancedb'.
For 'supabase', it imports the necessary modules from supabase and langchain.vectorstores, sets up the vectorstore using the setup_supabase function, and returns a SupabaseVectorStore object.
For 'cloudsql', it imports the necessary modules from langchain.vectorstores.pgvector, sets up the vectorstore using the setup_cloudsql function, and returns a PGVector object.
For 'alloydb', it imports the necessary modules from langchain_google_alloydb_pg and google.cloud.alloydb.connector, sets up the vectorstore using the create_alloydb_table and create_alloydb_engine functions, and returns an AlloyDBVectorStore object.
For 'lancedb', it imports the necessary modules from ..patches.langchain.lancedb and lancedb, sets up the vectorstore using the lancedb.connect function, and returns a LanceDB object.
If the vs_str parameter does not match any of the supported types, it raises a NotImplementedError.
:param vs_str: The string used to select the type of vectorstore.
:param vector_name: The name of the vector used to select the vectorstore.
:param embeddings: The embeddings used to select the vectorstore.
:return: The selected vectorstore.
:raises NotImplementedError: If the vs_str parameter does not match any of the supported types.
"""
def pick_vectorstore(vs_str, vector_name, embeddings):
    logging.debug('Picking vectorstore')

@@ -139,4 +160,4 @@ def pick_vectorstore(vs_str, vector_name, embeddings):
        return vectorstore

    else:
        raise NotImplementedError(f'No vectorstore implemented for {vs_str}')
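A hedged usage sketch (hypothetical values; 'lancedb' is one of the vs_str options handled above, and similarity_search is the standard LangChain vectorstore call):

```python
from sunholo.components.llm import get_embeddings
from sunholo.components.vectorstore import pick_vectorstore

vectorstore = pick_vectorstore("lancedb", "my_docs", get_embeddings("my_docs"))
hits = vectorstore.similarity_search("solar permit rules", k=4)
```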
