@@ -277,8 +230,6 @@ m = db.add(
- - ## Installation #### # Option 1. SuperDuperDB Library @@ -301,138 +252,10 @@ make testenv_image make testenv_init ``` - ## Preview -Here are snippets which give you a sense of how `superduperdb` works and how simple it is to use. You can visit the docs to learn more. - - -#### - Deploy ML/AI models to your database: -Automatically compute outputs (inference) with your database in a single environment. - -```python -import pymongo -from sklearn.svm import SVC - -from superduperdb import superduper - -# Make your db superduper! -db = superduper(pymongo.MongoClient().my_db) - -# Models client can be converted to SuperDuperDB objects with a simple wrapper. -model = superduper(SVC()) - -# Add the model into the database -db.add(model) - -# Predict on the selected data. -model.predict(X='input_col', db=db, select=Collection(name='test_documents').find({'_fold': 'valid'})) -``` - - -#### - Train models directly from your database. -Simply by querying your database, without additional ingestion and pre-processing: - -```python -import pymongo -from sklearn.svm import SVC - -from superduperdb import superduper - -# Make your db superduper! -db = superduper(pymongo.MongoClient().my_db) - -# Models client can be converted to SuperDuperDB objects with a simple wrapper. -model = superduper(SVC()) - -# Fit model on the training data. -model.fit(X='input_col', y='target_col', db=db, select=Collection(name='test_documents').find({})) -``` - -#### - Vector-Search your data: -Use your existing favorite database as a vector search database, including model management and serving. - -```python -# First a "Listener" makes sure vectors stay up-to-date -indexing_listener = Listener(model=OpenAIEmbedding(), key='text', select=collection.find()) - -# This "Listener" is linked with a "VectorIndex" -db.add(VectorIndex('my-index', indexing_listener=indexing_listener)) - -# The "VectorIndex" may be used to search data. Items to be searched against are passed -# to the registered model and vectorized. No additional app layer is required. -db.execute(collection.like({'text': 'clothing item'}, 'my-index').find({'brand': 'Nike'})) -``` - -#### - Integrate AI APIs to work together with other models. -Use OpenAI, Jina AI, PyTorch or Hugging face model as an embedding model for vector search. - -```python -# Create a ``VectorIndex`` instance with indexing listener as OpenAIEmbedding and add it to the database. -db.add( - VectorIndex( - identifier='my-index', - indexing_listener=Listener( - model=OpenAIEmbedding(identifier='text-embedding-ada-002'), - key='abstract', - select=Collection(name='wikipedia').find(), - ), - ) -) -# The above also executes the embedding model (openai) with the select query on the key. - -# Now we can use the vector-index to search via meaning through the wikipedia abstracts -cur = db.execute( - Collection(name='wikipedia') - .like({'abstract': 'philosophers'}, n=10, vector_index='my-index') -) -``` - - -#### - Add a Llama 2 model to SuperDuperDB!: -```python -model_id = "meta-llama/Llama-2-7b-chat-hf" -tokenizer = AutoTokenizer.from_pretrained(model_id) -pipeline = transformers.pipeline( - "text-generation", - model=model_id, - torch_dtype=torch.float16, - device_map="auto", -) - -model = Pipeline( - identifier='my-sentiment-analysis', - task='text-generation', - preprocess=tokenizer, - object=pipeline, - torch_dtype=torch.float16, - device_map="auto", -) - -# You can easily predict on your collection documents. -model.predict( - X=Collection(name='test_documents').find(), - db=db, - do_sample=True, - top_k=10, - num_return_sequences=1, - eos_token_id=tokenizer.eos_token_id, - max_length=200 -) -``` - -#### - Use models outputs as inputs to downstream models: - -```python -model.predict( - X='input_col', - db=db, - select=coll.find().featurize({'X': '