Skip to content

Commit

Permalink
Indexing speed (#42)
Browse files (browse the repository at this point in the history)
* Speed up indexing a lot

* remove title mapper

* fix docker ports in prod
  • Loading branch information
veekaybee committed Jul 6, 2023
1 parent e685ec3 commit da47dee
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 51 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ services:
container_name: redis
command: redis-server --port 6379 --appendonly yes --protected-mode no --loadmodule /opt/redis-stack/lib/redisearch.so --loadmodule /opt/redis-stack/lib/rejson.so


flask:
build:
context: .
Expand Down
4 changes: 2 additions & 2 deletions src/api/templates/search.html
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
<tbody>
{% for item in data %}
<tr>
<td>{{ item[0].decode() if item[0] is not none }}</td>
<td>{{ item[1].decode() if item[0] is not none}}</td>
<td>{{ item[0] if item[0] is not none }}</td>
<td>{{ item[1] if item[0] is not none}}</td>
<td>{{ item[2] }}</td>
</tr>
{% endfor %}
Expand Down
6 changes: 3 additions & 3 deletions src/index/index_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from pathlib import Path

from index.indexer import Indexer
from index.title_mapper import TitleMapper
from inout import file_reader as f
from inout.redis_conn import RedisConnection

Expand Down Expand Up @@ -31,11 +30,12 @@
# Delete existing index
indexer.drop_index()

# Load Embeddings
indexer.write_embeddings_to_search_index(columns=["title", "index", "author", "embeddings"])

# Recreate schema based on Indexer
indexer.create_search_index_schema()

# Load Embeddings
indexer.write_embeddings_to_search_index(columns=["title", "index", "author", "embeddings"])

# Check Search Index Metadata
indexer.get_search_index_metadata()
14 changes: 9 additions & 5 deletions src/index/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import pyarrow.parquet as pq
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

from inout import file_reader as f

Expand Down Expand Up @@ -95,7 +96,10 @@ def create_search_index_schema(
logging.info(f"using {self.vector_field_name}, {self.float_type}, {self.dim}")
logging.info(f"using {schema}")

r.ft(self.index_name).create_index(schema)
r.ft(self.index_name).create_index(
fields=schema,
definition=IndexDefinition(prefix=["viberary:"], index_type=IndexType.HASH),
)
r.ft(self.index_name).config_set("default_dialect", 2)
logging.info(f"Creating Redis schema: {schema} in index {self.index_name}")

Expand All @@ -109,14 +113,14 @@ def write_embeddings_to_search_index(self, columns):
for i, (k, v) in enumerate(vector_dict.items()):
np_vector = v[2].astype(np.float64)
pipe.hset(
i,
f"{self.index_name}:{i}",
mapping={
self.vector_field_name: np_vector.tobytes(),
self.title_field_name: v[0],
self.author_field_name: v[1],
},
)
if i % 500 == 0:
if i % 5000 == 0:
logging.info(f"Inserting {i} vector into Redis index {self.index_name}")
pipe.execute()

Expand All @@ -125,6 +129,6 @@ def get_search_index_metadata(self):
metadata = r.ft(self.index_name).info()
logging.info(
f"name: {metadata['index_name']}, "
f"docs: {metadata['max_doc_id']}, "
f"time:{metadata['total_indexing_time']} seconds"
f"docs: {metadata['num_records']}, "
f"time:{metadata['total_indexing_time']} ms"
)
41 changes: 0 additions & 41 deletions src/index/title_mapper.py

This file was deleted.

0 comments on commit da47dee

Please sign in to comment.