# dataset.py
import uuid
import sys
import time
from math import ceil
from typing import List, Dict, Any, Optional, Callable, Union
from tqdm import tqdm
import numpy as np
import deeplake
from deeplake.core.vectorstore.vector_search import utils
from deeplake.core.vectorstore.vector_search.ingestion import ingest_data
from deeplake.constants import (
DEFAULT_VECTORSTORE_DEEPLAKE_PATH,
VECTORSTORE_EXTEND_MAX_SIZE,
DEFAULT_VECTORSTORE_TENSORS,
VECTORSTORE_EXTEND_MAX_SIZE_BY_HTYPE,
MAX_BYTES_PER_MINUTE,
TARGET_BYTE_SIZE,
VECTORSTORE_EXTEND_BATCH_SIZE,
DEFAULT_RATE_LIMITER_KEY_TO_VALUE,
)
from deeplake.util.exceptions import IncorrectEmbeddingShapeError
def create_or_load_dataset(
tensor_params,
dataset_path,
token,
creds,
logger,
read_only,
exec_option,
embedding_function,
overwrite,
runtime,
org_id,
branch="main",
**kwargs,
):
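    """Load an existing Vector Store dataset from ``dataset_path`` or create a new one.

    If the dataset already exists and ``overwrite`` is False, it is loaded and
    ``tensor_params`` must not be customized; otherwise a fresh dataset is created
    with the requested tensors.

    Illustrative sketch (the path, token, and logger below are placeholder
    assumptions, not defaults of this module)::

        import logging
        ds = create_or_load_dataset(
            tensor_params=DEFAULT_VECTORSTORE_TENSORS,
            dataset_path="./example_vectorstore",  # hypothetical local path
            token=None,
            creds=None,
            logger=logging.getLogger(__name__),
            read_only=False,
            exec_option="python",
            embedding_function=None,
            overwrite=False,
            runtime=None,
            org_id=None,
        )
    """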
utils.check_indra_installation(exec_option=exec_option)
if not overwrite and dataset_exists(dataset_path, token, creds, **kwargs):
if tensor_params is not None and tensor_params != DEFAULT_VECTORSTORE_TENSORS:
raise ValueError(
"Vector Store is not empty. You shouldn't specify tensor_params if you're loading from existing dataset."
)
return load_dataset(
dataset_path,
token,
creds,
logger,
read_only,
org_id,
branch,
**kwargs,
)
return create_dataset(
logger,
tensor_params,
dataset_path,
token,
exec_option,
embedding_function,
overwrite,
creds,
runtime,
org_id,
branch,
**kwargs,
)
def dataset_exists(dataset_path, token, creds, **kwargs):
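    """Return True when a dataset already exists at ``dataset_path`` and no ``overwrite`` kwarg was passed."""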
return (
deeplake.exists(dataset_path, token=token, creds=creds)
and "overwrite" not in kwargs
)
def load_dataset(
dataset_path,
token,
creds,
logger,
read_only,
org_id,
branch,
**kwargs,
):
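    """Load an existing Deep Lake dataset, check out ``branch``, and validate its tensors.

    Warns when the default (paid) Deep Lake path is used. A minimal, illustrative
    call (the path and logger are placeholder assumptions)::

        import logging
        ds = load_dataset(
            "./example_vectorstore",
            token=None,
            creds=None,
            logger=logging.getLogger(__name__),
            read_only=False,
            org_id=None,
            branch="main",
        )
    """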
if dataset_path == DEFAULT_VECTORSTORE_DEEPLAKE_PATH:
logger.warning(
f"The default deeplake path location is used: {DEFAULT_VECTORSTORE_DEEPLAKE_PATH}"
" and it is not free. All addtionally added data will be added on"
" top of already existing deeplake dataset."
)
dataset = deeplake.load(
dataset_path,
token=token,
read_only=read_only,
creds=creds,
verbose=False,
org_id=org_id,
**kwargs,
)
dataset.checkout(branch)
check_tensors(dataset)
logger.warning(
f"Deep Lake Dataset in {dataset_path} already exists, "
f"loading from the storage"
)
return dataset
def check_tensors(dataset):
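    """Validate that the dataset contains an ``id``/``ids`` tensor and at least one embedding tensor.

    Raises:
        ValueError: If no embedding tensor exists, if no id tensor exists, or if a
            tensor named ``id``/``ids`` carries the ``embedding`` htype.
    """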
tensors = dataset.tensors
embedding_tensor_exist = False
ids_exist = False
for tensor in tensors:
htype = dataset[tensor].htype
if tensor in ("id", "ids"):
ids_exist = True
if tensor in ("embedding", "embeddings"):
embedding_tensor_exist = True
# TODO: Add back once old datasets without embedding htype are not in circulation
# if htype not in (None, "embedding"):
# raise ValueError(
# f"`{htype}` is not supported htype for embedding tensor. "
# "Supported htype for embedding tensor is: `embedding`"
# )
if htype == "embedding":
if tensor in ("id", "ids"):
raise ValueError(
f"`{tensor}` is not valid name for embedding tensor, as the name is preserved for another tensor"
)
embedding_tensor_exist = True
if not embedding_tensor_exist:
raise ValueError("At least one embedding tensor should exist.")
if not ids_exist:
raise ValueError("`id` tensor was not found in the dataset.")
def create_dataset(
logger,
tensor_params,
dataset_path,
token,
exec_option,
embedding_function,
overwrite,
creds,
runtime,
org_id,
branch,
**kwargs,
):
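    """Create a new Deep Lake dataset for the Vector Store, check out ``branch``, and set up its tensors.

    Requires ``runtime={"tensor_db": True}`` when ``exec_option == "tensor_db"``,
    since such queries only run against Deep Lake's Managed Tensor Database.
    """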
if exec_option == "tensor_db" and (
runtime is None or runtime == {"tensor_db": False}
):
raise ValueError(
"To execute queries using exec_option = 'tensor_db', "
"the Vector Store must be stored in Deep Lake's Managed "
"Tensor Database. To create the Vector Store in the Managed "
"Tensor Database, specify runtime = {'tensor_db': True} when "
"creating the Vector Store."
)
dataset = deeplake.empty(
dataset_path,
token=token,
runtime=runtime,
verbose=False,
overwrite=overwrite,
creds=creds,
org_id=org_id,
**kwargs,
)
dataset.checkout(branch)
create_tensors(tensor_params, dataset, logger, embedding_function)
return dataset
def create_tensors(tensor_params, dataset, logger, embedding_function):
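    """Create the requested tensors, appending a default ``id`` text tensor if none was specified.

    Illustrative sketch of the tensor specification this function consumes (a
    hypothetical minimal example, not a required schema)::

        tensor_params = [
            {"name": "text", "htype": "text"},
            {"name": "embedding", "htype": "embedding"},
        ]
    """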
tensor_names = [tensor["name"] for tensor in tensor_params]
if "id" not in tensor_names and "ids" not in tensor_names:
tensor_params.append(
{
"name": "id",
"htype": "text",
"create_id_tensor": False,
"create_sample_info_tensor": False,
"create_shape_tensor": False,
"chunk_compression": "lz4",
},
)
with dataset:
for tensor_args in tensor_params:
dataset.create_tensor(**tensor_args)
update_embedding_info(logger, dataset, embedding_function)
def delete_and_commit(dataset, ids):
with dataset:
for id in sorted(ids)[::-1]:
dataset.pop(id)
dataset.commit(f"deleted {len(ids)} samples", allow_empty=True)
return True
def delete_and_without_commit(dataset, ids, index_maintenance):
with dataset:
for id in sorted(ids)[::-1]:
dataset.pop(id, index_maintenance=index_maintenance)
def delete_all_samples_if_specified(dataset, delete_all):
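    """If ``delete_all`` is set, drop all VDB indexes and recreate an empty dataset with the same schema.

    Returns the (possibly new) dataset and a flag indicating whether it was wiped.
    """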
if delete_all:
        # Delete any VDB indexes linked to any of the tensors.
for t in dataset.tensors:
dataset[t]._verify_and_delete_vdb_indexes()
dataset = deeplake.like(
dataset.path,
dataset,
overwrite=True,
verbose=False,
)
return dataset, True
return dataset, False
def fetch_embeddings(view, embedding_tensor: str = "embedding"):
return view[embedding_tensor].numpy()
def get_embedding(embedding, embedding_data, embedding_function=None):
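    """Return a ``float32`` query embedding, computing it from ``embedding_data`` if needed.

    A minimal sketch, assuming a caller-supplied object with an ``embed_query``
    method (the class below is a stand-in, not a real embedding model)::

        class FakeEmbedder:
            def embed_query(self, text):
                return [0.1, 0.2, 0.3]  # placeholder vector

        emb = get_embedding(None, ["what is deeplake?"], FakeEmbedder())
        # emb is a 1-dimensional numpy array of dtype float32
    """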
if (
embedding is None
and embedding_function is not None
and embedding_data is not None
):
if isinstance(embedding_data, list):
if len(embedding_data) > 1:
raise NotImplementedError("Batched quering is not supported yet.")
elif len(embedding_data) == 0:
raise ValueError("embedding_data must not be empty.")
else:
embedding_data = embedding_data[0]
if not isinstance(embedding_data, str):
raise ValueError("embedding_data must be a string.")
embedding = embedding_function.embed_query(embedding_data) # type: ignore
if embedding is not None and (
isinstance(embedding, list) or embedding.dtype != "float32"
):
embedding = np.array(embedding, dtype=np.float32)
if isinstance(embedding, np.ndarray):
assert (
embedding.ndim == 1 or embedding.shape[0] == 1
), "Query embedding must be 1-dimensional. Please consider using another embedding function for converting query string to embedding."
return embedding
def preprocess_tensors(
embedding_data=None, embedding_tensor=None, dataset=None, **tensors
):
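    """Normalize incoming tensor data into equal-length lists keyed by tensor name.

    Generates ``id`` values when none are provided, converts non-list data to
    lists, reads image paths with ``deeplake.read`` for image tensors, and maps
    ``embedding_tensor`` names onto ``embedding_data``. Returns the processed
    tensors together with the list of ids.
    """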
    # Generate an id list equal in length to the other tensors.
    # Don't use None tensors when determining the number of items.
not_none_tensors, num_items = get_not_none_tensors(tensors, embedding_data)
ids_tensor = get_id_tensor(dataset)
tensors = populate_id_tensor_if_needed(
ids_tensor, tensors, not_none_tensors, num_items
)
processed_tensors = {ids_tensor: tensors[ids_tensor]}
for tensor_name, tensor_data in tensors.items():
tensor_data = convert_tensor_data_to_list(tensor_data, tensors, ids_tensor)
tensor_data = read_tensor_data_if_needed(tensor_data, dataset, tensor_name)
processed_tensors[tensor_name] = tensor_data
if embedding_data:
for k, v in zip(embedding_tensor, embedding_data):
processed_tensors[k] = v
return processed_tensors, tensors[ids_tensor]
def read_tensor_data_if_needed(tensor_data, dataset, tensor_name):
    # TODO: generalize this method for other htypes that need reading.
if dataset and tensor_name != "id" and dataset[tensor_name].htype == "image":
tensor_data = [
deeplake.read(data) if isinstance(data, str) else data
for data in tensor_data
]
return tensor_data
def convert_tensor_data_to_list(tensor_data, tensors, ids_tensor):
if tensor_data is None:
tensor_data = [None] * len(tensors[ids_tensor])
elif not isinstance(tensor_data, list):
tensor_data = list(tensor_data)
return tensor_data
def get_not_none_tensors(tensors, embedding_data):
not_none_tensors = {k: v for k, v in tensors.items() if v is not None}
try:
num_items = len(next(iter(not_none_tensors.values())))
except StopIteration:
if embedding_data:
num_items = len(embedding_data[0])
else:
num_items = 0
return not_none_tensors, num_items
def populate_id_tensor_if_needed(ids_tensor, tensors, not_none_tensors, num_items):
if "id" not in not_none_tensors and "ids" not in not_none_tensors:
found_id = [str(uuid.uuid1()) for _ in range(num_items)]
tensors[ids_tensor] = found_id
else:
for tensor in not_none_tensors:
if tensor in ("id", "ids"):
break
tensors[ids_tensor] = list(
map(
lambda x: str(x) if isinstance(x, uuid.UUID) else x,
not_none_tensors[tensor],
)
)
return tensors
def get_id_tensor(dataset):
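    """Return ``"ids"`` if the dataset has an ``ids`` tensor, otherwise ``"id"``."""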
return "ids" if "ids" in dataset.tensors else "id"
def create_elements(
processed_tensors: Dict[str, List[Any]],
):
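    """Convert column-oriented tensors into a list of row dictionaries.

    A small sketch of the transformation (made-up values)::

        processed = {"id": ["a", "b"], "text": ["x", "y"]}
        create_elements(processed)
        # -> [{"id": "a", "text": "x"}, {"id": "b", "text": "y"}]
    """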
tensor_names = list(processed_tensors)
elements = [
{tensor_name: processed_tensors[tensor_name][i] for tensor_name in tensor_names}
for i in range(len(processed_tensors[tensor_names[0]]))
]
return elements
def set_embedding_info(tensor, embedding_function):
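    """Store basic metadata about the embedding function in ``tensor.info["embedding"]``.

    Only runs when an embedding function is provided and no embedding info has been
    recorded yet; the fields mirror common attributes of LangChain-style embedding
    objects (``model``, ``deployment``, ``chunk_size``, ...).
    """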
embedding_info = tensor.info.get("embedding")
if embedding_function and not embedding_info:
tensor.info["embedding"] = {
"model": embedding_function.__dict__.get("model"),
"deployment": embedding_function.__dict__.get("deployment"),
"embedding_ctx_length": embedding_function.__dict__.get(
"embedding_ctx_length"
),
"chunk_size": embedding_function.__dict__.get("chunk_size"),
"max_retries": embedding_function.__dict__.get("max_retries"),
}
def update_embedding_info(logger, dataset, embedding_function):
embeddings_tensors = utils.find_embedding_tensors(dataset)
num_embedding_tensors = len(embeddings_tensors)
if num_embedding_tensors == 0:
logger.warning(
"No embedding tensors were found, so the embedding function metadata will not be added to any tensor. "
"Consider doing that manually using `vector_store.dataset.tensor_name.info. = <embedding_function_info_dictionary>`"
)
return
if num_embedding_tensors > 1:
logger.warning(
f"{num_embedding_tensors} embedding tensors were found. "
"It is not clear to which tensor the embedding function information should be added, so the embedding function metadata will not be added to any tensor. "
"Consider doing that manually using `vector_store.dataset.tensor_name.info = <embedding_function_info_dictionary>`"
)
return
set_embedding_info(dataset[embeddings_tensors[0]], embedding_function)
def _compute_batched_embeddings(
embedding_function,
embedding_data,
embedding_tensor,
start_idx,
end_idx,
rate_limiter,
):
"""
Computes embeddings for a given slice of data.
"""
batched_processed_tensors = {}
for func, data, tensor in zip(embedding_function, embedding_data, embedding_tensor):
data_slice = data[start_idx:end_idx]
embedded_data = func(data_slice, rate_limiter=rate_limiter)
try:
return_embedded_data = np.vstack(embedded_data).astype(dtype=np.float32)
except ValueError:
raise IncorrectEmbeddingShapeError()
if len(return_embedded_data) == 0:
raise ValueError("embedding function returned empty list")
batched_processed_tensors[tensor] = return_embedded_data
return batched_processed_tensors
def _slice_non_embedding_tensors(
processed_tensors, embedding_tensor, start_idx, end_idx
):
"""
Slices tensors that are not embeddings for a given range.
"""
batched_processed_tensors = {}
for tensor_name, tensor_data in processed_tensors.items():
if tensor_name not in embedding_tensor:
batched_processed_tensors[tensor_name] = tensor_data[start_idx:end_idx]
return batched_processed_tensors
def extend(
embedding_function: List[Callable],
embedding_data: List[Any],
embedding_tensor: Union[str, List[str]],
processed_tensors: Dict[str, Union[List[Any], np.ndarray]],
dataset: deeplake.core.dataset.Dataset,
rate_limiter: Dict,
_extend_batch_size: int = VECTORSTORE_EXTEND_BATCH_SIZE,
logger=None,
):
"""
Function to extend the dataset with new data.
"""
if embedding_data and not isinstance(embedding_data[0], list):
embedding_data = [embedding_data]
if embedding_function:
number_of_batches = ceil(len(embedding_data[0]) / _extend_batch_size)
progressbar_str = (
f"Creating {len(embedding_data[0])} embeddings in "
f"{number_of_batches} batches of size {min(_extend_batch_size, len(embedding_data[0]))}:"
)
for idx in tqdm(
range(0, len(embedding_data[0]), _extend_batch_size),
progressbar_str,
):
batch_start, batch_end = idx, idx + _extend_batch_size
batched_embeddings = _compute_batched_embeddings(
embedding_function,
embedding_data,
embedding_tensor,
batch_start,
batch_end,
rate_limiter,
)
batched_tensors = _slice_non_embedding_tensors(
processed_tensors, embedding_tensor, batch_start, batch_end
)
batched_processed_tensors = {**batched_embeddings, **batched_tensors}
dataset.extend(batched_processed_tensors, progressbar=False)
else:
logger.info("Uploading data to deeplake dataset.")
dataset.extend(processed_tensors, progressbar=True)
def populate_rate_limiter(rate_limiter):
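    """Fill in missing rate-limiter settings and validate the provided keys.

    Passing ``None`` or ``{}`` returns a disabled limiter with the default byte
    budgets; unknown keys raise ``ValueError``. Illustrative sketch (the 900 KB per
    minute figure is an arbitrary example)::

        limiter = populate_rate_limiter({"enabled": True, "bytes_per_minute": 900_000})
        # "batch_byte_size" is filled in from DEFAULT_RATE_LIMITER_KEY_TO_VALUE
    """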
if rate_limiter is None or rate_limiter == {}:
return {
"enabled": False,
"bytes_per_minute": MAX_BYTES_PER_MINUTE,
"batch_byte_size": TARGET_BYTE_SIZE,
}
else:
rate_limiter_keys = ["enabled", "bytes_per_minute", "batch_byte_size"]
for key in rate_limiter_keys:
if key not in rate_limiter:
rate_limiter[key] = DEFAULT_RATE_LIMITER_KEY_TO_VALUE[key]
for item in rate_limiter:
if item not in rate_limiter_keys:
raise ValueError(
f"Invalid rate_limiter key: {item}. Valid keys are: 'enabled', 'bytes_per_minute', 'batch_byte_size'."
)
return rate_limiter
def extend_or_ingest_dataset(
processed_tensors,
dataset,
embedding_function,
embedding_tensor,
embedding_data,
rate_limiter,
logger,
):
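    """Normalize the rate limiter and extend the dataset with the processed tensors.

    A thin wrapper around :func:`extend`; checkpointed ingestion is temporarily
    disabled (see the TODO below).
    """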
rate_limiter = populate_rate_limiter(rate_limiter)
# TODO: Add back the old logic with checkpointing after indexing is fixed
extend(
embedding_function,
embedding_data,
embedding_tensor,
processed_tensors,
dataset,
rate_limiter,
logger=logger,
)
def convert_id_to_row_id(ids, dataset, search_fn, query, exec_option, filter):
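    """Translate user-facing ``id`` values (or a filter/query) into dataset row indices.

    When ``ids`` is ``None``, the rows are selected by running ``search_fn`` with the
    given ``filter``/``query``; otherwise the dataset is filtered on its ``id``/``ids``
    tensor for backwards compatibility.
    """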
if ids is None:
delete_view = search_fn(
embedding_data=None,
embedding_function=None,
embedding=None,
distance_metric=None,
embedding_tensor=None,
filter=filter,
query=query,
exec_option=exec_option,
return_tensors=False,
return_view=True,
k=int(1e9),
deep_memory=False,
return_tql=False,
)
else:
# backwards compatibility
tensors = dataset.tensors
id_tensor = "id"
if "ids" in tensors:
id_tensor = "ids"
delete_view = dataset.filter(lambda x: x[id_tensor].data()["value"] in ids)
row_ids = list(delete_view.sample_indices)
return row_ids
def check_arguments_compatibility(
ids, filter, query, exec_option, select_all=None, row_ids=None
):
if (
ids is None
and filter is None
and query is None
and row_ids is None
and select_all is None
):
raise ValueError(
"Either ids, row_ids, filter, query, or select_all must be specified."
)
if exec_option not in ("python", "compute_engine", "tensor_db"):
raise ValueError(
"Invalid `exec_option` it should be either `python`, `compute_engine` or `tensor_db`."
)
def search_row_ids(
dataset: deeplake.core.dataset.Dataset,
search_fn: Callable,
ids: Optional[List[str]] = None,
filter: Optional[Union[Dict, Callable]] = None,
query: Optional[str] = None,
exec_option: Optional[str] = "python",
select_all: Optional[bool] = None,
):
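    """Resolve the rows targeted by a delete or update call.

    Returns ``None`` when ``select_all`` is set (meaning all rows), otherwise the
    row indices matching ``ids``, ``filter``, or ``query``. A hedged sketch, assuming
    ``vector_store`` is an already initialized Vector Store (hypothetical)::

        row_ids = search_row_ids(
            dataset=vector_store.dataset,
            search_fn=vector_store.search,
            ids=["<uuid string>"],  # placeholder id value
            exec_option="python",
        )
    """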
check_arguments_compatibility(
ids=ids,
filter=filter,
query=query,
select_all=select_all,
exec_option=exec_option,
)
if select_all:
return None
row_ids = convert_id_to_row_id(
ids=ids,
dataset=dataset,
search_fn=search_fn,
query=query,
exec_option=exec_option,
filter=filter,
)
return row_ids