Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

How to split manifests into several parts #1334

Open
OswaldoBornemann opened this issue May 8, 2024 · 1 comment
Open

How to split manifests into several parts #1334

OswaldoBornemann opened this issue May 8, 2024 · 1 comment

Comments

@OswaldoBornemann
Copy link

How can I split manifests into several parts? I noticed that a cut file can be split using split. Do the other manifests have a similar operation?

@pzelasko
Copy link
Collaborator

pzelasko commented May 8, 2024

There are two methods, split and split_lazy, defined on each manifest type:

lhotse/lhotse/cut/set.py

Lines 821 to 882 in 4f014b1

def split(
    self,
    num_splits: int,
    shuffle: bool = False,
    drop_last: bool = False,
) -> List["CutSet"]:
    """
    Partition this :class:`~lhotse.CutSet` into ``num_splits`` parts of equal size.

    :param num_splits: how many parts to produce.
    :param shuffle: when ``True``, shuffle the order of the items before partitioning.
    :param drop_last: controls what happens when the number of items is not
        divisible by ``num_splits``. With ``False`` (default), the parts may have
        unequal lengths. With ``True``, the last element of some parts may be
        discarded so that every part is equally long.
    :return: a list of :class:`~lhotse.CutSet` objects, one per part.
    """
    # The partitioning itself is delegated to ``split_sequence``;
    # each resulting piece is then wrapped back into a CutSet.
    pieces = split_sequence(
        self,
        num_splits=num_splits,
        shuffle=shuffle,
        drop_last=drop_last,
    )
    return [CutSet(piece) for piece in pieces]
def split_lazy(
    self,
    output_dir: Pathlike,
    chunk_size: int,
    prefix: str = "",
    num_digits: int = 8,
    start_idx: int = 0,
) -> List["CutSet"]:
    """
    Split this manifest (opened either lazily or eagerly) into chunks of
    ``chunk_size`` items each; the last chunk is typically shorter.

    To stay memory efficient, each chunk is written to disk in ``.jsonl.gz``
    format while the input manifest is being iterated.

    .. note:: For lowest memory usage, open the input manifest with
        ``load_manifest_lazy`` before calling this method.

    :param output_dir: directory where the chunk manifests are saved.
        Each chunk is saved at: ``{output_dir}/{prefix}.{split_idx}.jsonl.gz``
    :param chunk_size: the number of items in each chunk.
    :param prefix: the file name prefix of each chunk manifest.
    :param num_digits: the width of ``split_idx``; it is left-padded with zeros
        to reach this width.
    :param start_idx: the split index to start counting from (default is ``0``).
    :return: a list of lazily opened chunk manifests.
    """
    # All the work is delegated to ``split_manifest_lazy``, with this
    # manifest passed as the sequence to split.
    chunks = split_manifest_lazy(
        self,
        output_dir=output_dir,
        chunk_size=chunk_size,
        prefix=prefix,
        num_digits=num_digits,
        start_idx=start_idx,
    )
    return chunks

Also accessible from CLI:

@cli.command()
@click.argument("num_splits", type=int)
@click.argument(
    "manifest", type=click.Path(exists=True, dir_okay=False, allow_dash=True)
)
@click.argument("output_dir", type=click.Path())
@click.option(
    "-s",
    "--shuffle",
    is_flag=True,
    help="Optionally shuffle the sequence before splitting.",
)
@click.option(
    "--pad/--no-pad",
    default=True,
    help="Whether to pad the split output idx with zeros (e.g. 00, 01, 02, .., 10).",
)
@click.option(
    "-i",
    "--start-idx",
    type=int,
    default=0,
    help="Count splits starting from this index.",
)
def split(
    num_splits: int,
    manifest: Pathlike,
    output_dir: Pathlike,
    shuffle: bool,
    pad: bool,
    start_idx: int,
):
    """
    Load MANIFEST, split it into NUM_SPLITS equal parts and save as separate manifests in OUTPUT_DIR.
    When your manifests are very large, prefer to use "lhotse split-lazy" instead.
    """
    # NOTE: the docstring above is the command's help text; keep it user-facing.
    from lhotse.serialization import load_manifest_lazy_or_eager

    output_dir = Path(output_dir)
    manifest = Path(manifest)
    # Every extension of the input joined together (e.g. ".jsonl.gz"), so the
    # output parts keep the same format and compression as the input.
    suffix = "".join(manifest.suffixes)
    any_set = load_manifest_lazy_or_eager(manifest)
    parts = any_set.split(num_splits=num_splits, shuffle=shuffle)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Zero-padding width. It must accommodate the largest index that will be
    # written, which depends on ``start_idx`` and not only on ``num_splits``;
    # otherwise e.g. ``num_splits=5, start_idx=8`` yields "8", "9", "10", ...
    # with inconsistent widths. The width never drops below
    # ``len(str(num_splits))`` so previously produced file names are unchanged.
    last_idx = start_idx + num_splits - 1
    num_digits = len(str(max(num_splits, last_idx)))

    for idx, part in enumerate(parts, start=start_idx):
        idx_str = str(idx).zfill(num_digits) if pad else str(idx)
        # ``manifest.stem`` only strips the final extension ("cuts.jsonl.gz" ->
        # "cuts.jsonl"); ``with_suffix`` then swaps the remaining last extension
        # for ".{idx}{suffix}", giving e.g. "cuts.0.jsonl.gz".
        part.to_file((output_dir / manifest.stem).with_suffix(f".{idx_str}{suffix}"))
@cli.command()
@click.argument(
    "manifest", type=click.Path(exists=True, dir_okay=False, allow_dash=True)
)
@click.argument("output_dir", type=click.Path(allow_dash=True))
@click.argument("chunk_size", type=int)
@click.option(
    "-i",
    "--start-idx",
    type=int,
    default=0,
    help="Count splits starting from this index.",
)
def split_lazy(
    manifest: Pathlike, output_dir: Pathlike, chunk_size: int, start_idx: int
):
    """
    Load MANIFEST (lazily if in JSONL format) and split it into parts,
    each with CHUNK_SIZE items.
    The parts are saved to separate files with pattern
    "{output_dir}/{manifest.stem}.{chunk_idx}.jsonl.gz".
    Prefer this to "lhotse split" when your manifests are very large.
    """
    # NOTE: the docstring above is the command's help text; keep it user-facing.
    from lhotse.serialization import load_manifest_lazy_or_eager

    destination = Path(output_dir)
    source = Path(manifest)

    # The input file's stem becomes the prefix of every chunk file name.
    split_kwargs = dict(
        output_dir=destination,
        chunk_size=chunk_size,
        prefix=source.stem,
        start_idx=start_idx,
    )
    # The returned list of chunk manifests is not needed by the CLI; the
    # chunks are written to disk as a side effect of the call.
    load_manifest_lazy_or_eager(source).split_lazy(**split_kwargs)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants