Error when processing speech_commands dataset #5377

guillaumelorre28 · 2024-04-24T11:26:37Z

/!\ PLEASE INCLUDE THE FULL STACKTRACE AND CODE SNIPPET

Short description
An error occurs when processing the speech_commands dataset.

Environment information

Operating System: macOS
Python version: Python 3.10
tensorflow-datasets/tfds-nightly version: 4.9.4
tensorflow/tf-nightly version: 2.16.1

Reproduction instructions

import tensorflow_datasets as tfds

dataset_name = "speech_commands"
tfds_path = "/Users/glo/tensorflow-datasets"  # Change the path

builder = tfds.builder(dataset_name, data_dir=tfds_path)
builder.download_and_prepare()

Link to logs

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/dataset_builder.py:1584, in GeneratorBasedBuilder._download_and_prepare(self, dl_manager, download_config)
1572 for split_name, generator in utils.tqdm(
1573 split_generators.items(),
1574 desc="Generating splits...",
1575 unit=" splits",
1576 leave=False,
1577 ):
1578 filename_template = naming.ShardedFileTemplate(
1579 split=split_name,
1580 dataset_name=self.name,
1581 data_dir=self.data_path,
1582 filetype_suffix=path_suffix,
1583 )
-> 1584 future = split_builder.submit_split_generation(
1585 split_name=split_name,
1586 generator=generator,
1587 filename_template=filename_template,
1588 disable_shuffling=self.info.disable_shuffling,
1589 )
1590 split_info_futures.append(future)
1592 # Process the result of the beam pipeline.

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:341, in SplitBuilder.submit_split_generation(self, split_name, generator, filename_template, disable_shuffling)
338 # Depending on the type of generator, we use the corresponding
339 # _build_from_xyz method.
340 if isinstance(generator, collections.abc.Iterable):
--> 341 return self._build_from_generator(**build_kwargs)
342 else: # Otherwise, beam required
343 unknown_generator_type = TypeError(
344 f'Invalid split generator value for split {split_name}. '
345 'Expected generator or apache_beam object. Got: '
346 f'{type(generator)}'
347 )

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/core/split_builder.py:406, in SplitBuilder._build_from_generator(self, split_name, generator, filename_template, disable_shuffling)
396 serialized_info = self._features.get_serialized_info()
397 writer = writer_lib.Writer(
398 serializer=example_serializer.ExampleSerializer(serialized_info),
399 filename_template=filename_template,
(...)
404 shard_config=self._shard_config,
405 )
--> 406 for key, example in utils.tqdm(
407 generator,
408 desc=f'Generating {split_name} examples...',
409 unit=' examples',
410 total=total_num_examples,
411 leave=False,
412 mininterval=1.0,
413 ):
414 try:
415 example = self._features.encode_example(example)

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/notebook.py:249, in tqdm_notebook.__iter__(self)
247 try:
248 it = super(tqdm_notebook, self).__iter__()
--> 249 for obj in it:
250 # return super(tqdm...) will not catch exception
251 yield obj
252 # NB: except ... [ as ...] breaks IPython async KeyboardInterrupt

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tqdm/std.py:1182, in tqdm.__iter__(self)
1179 time = self._time
1181 try:
-> 1182 for obj in iterable:
1183 yield obj
1184 # Update and possibly print the progressbar.
1185 # Note: does not call self.update(1) for speed optimisation.

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/tensorflow_datasets/datasets/speech_commands/speech_commands_dataset_builder.py:138, in Builder._generate_examples(self, archive, file_list)
134 else:
135 try:
136 example = {
137 'audio': np.array(
--> 138 lazy_imports_lib.lazy_imports.pydub.AudioSegment.from_file(
139 file_obj, format='wav'
140 ).get_array_of_samples()
141 ),
142 'label': label,
143 }
144 yield example_id, example
145 except (
146 lazy_imports_lib.lazy_imports.pydub.exceptions.CouldntDecodeError
147 ):

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/audio_segment.py:728, in AudioSegment.from_file(cls, file, format, codec, parameters, start_second, duration, **kwargs)
726 info = None
727 else:
--> 728 info = mediainfo_json(orig_file, read_ahead_limit=read_ahead_limit)
729 if info:
730 audio_streams = [x for x in info['streams']
731 if x['codec_type'] == 'audio']

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/site-packages/pydub/utils.py:279, in mediainfo_json(filepath, read_ahead_limit)
276 output = output.decode("utf-8", 'ignore')
277 stderr = stderr.decode("utf-8", 'ignore')
--> 279 info = json.loads(output)
281 if not info:
282 # If ffprobe didn't give any information, just return it
283 # (for example, because the file doesn't exist)
284 return info

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/__init__.py:346, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
341 s = s.decode(detect_encoding(s), 'surrogatepass')
343 if (cls is None and object_hook is None and
344 parse_int is None and parse_float is None and
345 parse_constant is None and object_pairs_hook is None and not kw):
--> 346 return _default_decoder.decode(s)
347 if cls is None:
348 cls = JSONDecoder

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:337, in JSONDecoder.decode(self, s, _w)
332 def decode(self, s, _w=WHITESPACE.match):
333 """Return the Python representation of s (a str instance
334 containing a JSON document).
335
336 """
--> 337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
338 end = _w(s, end).end()
339 if end != len(s):

File ~/opt/anaconda3/envs/tensorflow_latest/lib/python3.10/json/decoder.py:355, in JSONDecoder.raw_decode(self, s, idx)
353 obj, end = self.scan_once(s, idx)
354 except StopIteration as err:
--> 355 raise JSONDecodeError("Expecting value", s, err.value) from None
356 return obj, end

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

Expected behavior
No error during the processing of the speech_commands dataset.

The text was updated successfully, but these errors were encountered:

fylux · 2024-04-29T07:53:58Z

Did you also get this error without a custom data dir? I cannot reproduce the error when running the following in Colab:

import tensorflow_datasets as tfds

dataset_name = "speech_commands"

builder = tfds.builder(dataset_name)
builder.download_and_prepare()

guillaumelorre28 added the bug Something isn't working label Apr 24, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Error when processing speech_commands dataset #5377

Error when processing speech_commands dataset #5377

guillaumelorre28 commented Apr 24, 2024

fylux commented Apr 29, 2024

Error when processing speech_commands dataset #5377

Error when processing speech_commands dataset #5377

Comments

guillaumelorre28 commented Apr 24, 2024

fylux commented Apr 29, 2024