Error: DatasetGenerationError: An error occurred while generating the dataset #25

hissain opened this issue Jun 8, 2024 · 1 comment


hissain commented Jun 8, 2024

What have I done?

How can I fix the code that produced the following error?


TypeError                                 Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/datasets/, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1866     writer = writer_class(
   1867         features=writer._features,
   1868         path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
   1871         embed_local_files=embed_local_files,
   1872     )
-> 1873 writer.write_table(table)
   1874 num_examples_progress_update += len(table)

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in ArrowWriter.write_table(self, pa_table, writer_batch_size)
    567 pa_table = pa_table.combine_chunks()
--> 568 pa_table = table_cast(pa_table, self._schema)
    569 if self.embed_local_files:

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in table_cast(table, schema)
   2289 if table.schema != schema:
-> 2290     return cast_table_to_schema(table, schema)
   2291 elif table.schema.metadata != schema.metadata:

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in cast_table_to_schema(table, schema)
   2248     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2250 return pa.Table.from_arrays(arrays, schema=schema)

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in <listcomp>(.0)
   2248     raise ValueError(f"Couldn't cast\n{table.schema}\nto\n{features}\nbecause column names don't match")
-> 2249 arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
   2250 return pa.Table.from_arrays(arrays, schema=schema)

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1816 if isinstance(array, pa.ChunkedArray):
-> 1817     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1818 else:

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in <listcomp>(.0)
   1816 if isinstance(array, pa.ChunkedArray):
-> 1817     return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
   1818 else:

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in cast_array_to_feature(array, feature, allow_number_to_str)
   2108 elif not isinstance(feature, (Sequence, dict, list, tuple)):
-> 2109     return array_cast(array, feature(), allow_number_to_str=allow_number_to_str)
   2110 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in _wrap_for_chunked_arrays.<locals>.wrapper(array, *args, **kwargs)
   1818 else:
-> 1819     return func(array, *args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in array_cast(array, pa_type, allow_number_to_str)
   1999     return array.cast(pa_type)
-> 2000 raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")

TypeError: Couldn't cast array of type
struct<ca: string, en: string>
to
struct<ca: string, de: string>

The above exception was the direct cause of the following exception:

DatasetGenerationError                    Traceback (most recent call last)
Cell In[1], line 9
      5 cfg['num_epochs'] = 1
      7 from train import train_model
----> 9 train_model(cfg)

File ~/git/github/pytorch-transformer/, in train_model(config)
    195 # Make sure the weights folder exists
    196 Path(f"{config['datasource']}_{config['model_folder']}").mkdir(parents=True, exist_ok=True)
--> 198 train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    199 model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    200 # Tensorboard

File ~/git/github/pytorch-transformer/, in get_ds(config)
    141 def get_ds(config):
    142     # It only has the train split, so we divide it overselves
--> 143     ds_raw = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='train')
    145     # Build tokenizers
    146     tokenizer_src = get_or_build_tokenizer(config, ds_raw, config['lang_src'])

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, storage_options, **config_kwargs)
   1794 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
   1796 # Download and prepare data
-> 1797 builder_instance.download_and_prepare(
   1798     download_config=download_config,
   1799     download_mode=download_mode,
   1800     verification_mode=verification_mode,
   1801     try_from_hf_gcs=try_from_hf_gcs,
   1802     num_proc=num_proc,
   1803     storage_options=storage_options,
   1804 )
   1806 # Build dataset for splits
   1807 keep_in_memory = (
   1808     keep_in_memory if keep_in_memory is not None else is_small_dataset(
   1809 )

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
    888     if num_proc is not None:
    889         prepare_split_kwargs["num_proc"] = num_proc
--> 890     self._download_and_prepare(
    891         dl_manager=dl_manager,
    892         verification_mode=verification_mode,
    893         **prepare_split_kwargs,
    894         **download_and_prepare_kwargs,
    895     )
    896 # Sync info
    897 = sum(split.num_bytes for split in

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
    981 split_dict.add(split_generator.split_info)
    983 try:
    984     # Prepare split will record examples associated to the split
--> 985     self._prepare_split(split_generator, **prepare_split_kwargs)
    986 except OSError as e:
    987     raise OSError(
    988         "Cannot find data file. "
    989         + (self.manual_download_instructions or "")
    990         + "\nOriginal error:\n"
    991         + str(e)
    992     ) from None

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
   1744 job_id = 0
   1745 with pbar:
-> 1746     for job_id, done, content in self._prepare_split_single(
   1747         gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
   1748     ):
   1749         if done:
   1750             result = content

File ~/anaconda3/lib/python3.11/site-packages/datasets/, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
   1889     if isinstance(e, SchemaInferenceError) and e.__context__ is not None:
   1890         e = e.__context__
-> 1891     raise DatasetGenerationError("An error occurred while generating the dataset") from e
   1893 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)

DatasetGenerationError: An error occurred while generating the dataset

My '!pip list' output in Jupyter Notebook

SleepTight-Cat commented Dec 11, 2024

Hi, if you do
and get only
then this may be caused by using an old version of datasets.
For me the code fails with datasets-2.13.2, yet works with datasets-3.2.0. So you may want to consider updating your datasets package, as I saw that you have datasets-2.12.0.

You may also refer to this post which is basically just me asking the same question in the Huggingface forum.

Hope these help!

