Add multidataset #1010

Merged: 22 commits, Apr 21, 2023
26 changes: 21 additions & 5 deletions egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -35,7 +35,7 @@
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor
from icefall.utils import get_executor, str2bool

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
@@ -61,12 +61,20 @@ def get_args():
help="""Dataset parts to compute fbank. If None, we will use all""",
)

parser.add_argument(
"--perturb-speed",
type=str2bool,
default=True,
Collaborator: Please use str2bool.

Author: ok

help="""Perturb speed with factor 0.9 and 1.1 on train subset.""",
)

return parser.parse_args()


def compute_fbank_librispeech(
bpe_model: Optional[str] = None,
dataset: Optional[str] = None,
perturb_speed: bool = True,
):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
@@ -125,9 +133,13 @@ def compute_fbank_librispeech(
if "train" in partition:
if bpe_model:
cut_set = filter_cuts(cut_set, sp)
cut_set = (
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
)
if perturb_speed:
logging.info(f"Doing speed perturb")
cut_set = (
Collaborator: Please add a log saying it is doing speed perturb.

Author: ok

cut_set
+ cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1)
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
@@ -145,4 +157,8 @@ def compute_fbank_librispeech(
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
logging.info(vars(args))
compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset)
compute_fbank_librispeech(
bpe_model=args.bpe_model,
dataset=args.dataset,
perturb_speed=args.perturb_speed,
)
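Aside: the new --perturb-speed flag goes through icefall's str2bool, so it can be toggled from the command line. A minimal usage sketch, assuming str2bool accepts the usual boolean spellings such as "true"/"false" (the exact accepted values are defined in icefall.utils):

# Hypothetical invocations of the updated script.
# Skip speed perturbation entirely:
./local/compute_fbank_librispeech.py --perturb-speed false

# Default behavior: add 0.9x and 1.1x speed-perturbed copies of the train
# cuts, here restricted to a single dataset part via --dataset:
./local/compute_fbank_librispeech.py --perturb-speed true --dataset train-clean-100

Since speed perturbation triples the number of train cuts, disabling it is mainly useful for quick experiments or when augmentation is handled elsewhere.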
117 changes: 117 additions & 0 deletions egs/librispeech/ASR/prepare_common_voice.sh
@@ -0,0 +1,117 @@
#!/usr/bin/env bash

set -eou pipefail

nj=16
stage=-1
stop_stage=100

# Split the train set in data/${lang} into this number of pieces.
# This is to avoid OOM during feature extraction.
num_splits=1000

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
# - $dl_dir/$release/$lang
# This directory contains the following files downloaded from
# https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
#
# - clips
# - dev.tsv
# - invalidated.tsv
# - other.tsv
# - reported.tsv
# - test.tsv
# - train.tsv
# - validated.tsv

dl_dir=$PWD/download
release=cv-corpus-13.0-2023-03-09
lang=en

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data/${lang}".
# You can safely remove "data/${lang}" and rerun this script to regenerate it.
mkdir -p data/${lang}

log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"

# If you have pre-downloaded it to /path/to/$release,
# you can create a symlink
#
# ln -sfv /path/to/$release $dl_dir/$release
#
if [ ! -d $dl_dir/$release/$lang/clips ]; then
lhotse download commonvoice --languages $lang --release $release $dl_dir
fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare CommonVoice manifest"
# We assume that you have downloaded the CommonVoice corpus
# to $dl_dir/$release
mkdir -p data/${lang}/manifests
if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then
lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests
touch data/${lang}/manifests/.cv-${lang}.done
fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Preprocess CommonVoice manifest"
if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then
./local/preprocess_commonvoice.py --language $lang
touch data/${lang}/fbank/.preprocess_complete
fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute fbank for dev and test subsets of CommonVoice"
mkdir -p data/${lang}/fbank
if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then
./local/compute_fbank_commonvoice_dev_test.py --language $lang
touch data/${lang}/fbank/.cv-${lang}_dev_test.done
fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split train subset into ${num_splits} pieces"
split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}
if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then
lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir
touch $split_dir/.cv-${lang}_train_split.done
fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute features for train subset of CommonVoice"
if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then
./local/compute_fbank_commonvoice_splits.py \
--num-workers $nj \
--batch-duration 600 \
--start 0 \
--num-splits $num_splits \
--language $lang
touch data/${lang}/fbank/.cv-${lang}_train.done
fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Combine features for train"
if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then
pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz")
lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz
fi
fi
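A sketch of how this script might be invoked, assuming shared/parse_options.sh maps command-line flags like --lang and --stop-stage onto the variables defined above (the usual icefall/Kaldi convention; the flag spellings here are illustrative):

# Run the full CommonVoice pipeline for French.
./prepare_common_voice.sh --lang fr --stage 0 --stop-stage 6

# Re-run only train-subset feature extraction (stage 5) with more workers.
./prepare_common_voice.sh --lang fr --stage 5 --stop-stage 5 --nj 32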
61 changes: 39 additions & 22 deletions egs/librispeech/ASR/prepare_giga_speech.sh
@@ -95,48 +95,65 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
# We assume that you have downloaded the GigaSpeech corpus
# to $dl_dir/GigaSpeech
mkdir -p data/manifests
lhotse prepare gigaspeech \
--subset XL \
--subset L \
--subset M \
--subset S \
--subset XS \
--subset DEV \
--subset TEST \
-j $nj \
$dl_dir/GigaSpeech data/manifests
if [ ! -f data/manifests/.gigaspeech.done ]; then
mkdir -p data/manifests
lhotse prepare gigaspeech \
--subset XL \
--subset L \
--subset M \
--subset S \
--subset XS \
--subset DEV \
--subset TEST \
-j $nj \
$dl_dir/GigaSpeech data/manifests
touch data/manifests/.gigaspeech.done
fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Preprocess GigaSpeech manifest"
if [ ! -f data/fbank/.preprocess_complete ]; then
log "It may take 2 hours for this stage"
python3 ./local/preprocess_gigaspeech.py
touch data/fbank/.preprocess_complete
if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then
log "It may take 2 hours for this stage"
./local/preprocess_gigaspeech.py
touch data/fbank/.gigaspeech_preprocess.done
fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
python3 ./local/compute_fbank_gigaspeech_dev_test.py
if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then
./local/compute_fbank_gigaspeech_dev_test.py
touch data/fbank/.gigaspeech_dev_test.done
fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then
if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed
touch $split_dir/.gigaspeech_XL_split.done
fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute features for XL"
# Note: The script supports --start and --stop options.
# You can use several machines to compute the features in parallel.
python3 ./local/compute_fbank_gigaspeech_splits.py \
--num-workers $nj \
--batch-duration 600 \
--num-splits $num_splits
if [ ! -f data/fbank/.gigaspeech_XL.done ]; then
./local/compute_fbank_gigaspeech_splits.py \
--num-workers $nj \
--batch-duration 600 \
--num-splits $num_splits
touch data/fbank/.gigaspeech_XL.done
fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Combine features for XL (may take 15 hours)"
if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then
pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz")
lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz
fi
fi
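As the stage-5 comment notes, compute_fbank_gigaspeech_splits.py also supports --start and --stop, so the XL pieces can be processed on several machines in parallel. A hedged sketch, assuming --start/--stop index the split pieces as a half-open range [start, stop):

# Machine 1: process the first half of the split pieces.
./local/compute_fbank_gigaspeech_splits.py \
  --num-workers $nj --batch-duration 600 \
  --num-splits $num_splits --start 0 --stop $((num_splits / 2))

# Machine 2: process the second half.
./local/compute_fbank_gigaspeech_splits.py \
  --num-workers $nj --batch-duration 600 \
  --num-splits $num_splits --start $((num_splits / 2)) --stop $num_splits

Stage 6 (lhotse combine) should only run after every machine has finished its range.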