diff --git a/VERSION.txt b/VERSION.txt
index 7168dea..46d9844 100644
--- a/VERSION.txt
+++ b/VERSION.txt
@@ -1 +1 @@
-2.0.25
\ No newline at end of file
+2.0.26
\ No newline at end of file
diff --git a/fast_bert/data_lm.py b/fast_bert/data_lm.py
index 4a1665e..c9c93d4 100644
--- a/fast_bert/data_lm.py
+++ b/fast_bert/data_lm.py
@@ -21,7 +21,7 @@
     Dataset,
 )
 from torch.utils.data.distributed import DistributedSampler
-import spacy
+
 from tqdm import tqdm, trange
 from fastprogress.fastprogress import master_bar, progress_bar
 
@@ -73,8 +73,6 @@
 
 def create_corpus(text_list, target_path, logger=None):
 
-    # nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner", "textcat"])
-
     with open(target_path, "w") as f:
         # Split sentences for each document
         logger.info("Formatting corpus for {}".format(target_path))
@@ -317,7 +315,7 @@ def __init__(
 
     # Mask tokens
    def mask_tokens(self, inputs, mlm_probability=0.15):
-        """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
+        """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original."""
         labels = inputs.clone()
         # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability defaults to 0.15 in Bert/RoBERTa)
diff --git a/requirements.txt b/requirements.txt
index b6e5b99..983a94a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,6 @@ pytorch-lamb
 tensorboardX
 fastprogress
 scikit-learn
-spacy
 seqeval
 transformers==4.22.*
 pandas
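
Reviewer note, not part of the patch: the docstring touched in the last data_lm.py hunk names BERT's standard 80/10/10 masking scheme. Below is a minimal standalone sketch of that scheme, assuming a Hugging Face transformers tokenizer (get_special_tokens_mask, mask_token_id); it mirrors the shape of mask_tokens in data_lm.py but is illustrative, not the file's exact code.

import torch

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Illustrative 80% [MASK] / 10% random / 10% original MLM masking."""
    labels = inputs.clone()

    # Pick positions to corrupt with probability mlm_probability,
    # never selecting special tokens such as [CLS]/[SEP].
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(seq, already_has_special_tokens=True)
        for seq in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # loss ignores non-masked positions

    # 80% of selected positions: replace the input token with [MASK].
    indices_replaced = (
        torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    )
    inputs[indices_replaced] = tokenizer.mask_token_id

    # Half of the remaining 20% (i.e. 10% overall): replace with a random token.
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The final 10% keep their original token.
    return inputs, labels

Keeping 10% of the selected positions unchanged and corrupting 10% with random tokens, rather than always writing [MASK], forces the model to maintain a useful contextual representation for every input token, since it cannot tell at training time which positions were corrupted.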