From ca4d3c4b3fa5ec4f1329fb8d4f5dbc37ea89456d Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Thu, 14 Dec 2023 12:02:21 -0600 Subject: [PATCH 01/10] add custom gen kwargs and stopping on eos token --- .../in_context_learning_evaluation.py | 52 +++++++++++++------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a7f87d95d1..431a031ffc 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -153,6 +153,7 @@ def __init__( question_prelimiter: str, fewshot_random_seed: int, cot_delimiter: str = '', + generation_kwargs: Optional[dict] = None ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -172,6 +173,7 @@ def __init__( self.padding_side = 'left' self.max_answer_length = 0 fewshot_rng = random.Random(fewshot_random_seed) + self.generation_kwargs = generation_kwargs if generation_kwargs else {} self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, question_prelimiter, fewshot_rng, cot_delimiter) @@ -298,16 +300,19 @@ def collate_fn(self, data): # beginning with `cot_delimiter` cot_delimiter = sample['cot_delimiter'] + generation_kwargs = { + 'pad_token_id': self.pad_tok_id, + 'use_cache': True, + 'eos_token_id': self.tokenizer.eos_token_id + } + generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), 'mode': 'generate', 'labels': answers, 'cot_delimiter': cot_delimiter, 'generation_length': self.max_answer_length, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'use_cache': True - } + 'generation_kwargs': self.generation_kwargs } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) @@ -947,6 +952,8 @@ def __init__( pass_at_k: int = 1, top_p: Optional[float] = 0.95, top_k: Optional[int] = 40, + generation_kwargs: Optional[dict] = None + ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] @@ -958,6 +965,8 @@ def __init__( if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) + self.generation_kwargs = generation_kwargs if generation_kwargs else {} + self.samples = list( dataset.map( lambda examples: { @@ -1089,6 +1098,17 @@ def collate_fn(self, data): test_outputs.append(test_output) languages.append(language) + generation_kwargs = { + 'pad_token_id': self.pad_tok_id, + 'num_beams': 1, # single beam + 'num_return_sequences': self.generations_per_sample, # how many gens per prompt + 'do_sample': True, + 'top_p': self.top_p, + 'top_k': self.top_k, + 'use_cache': True, + 'eos_token_id': self.tokenizer.eos_token_id + } + generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), 'mode': 'generate', @@ -1102,15 +1122,7 @@ def collate_fn(self, data): 'languages': languages, # list of languages 'pass_at_k': self.pass_at_k, 'generation_length': self.max_seq_len - self.max_prompt_length, - 'generation_kwargs': { - 'pad_token_id': self.pad_tok_id, - 'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, # how many gens per prompt - 'do_sample': True, - 'top_p': self.top_p, - 'top_k': self.top_k, - 'use_cache': True, - } + 'generation_kwargs': generation_kwargs } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) 
return batch @@ -1165,6 +1177,7 @@ def build_icl_dataloader( fewshot_random_seed: int = 1234, pass_at_k: int = 1, generations_per_sample: int = 1, + generation_kwargs: Optional[dict] = None ) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, @@ -1216,7 +1229,8 @@ def build_icl_dataloader( destination_path=destination_path, question_prelimiter=question_prelimiter, fewshot_random_seed=fewshot_random_seed, - cot_delimiter=cot_delimiter) + cot_delimiter=cot_delimiter, + generation_kwargs=generation_kwargs) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': dataset = InContextLearningCodeEvalDataset(dataset_uri, @@ -1230,7 +1244,9 @@ def build_icl_dataloader( code_prelimiter=question_prelimiter, fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample) + generations_per_sample=generations_per_sample, + generation_kwargs=generation_kwargs +) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1316,7 +1332,9 @@ def get_icl_task_dataloader( pass_at_k: int = 1, generations_per_sample: int = 1, cot_delimiter: str = '', - has_categories: bool = False) -> Union[DataSpec, Dict[str, DataSpec]]: + has_categories: bool = False, + generation_kwargs: Optional[dict] = None +) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below: >>> dl = get_icl_task_dataloader( @@ -1386,6 +1404,7 @@ def get_icl_task_dataloader( fewshot_random_seed, pass_at_k, generations_per_sample, + generation_kwargs ) return result_dls else: @@ -1406,4 +1425,5 @@ def get_icl_task_dataloader( fewshot_random_seed, pass_at_k, generations_per_sample, + generation_kwargs ) From 1e39623262f12e410347caa49e4d739a05814234 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Thu, 14 Dec 2023 14:05:03 -0600 Subject: [PATCH 02/10] modify test --- composer/datasets/in_context_learning_evaluation.py | 2 +- composer/models/huggingface.py | 1 - tests/datasets/test_in_context_learning_datasets.py | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 431a031ffc..762d2a3af3 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -312,7 +312,7 @@ def collate_fn(self, data): 'labels': answers, 'cot_delimiter': cot_delimiter, 'generation_length': self.max_answer_length, - 'generation_kwargs': self.generation_kwargs + 'generation_kwargs': generation_kwargs } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 8a944e29c2..771dac5f5c 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -411,7 +411,6 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): max_new_tokens=batch['generation_length'], synced_gpus=dist.get_world_size() > 1, **batch.get('generation_kwargs', {})) - # don't remove prefix space to sentencepiece models if len(self.tokenizer(' a', add_special_tokens=False)['input_ids']) == 1: return self.tokenizer.batch_decode(generation[:, batch['input_ids'].shape[1]:], diff --git a/tests/datasets/test_in_context_learning_datasets.py 
b/tests/datasets/test_in_context_learning_datasets.py index 0617cb7847..334077950e 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -545,6 +545,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen - maximum_answer_length) assert batch['mode'] == 'generate' # the maximum generation length from the small test data + assert batch['generation_length'] == maximum_answer_length assert all(item[0] == tokenizer.eos_token_id for item in batch['input_ids']) @@ -559,6 +560,7 @@ def test_qa_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fews for found, expected in zip(batch['labels'], [['David Seville'], ['Skorpio', 'Scorpio']])) assert decoded_batch[0].endswith('Q: Who was the man behind The Chipmunks?\nA:') assert decoded_batch[1].endswith('Q: What star sign is Jamie Lee Curtis?\nA:') + assert 'eos_token_id' in batch['generation_kwargs'] @pytest.mark.parametrize('dataset_uri', ['gsm8k_small.jsonl']) @@ -623,6 +625,7 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, assert decoded_batch[1].endswith( "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step. Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market. #### 18\nQ: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?\nA: Let's think step by step. The discount price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.\nIf every second glass is cheaper, that means Kylar is going to buy 16 / 2 = <<16/2=8>>8 cheaper glasses.\nSo for the cheaper glasses, Kylar is going to pay 8 * 3 = $<<8*3=24>>24.\nAnd for the regular-priced glasses, Kylar will pay 8 * 5 = $<<8*5=40>>40.\nSo in total Kylar needs to pay 24 + 40 = $<<24+40=64>>64 for the glasses he wants to buy. #### 64\nQ: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?\nA: Let's think step by step." 
) + @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl']) From 09af753fe5e296db3015827e4410e32af3298798 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Thu, 14 Dec 2023 14:09:50 -0600 Subject: [PATCH 03/10] modify test --- .../in_context_learning_evaluation.py | 180 +++++++----------- .../test_in_context_learning_datasets.py | 1 - 2 files changed, 72 insertions(+), 109 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 762d2a3af3..436fcce503 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -139,22 +139,20 @@ def _read_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: }) return result - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - question_prelimiter: str, - fewshot_random_seed: int, - cot_delimiter: str = '', - generation_kwargs: Optional[dict] = None - ): + def __init__(self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + question_prelimiter: str, + fewshot_random_seed: int, + cot_delimiter: str = '', + generation_kwargs: Optional[dict] = None): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -301,10 +299,10 @@ def collate_fn(self, data): cot_delimiter = sample['cot_delimiter'] generation_kwargs = { - 'pad_token_id': self.pad_tok_id, - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id - } + 'pad_token_id': self.pad_tok_id, + 'use_cache': True, + 'eos_token_id': self.tokenizer.eos_token_id + } generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), @@ -936,25 +934,22 @@ class InContextLearningCodeEvalDataset(Dataset): top_k: top_k sampling parameter for number of samples to consider """ - def __init__( - self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - destination_path: str, - code_prelimiter: str, - fewshot_random_seed: int, - generations_per_sample: int, - pass_at_k: int = 1, - top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40, - generation_kwargs: Optional[dict] = None - - ): + def __init__(self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + destination_path: str, + code_prelimiter: str, + fewshot_random_seed: int, + generations_per_sample: int, + pass_at_k: int = 1, + top_p: Optional[float] = 0.95, + top_k: Optional[int] = 40, + generation_kwargs: Optional[dict] = None): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -1099,15 +1094,15 @@ def collate_fn(self, data): languages.append(language) generation_kwargs = { - 'pad_token_id': self.pad_tok_id, - 'num_beams': 1, # single beam - 
'num_return_sequences': self.generations_per_sample, # how many gens per prompt - 'do_sample': True, - 'top_p': self.top_p, - 'top_k': self.top_k, - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id - } + 'pad_token_id': self.pad_tok_id, + 'num_beams': 1, # single beam + 'num_return_sequences': self.generations_per_sample, # how many gens per prompt + 'do_sample': True, + 'top_p': self.top_p, + 'top_k': self.top_k, + 'use_cache': True, + 'eos_token_id': self.tokenizer.eos_token_id + } generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), @@ -1161,24 +1156,23 @@ def split_batch(self, batch: Any, microbatch_size: int): def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. '' - destination_path: str, - question_prelimiter: str = '', # e.g. 'Question: ' - cot_delimiter: str = '', - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - generation_kwargs: Optional[dict] = None -) -> DataSpec: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str, # e.g. '' + destination_path: str, + question_prelimiter: str = '', # e.g. 'Question: ' + cot_delimiter: str = '', + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 1, + generation_kwargs: Optional[dict] = None) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, @@ -1245,8 +1239,7 @@ def build_icl_dataloader( fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs -) + generation_kwargs=generation_kwargs) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1333,8 +1326,7 @@ def get_icl_task_dataloader( generations_per_sample: int = 1, cot_delimiter: str = '', has_categories: bool = False, - generation_kwargs: Optional[dict] = None -) -> Union[DataSpec, Dict[str, DataSpec]]: + generation_kwargs: Optional[dict] = None) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. 
An example usage is below: >>> dl = get_icl_task_dataloader( @@ -1387,43 +1379,15 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader( - icl_task_type, - partition_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - partition_uri + '_tmp', - question_prelimiter, - cot_delimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, - generation_kwargs - ) + result_dls[category] = build_icl_dataloader(icl_task_type, partition_uri, tokenizer, batch_size, + max_seq_len, pad_tok_id, num_fewshot, prompt_string, + example_delimiter, continuation_delimiter, + partition_uri + '_tmp', question_prelimiter, cot_delimiter, + fewshot_random_seed, pass_at_k, generations_per_sample, + generation_kwargs) return result_dls else: - return build_icl_dataloader( - icl_task_type, - dataset_uri, - tokenizer, - batch_size, - max_seq_len, - pad_tok_id, - num_fewshot, - prompt_string, - example_delimiter, - continuation_delimiter, - destination_path, - question_prelimiter, - cot_delimiter, - fewshot_random_seed, - pass_at_k, - generations_per_sample, - generation_kwargs - ) + return build_icl_dataloader(icl_task_type, dataset_uri, tokenizer, batch_size, max_seq_len, pad_tok_id, + num_fewshot, prompt_string, example_delimiter, continuation_delimiter, + destination_path, question_prelimiter, cot_delimiter, fewshot_random_seed, + pass_at_k, generations_per_sample, generation_kwargs) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 334077950e..dc8340d2cd 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -625,7 +625,6 @@ def test_qa_task_with_cot_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path, assert decoded_batch[1].endswith( "Q: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nA: Let's think step by step. Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market. #### 18\nQ: Kylar went to the store to buy glasses for his new apartment. One glass costs $5, but every second glass costs only 60% of the price. Kylar wants to buy 16 glasses. How much does he need to pay for them?\nA: Let's think step by step. The discount price of one glass is 60/100 * 5 = $<<60/100*5=3>>3.\nIf every second glass is cheaper, that means Kylar is going to buy 16 / 2 = <<16/2=8>>8 cheaper glasses.\nSo for the cheaper glasses, Kylar is going to pay 8 * 3 = $<<8*3=24>>24.\nAnd for the regular-priced glasses, Kylar will pay 8 * 5 = $<<8*5=40>>40.\nSo in total Kylar needs to pay 24 + 40 = $<<24+40=64>>64 for the glasses he wants to buy. #### 64\nQ: A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?\nA: Let's think step by step." 
) - @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl']) From a3501e9ec6acb976f80ee2e00b08fec4064c4555 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 18 Dec 2023 11:54:14 -0500 Subject: [PATCH 04/10] finish --- .../in_context_learning_evaluation.py | 57 +++++++------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 5614d928e7..badfe37483 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -151,8 +151,7 @@ def __init__(self, destination_path: str, question_prelimiter: str, fewshot_random_seed: int, - cot_delimiter: str = '', - generation_kwargs: Optional[dict] = None): + cot_delimiter: str = ''): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -171,7 +170,6 @@ def __init__(self, self.padding_side = 'left' self.max_answer_length = 0 fewshot_rng = random.Random(fewshot_random_seed) - self.generation_kwargs = generation_kwargs if generation_kwargs else {} self.encoded_dataset = self._prep_examples(num_fewshot, prompt_string, example_delimiter, continuation_delimiter, question_prelimiter, fewshot_rng, cot_delimiter) @@ -298,19 +296,17 @@ def collate_fn(self, data): # beginning with `cot_delimiter` cot_delimiter = sample['cot_delimiter'] - generation_kwargs = { - 'pad_token_id': self.pad_tok_id, - 'use_cache': True, - 'eos_token_id': self.tokenizer.eos_token_id - } - generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), 'mode': 'generate', 'labels': answers, 'cot_delimiter': cot_delimiter, 'generation_length': self.max_answer_length, - 'generation_kwargs': generation_kwargs + 'generation_kwargs': { + 'pad_token_id': self.pad_tok_id, + 'use_cache': True, + 'eos_token_id': self.tokenizer.eos_token_id + } } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) @@ -948,8 +944,7 @@ def __init__(self, generations_per_sample: int, pass_at_k: int = 1, top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40, - generation_kwargs: Optional[dict] = None): + top_k: Optional[int] = 40): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -960,7 +955,6 @@ def __init__(self, if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.generation_kwargs = generation_kwargs if generation_kwargs else {} self.samples = list( dataset.map( @@ -1099,16 +1093,6 @@ def collate_fn(self, data): test_outputs.append(test_output) languages.append(language) - generation_kwargs = { - 'pad_token_id': self.pad_tok_id, - 'num_beams': 1, # single beam - 'num_return_sequences': self.generations_per_sample, # how many gens per prompt - 'do_sample': True, - 'top_p': self.top_p, - 'top_k': self.top_k, - 'use_cache': True, - } - generation_kwargs.update(self.generation_kwargs) batch = { 'input_ids': torch.stack(inputs), 'mode': 'generate', @@ -1121,7 +1105,15 @@ def collate_fn(self, data): 'test_outputs': test_outputs, # list of test outputs 'languages': languages, # list of languages 'pass_at_k': self.pass_at_k, - 'generation_kwargs': generation_kwargs, + 'generation_kwargs': { + 'pad_token_id': self.pad_tok_id, + 'num_beams': 1, # single beam + 'num_return_sequences': self.generations_per_sample, # how many gens per 
prompt + 'do_sample': True, + 'top_p': self.top_p, + 'top_k': self.top_k, + 'use_cache': True, + }, 'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length), } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) @@ -1176,8 +1168,7 @@ def build_icl_dataloader( cot_delimiter: str = '', fewshot_random_seed: int = 1234, pass_at_k: int = 1, - generations_per_sample: int = 1, - generation_kwargs: Optional[dict] = None) -> DataSpec: + generations_per_sample: int = 1) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, @@ -1228,8 +1219,7 @@ def build_icl_dataloader( destination_path=destination_path, question_prelimiter=question_prelimiter, fewshot_random_seed=fewshot_random_seed, - cot_delimiter=cot_delimiter, - generation_kwargs=generation_kwargs) + cot_delimiter=cot_delimiter) effective_batchsize = batch_size elif icl_task_type == 'code_evaluation': dataset = InContextLearningCodeEvalDataset(dataset_uri, @@ -1243,8 +1233,7 @@ def build_icl_dataloader( code_prelimiter=question_prelimiter, fewshot_random_seed=fewshot_random_seed, pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs) + generations_per_sample=generations_per_sample) effective_batchsize = batch_size else: raise Exception(f'Unrecognized ICL task type: {icl_task_type}') @@ -1330,8 +1319,7 @@ def get_icl_task_dataloader( pass_at_k: int = 1, generations_per_sample: int = 1, cot_delimiter: str = '', - has_categories: bool = False, - generation_kwargs: Optional[dict] = None) -> Union[DataSpec, Dict[str, DataSpec]]: + has_categories: bool = False) -> Union[DataSpec, Dict[str, DataSpec]]: """This constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. 
An example usage is below: >>> dl = get_icl_task_dataloader( @@ -1388,11 +1376,10 @@ def get_icl_task_dataloader( max_seq_len, pad_tok_id, num_fewshot, prompt_string, example_delimiter, continuation_delimiter, partition_uri + '_tmp', question_prelimiter, cot_delimiter, - fewshot_random_seed, pass_at_k, generations_per_sample, - generation_kwargs) + fewshot_random_seed, pass_at_k, generations_per_sample) return result_dls else: return build_icl_dataloader(icl_task_type, dataset_uri, tokenizer, batch_size, max_seq_len, pad_tok_id, num_fewshot, prompt_string, example_delimiter, continuation_delimiter, destination_path, question_prelimiter, cot_delimiter, fewshot_random_seed, - pass_at_k, generations_per_sample, generation_kwargs) + pass_at_k, generations_per_sample) From fadce0e5a79b74ea138674f282a439afffd2a6af Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 18 Dec 2023 12:02:57 -0500 Subject: [PATCH 05/10] finish --- .../in_context_learning_evaluation.py | 141 +++++++++++------- 1 file changed, 86 insertions(+), 55 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index badfe37483..a0d34274f1 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -139,19 +139,21 @@ def _read_dataset(self, dataset: Dataset) -> List[Dict[str, str]]: }) return result - def __init__(self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - continuation_delimiter: str, - destination_path: str, - question_prelimiter: str, - fewshot_random_seed: int, - cot_delimiter: str = ''): + def __init__( + self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + continuation_delimiter: str, + destination_path: str, + question_prelimiter: str, + fewshot_random_seed: int, + cot_delimiter: str = '', + ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -930,21 +932,23 @@ class InContextLearningCodeEvalDataset(Dataset): top_k: top_k sampling parameter for number of samples to consider """ - def __init__(self, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, - example_delimiter: str, - destination_path: str, - code_prelimiter: str, - fewshot_random_seed: int, - generations_per_sample: int, - pass_at_k: int = 1, - top_p: Optional[float] = 0.95, - top_k: Optional[int] = 40): + def __init__( + self, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, + example_delimiter: str, + destination_path: str, + code_prelimiter: str, + fewshot_random_seed: int, + generations_per_sample: int, + pass_at_k: int = 1, + top_p: Optional[float] = 0.95, + top_k: Optional[int] = 40, + ): try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -955,7 +959,6 @@ def __init__(self, if dist.get_local_rank() == 0: get_file(dataset_uri, destination_path, overwrite=True) 
dataset = load_dataset('json', data_files=destination_path, split='train', streaming=False) - self.samples = list( dataset.map( lambda examples: { @@ -1105,6 +1108,7 @@ def collate_fn(self, data): 'test_outputs': test_outputs, # list of test outputs 'languages': languages, # list of languages 'pass_at_k': self.pass_at_k, + 'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length), 'generation_kwargs': { 'pad_token_id': self.pad_tok_id, 'num_beams': 1, # single beam @@ -1114,7 +1118,6 @@ def collate_fn(self, data): 'top_k': self.top_k, 'use_cache': True, }, - 'generation_length': min(self.max_answer_length, self.max_seq_len - self.max_prompt_length), } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch @@ -1153,22 +1156,23 @@ def split_batch(self, batch: Any, microbatch_size: int): def build_icl_dataloader( - icl_task_type: str, - dataset_uri: str, - tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], - batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. '' - destination_path: str, - question_prelimiter: str = '', # e.g. 'Question: ' - cot_delimiter: str = '', - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1) -> DataSpec: + icl_task_type: str, + dataset_uri: str, + tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], + batch_size: int, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + prompt_string: str, # e.g. 'translate english to french:' + example_delimiter: str, # e.g. '\n' + continuation_delimiter: str, # e.g. '' + destination_path: str, + question_prelimiter: str = '', # e.g. 
'Question: ' + cot_delimiter: str = '', + fewshot_random_seed: int = 1234, + pass_at_k: int = 1, + generations_per_sample: int = 1, +) -> DataSpec: if icl_task_type == 'multiple_choice': dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, @@ -1372,14 +1376,41 @@ def get_icl_task_dataloader( categories = sorted(output_files.keys()) for category in categories: partition_uri = output_files[category] - result_dls[category] = build_icl_dataloader(icl_task_type, partition_uri, tokenizer, batch_size, - max_seq_len, pad_tok_id, num_fewshot, prompt_string, - example_delimiter, continuation_delimiter, - partition_uri + '_tmp', question_prelimiter, cot_delimiter, - fewshot_random_seed, pass_at_k, generations_per_sample) + result_dls[category] = build_icl_dataloader( + icl_task_type, + partition_uri, + tokenizer, + batch_size, + max_seq_len, + pad_tok_id, + num_fewshot, + prompt_string, + example_delimiter, + continuation_delimiter, + partition_uri + '_tmp', + question_prelimiter, + cot_delimiter, + fewshot_random_seed, + pass_at_k, + generations_per_sample, + ) return result_dls else: - return build_icl_dataloader(icl_task_type, dataset_uri, tokenizer, batch_size, max_seq_len, pad_tok_id, - num_fewshot, prompt_string, example_delimiter, continuation_delimiter, - destination_path, question_prelimiter, cot_delimiter, fewshot_random_seed, - pass_at_k, generations_per_sample) + return build_icl_dataloader( + icl_task_type, + dataset_uri, + tokenizer, + batch_size, + max_seq_len, + pad_tok_id, + num_fewshot, + prompt_string, + example_delimiter, + continuation_delimiter, + destination_path, + question_prelimiter, + cot_delimiter, + fewshot_random_seed, + pass_at_k, + generations_per_sample, + ) From 92157da4967560a80a1ee34696388e31eac7ab17 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 18 Dec 2023 12:09:59 -0500 Subject: [PATCH 06/10] finish --- composer/datasets/in_context_learning_evaluation.py | 3 ++- composer/models/huggingface.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index a0d34274f1..760abc276f 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -1117,7 +1117,8 @@ def collate_fn(self, data): 'top_p': self.top_p, 'top_k': self.top_k, 'use_cache': True, - }, + 'eos_token_id': self.tokenizer.eos_token_id + } } batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 771dac5f5c..6271fbba9c 100644 --- a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -411,6 +411,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): max_new_tokens=batch['generation_length'], synced_gpus=dist.get_world_size() > 1, **batch.get('generation_kwargs', {})) + # don't remove prefix space to sentencepiece models if len(self.tokenizer(' a', add_special_tokens=False)['input_ids']) == 1: return self.tokenizer.batch_decode(generation[:, batch['input_ids'].shape[1]:], From d137bbcd34c1bd1f42b577e544274067fe388948 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Mon, 18 Dec 2023 12:19:05 -0500 Subject: [PATCH 07/10] finish --- composer/models/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/models/huggingface.py b/composer/models/huggingface.py index 6271fbba9c..8a944e29c2 100644 --- 
a/composer/models/huggingface.py +++ b/composer/models/huggingface.py @@ -411,7 +411,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): max_new_tokens=batch['generation_length'], synced_gpus=dist.get_world_size() > 1, **batch.get('generation_kwargs', {})) - + # don't remove prefix space to sentencepiece models if len(self.tokenizer(' a', add_special_tokens=False)['input_ids']) == 1: return self.tokenizer.batch_decode(generation[:, batch['input_ids'].shape[1]:], From 909ed639a0e1151ee5b7d5d5c1c52614f02227bb Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Wed, 20 Dec 2023 12:05:53 -0500 Subject: [PATCH 08/10] finish PR --- composer/datasets/in_context_learning_evaluation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index 760abc276f..fb3efcd4da 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -154,6 +154,8 @@ def __init__( fewshot_random_seed: int, cot_delimiter: str = '', ): + if not hasattr(tokenizer, 'eos_token_id'): + raise ValueError('`InContextLearningQATaskDataset` tokenizer must have `eos_token_id`') try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -949,6 +951,8 @@ def __init__( top_p: Optional[float] = 0.95, top_k: Optional[int] = 40, ): + if not hasattr(tokenizer, 'eos_token_id'): + raise ValueError('`InContextLearningCodeEvalDataset` tokenizer must have `eos_token_id`') try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: From 83a60b73ff9d984da5a6ef005c387216d3331ae8 Mon Sep 17 00:00:00 2001 From: Jeremy Dohmann Date: Wed, 20 Dec 2023 15:24:33 -0500 Subject: [PATCH 09/10] add test --- .../in_context_learning_evaluation.py | 8 ++--- .../test_in_context_learning_datasets.py | 29 +++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py index fb3efcd4da..8ad18f9b1e 100644 --- a/composer/datasets/in_context_learning_evaluation.py +++ b/composer/datasets/in_context_learning_evaluation.py @@ -154,8 +154,8 @@ def __init__( fewshot_random_seed: int, cot_delimiter: str = '', ): - if not hasattr(tokenizer, 'eos_token_id'): - raise ValueError('`InContextLearningQATaskDataset` tokenizer must have `eos_token_id`') + if tokenizer.eos_token_id is None: + raise ValueError('`InContextLearningQATaskDataset` tokenizer must have non-null `eos_token_id`') try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: @@ -951,8 +951,8 @@ def __init__( top_p: Optional[float] = 0.95, top_k: Optional[int] = 40, ): - if not hasattr(tokenizer, 'eos_token_id'): - raise ValueError('`InContextLearningCodeEvalDataset` tokenizer must have `eos_token_id`') + if tokenizer.eos_token_id is None: + raise ValueError('`InContextLearningCodeEvalDataset` tokenizer must have non-null `eos_token_id`') try: from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues] except ImportError as e: diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index 5ea4540844..b16dd90de7 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -510,6 +510,35 @@ def
test_qa_split_batch(tiny_opt_tokenizer, dataset_uri, tmp_path): assert isinstance(split2['generation_kwargs'], dict) +@pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) +@pytest.mark.parametrize('num_fewshot', [0]) +@pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) +def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_path, num_fewshot, prompt_string): + pytest.importorskip('datasets') + + local_data = os.path.join(os.path.dirname(__file__), 'local_data') + + tokenizer = tiny_gpt2_tokenizer + dataset_uri = f'{local_data}/{dataset_uri}' + batch_size = 4 + seqlen = 512 + # empirical number from the small test dataset + tiny_gpt2_tokenizer.eos_token_id = None + with pytest.raises(ValueError): + _ = get_icl_task_dataloader('question_answering', + dataset_uri, + tokenizer, + batch_size, + max_seq_len=seqlen, + pad_tok_id=tokenizer.eos_token_id, + num_fewshot=num_fewshot, + prompt_string=prompt_string, + example_delimiter='\n', + question_prelimiter='Q: ', + continuation_delimiter='\nA:', + destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl')) + + @pytest.mark.parametrize('dataset_uri', ['triviaqa_small.jsonl']) @pytest.mark.parametrize('num_fewshot', [0, 2]) @pytest.mark.parametrize('prompt_string', ['I am a prompt', '']) From 5a34421a03e5b531e27da433c147cb9794cb6367 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 20 Dec 2023 13:15:41 -0800 Subject: [PATCH 10/10] Update tests/datasets/test_in_context_learning_datasets.py --- tests/datasets/test_in_context_learning_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py index b16dd90de7..2e9a461fcf 100644 --- a/tests/datasets/test_in_context_learning_datasets.py +++ b/tests/datasets/test_in_context_learning_datasets.py @@ -522,7 +522,6 @@ def test_qa_task_dataloader_w_null_eos(dataset_uri, tiny_gpt2_tokenizer, tmp_pat dataset_uri = f'{local_data}/{dataset_uri}' batch_size = 4 seqlen = 512 - # empirical number from the small test dataset tiny_gpt2_tokenizer.eos_token_id = None with pytest.raises(ValueError): _ = get_icl_task_dataloader('question_answering',