-
Notifications
You must be signed in to change notification settings - Fork 287
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update documentation and example scripts
- Loading branch information
Showing
15 changed files
with
258 additions
and
24 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=2
#SBATCH --gpus-per-task=1

# Example Slurm evaluation script.
# Notes:
# - VQAv2 test-dev and test-std annotations are not publicly available.
#   To evaluate on these splits, please follow the VQAv2 instructions and submit to EvalAI.
#   This script will evaluate on the val split.
# - Vizwiz test-dev annotations are also not publicly available; please go through EvalAI.

export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0

# Describe the Slurm allocation for the launched job:
#   HOSTNAMES   - every host in the job's node list
#   MASTER_ADDR - the first host in that list
#   COUNT_NODE  - number of hosts in that list
# NOTE(review): presumably read by the launched process for distributed
# rendezvous - confirm against evaluate.py.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# Pick a random unprivileged port. The full 0-65535 range could yield port 0 or
# a port below 1024, which a non-root process cannot bind.
export MASTER_PORT=$(shuf -i 10000-65535 -n 1)

# Unquoted on purpose: word splitting prints the values on a single line.
# shellcheck disable=SC2086
echo go $COUNT_NODE
# shellcheck disable=SC2086
echo $HOSTNAMES

export PYTHONPATH="$PYTHONPATH:open_flamingo"

# Launch the evaluation through the DeepSpeed launcher.
# A bare `srun --cpu_bind=v --accel-bind=gn python` line used to precede this
# command; it had no script argument and no line continuation, so it only
# started script-less interpreters before the real run and has been removed.
# NOTE(review): if evaluation should be started via srun instead, replace
# `deepspeed` below with `srun --cpu_bind=v --accel-bind=gn python`.
deepspeed open_flamingo/open_flamingo/eval/evaluate.py \
  --vision_encoder_path ViT-L-14 \
  --vision_encoder_pretrained openai \
  --lm_path anas-awadalla/mpt-1b-redpajama-200b \
  --tokenizer_path anas-awadalla/mpt-1b-redpajama-200b \
  --cross_attn_every_n_layers 1 \
  --checkpoint_path "openflamingo/OpenFlamingo-3B-vitl-mpt1b/checkpoint.pt" \
  --results_file "results.json" \
  --precision fp32 \
  --batch_size 8 \
  --deepspeed \
  --eval_coco \
  --eval_vqav2 \
  --eval_flickr30 \
  --eval_ok_vqa \
  --eval_textvqa \
  --eval_vizwiz \
  --eval_hateful_memes \
  --coco_train_image_dir_path "/path/to/mscoco_karpathy/train2014" \
  --coco_val_image_dir_path "/path/to/mscoco_karpathy/val2014" \
  --coco_karpathy_json_path "/path/to/mscoco_karpathy/dataset_coco.json" \
  --coco_annotations_json_path "/path/to/mscoco_karpathy/annotations/captions_val2014.json" \
  --vqav2_train_image_dir_path "/path/to/vqav2/train2014" \
  --vqav2_train_annotations_json_path "/path/to/vqav2/v2_mscoco_train2014_annotations.json" \
  --vqav2_train_questions_json_path "/path/to/vqav2/v2_OpenEnded_mscoco_train2014_questions.json" \
  --vqav2_test_image_dir_path "/path/to/vqav2/val2014" \
  --vqav2_test_annotations_json_path "/path/to/vqav2/v2_mscoco_val2014_annotations.json" \
  --vqav2_test_questions_json_path "/path/to/vqav2/v2_OpenEnded_mscoco_val2014_questions.json" \
  --flickr_image_dir_path "/path/to/flickr30k/flickr30k-images" \
  --flickr_karpathy_json_path "/path/to/flickr30k/dataset_flickr30k.json" \
  --flickr_annotations_json_path "/path/to/flickr30k/dataset_flickr30k_coco_style.json" \
  --ok_vqa_train_image_dir_path "/path/to/okvqa/train2014" \
  --ok_vqa_train_annotations_json_path "/path/to/okvqa/mscoco_train2014_annotations.json" \
  --ok_vqa_train_questions_json_path "/path/to/okvqa/OpenEnded_mscoco_train2014_questions.json" \
  --ok_vqa_test_image_dir_path "/path/to/okvqa/val2014" \
  --ok_vqa_test_annotations_json_path "/path/to/okvqa/mscoco_val2014_annotations.json" \
  --ok_vqa_test_questions_json_path "/path/to/okvqa/OpenEnded_mscoco_val2014_questions.json" \
  --textvqa_image_dir_path "/path/to/textvqa/train_images/" \
  --textvqa_train_questions_json_path "/path/to/textvqa/train_questions_vqa_format.json" \
  --textvqa_train_annotations_json_path "/path/to/textvqa/train_annotations_vqa_format.json" \
  --textvqa_test_questions_json_path "/path/to/textvqa/val_questions_vqa_format.json" \
  --textvqa_test_annotations_json_path "/path/to/textvqa/val_annotations_vqa_format.json" \
  --vizwiz_train_image_dir_path "/path/to/v7w/train" \
  --vizwiz_test_image_dir_path "/path/to/v7w/val" \
  --vizwiz_train_questions_json_path "/path/to/v7w/train_questions_vqa_format.json" \
  --vizwiz_train_annotations_json_path "/path/to/v7w/train_annotations_vqa_format.json" \
  --vizwiz_test_questions_json_path "/path/to/v7w/val_questions_vqa_format.json" \
  --vizwiz_test_annotations_json_path "/path/to/v7w/val_annotations_vqa_format.json" \
  --hateful_memes_image_dir_path "/path/to/hateful_memes/img" \
  --hateful_memes_train_annotations_json_path "/path/to/hateful_memes/train.json" \
  --hateful_memes_test_annotations_json_path "/path/to/hateful_memes/dev.json"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-task=1
#SBATCH --time=5-00:00:00
#SBATCH --job-name=openflamingo

# Example Slurm training script: starts train.py through srun using the
# resources requested in the #SBATCH directives above.

export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0

# Describe the Slurm allocation for the launched job:
#   HOSTNAMES   - every host in the job's node list
#   MASTER_ADDR - the first host in that list
#   COUNT_NODE  - number of hosts in that list
# NOTE(review): presumably read by train.py for distributed rendezvous -
# confirm against the training code.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# Pick a random unprivileged port. The full 0-65535 range could yield port 0 or
# a port below 1024, which a non-root process cannot bind.
export MASTER_PORT=$(shuf -i 10000-65535 -n 1)

export PYTHONPATH="$PYTHONPATH:open_flamingo"

# The shard patterns are quoted so the shell passes the {a..b} brace range
# through literally rather than expanding it into several arguments.
# NOTE(review): presumably the data loader expands the range itself - confirm.
srun --cpu_bind=v --accel-bind=gn python open_flamingo/open_flamingo/train/train.py \
  --lm_path meta-llama/Llama-2-13b \
  --tokenizer_path meta-llama/Llama-2-13b \
  --model_family flamingo \
  --cross_attn_every_n_layers 4 \
  --dataset_resampled \
  --batch_size_mmc4 16 \
  --batch_size_laion 32 \
  --train_num_samples_mmc4 125000 \
  --train_num_samples_laion 250000 \
  --loss_multiplier_laion 0.2 \
  --workers=4 \
  --run_name "fsdp" \
  --num_epochs 480 \
  --warmup_steps 0 \
  --mmc4_textsim_threshold 0.0 \
  --laion_shards "/path/to/laion-samples/{000000..000001}.tar" \
  --mmc4_shards "/path/to/mmc4-samples/{000000..000001}.tar" \
  --report_to_wandb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash
#SBATCH --nodes 1
#SBATCH --ntasks-per-node=8
#SBATCH --gpus-per-task=1
#SBATCH --time=5-00:00:00
#SBATCH --job-name=openflamingo

# Example Slurm training script with FSDP enabled (--fsdp, hybrid sharding).
# To use FSDP, please make sure to use Pytorch Nightly > 2.0.1!

export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0

# Describe the Slurm allocation for the launched job:
#   HOSTNAMES   - every host in the job's node list
#   MASTER_ADDR - the first host in that list
#   COUNT_NODE  - number of hosts in that list
# NOTE(review): presumably read by train.py for distributed rendezvous -
# confirm against the training code.
export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# Pick a random unprivileged port. The full 0-65535 range could yield port 0 or
# a port below 1024, which a non-root process cannot bind.
export MASTER_PORT=$(shuf -i 10000-65535 -n 1)

export PYTHONPATH="$PYTHONPATH:open_flamingo"

# The shard patterns are quoted so the shell passes the {a..b} brace range
# through literally rather than expanding it into several arguments.
# NOTE(review): presumably the data loader expands the range itself - confirm.
srun --cpu_bind=v --accel-bind=gn python open_flamingo/open_flamingo/train/train.py \
  --lm_path meta-llama/Llama-2-13b \
  --tokenizer_path meta-llama/Llama-2-13b \
  --model_family flamingo \
  --cross_attn_every_n_layers 4 \
  --dataset_resampled \
  --batch_size_mmc4 16 \
  --batch_size_laion 32 \
  --fsdp \
  --fsdp_sharding_strategy hybrid \
  --train_num_samples_mmc4 125000 \
  --train_num_samples_laion 250000 \
  --loss_multiplier_laion 0.2 \
  --workers=4 \
  --run_name "fsdp" \
  --num_epochs 480 \
  --warmup_steps 0 \
  --mmc4_textsim_threshold 0.0 \
  --laion_shards "/path/to/laion-samples/{000000..000001}.tar" \
  --mmc4_shards "/path/to/mmc4-samples/{000000..000001}.tar" \
  --report_to_wandb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# OpenFlamingo: Modeling | ||
We provide modules to mix-and-match into several vision-language model architectures. | ||
|
||
## What is a VLM? | ||
A **vision-language model (VLM)** is a language model capable of processing a sequence of arbitrarily interleaved images/videos with text to output text. | ||
|
||
![A VLM takes in a sequence of interleaved images/videos with text and outputs text.](../../docs/signature.png) | ||
|
||
The forward signature of a VLM is as follows: | ||
|
||
* `vision_x`: The batch of images / videos to process. This is a tensor of the shape `(B, T_img, F, C, H, W)`, where `B` is the batch dimension, `T_img` collates the images/videos within one input sequence, `F` collates frames within a video, and `(C, H, W)` are the channel, height, and width dimensions respectively. | ||
* `lang_x`: The batch of input_ids (text) to process. This is a tensor of the shape `(B, T_txt)`, where `T_txt` is the number of text tokens within one input sequence. | ||
|
||
To explain to the model how to interleave the image/text elements within a sequence, `lang_x` should include `<image>` tokens ("media tokens") that specify where the images/videos are placed. (See the figure below.) | ||
|
||
![Illustration of what the inputs to a VLM look like.](../../docs/inputs.png) | ||
|
||
|
||
## VLM modeling with the open_flamingo repository | ||
This repository provides modules for constructing various VLM architectures. | ||
|
||
All models inherit from the `VLM` (vision-language model) class defined in `src/vlm.py`. As documented there, a VLM is defined by four component modules: | ||
1. A **vision encoder** that extracts features from pixels (e.g. CLIP). This module should take in vision inputs of the shape `(B, T_img, F, C, H, W)` and output features of the shape `(B, T_img, F, v, d)`. | ||
2. A **vision tokenizer** that converts features from the vision encoder into token-like embeddings (e.g. PerceiverResampler). This module should take in vision features of the shape `(B, T_img, F, v, d)` and output tokens of the shape `(B, T_img, n, d)`. | ||
3. A fusion method that allows the language model to attend to these tokens, e.g. cross-attention (as done in [Flamingo](https://arxiv.org/abs/2204.14198)), or placing the tokens directly in the language model's input sequence (as done in [Kosmos](https://arxiv.org/abs/2306.14824)). | ||
4. A language model. | ||
|
||
This repository allows us to construct architectures by mixing-and-matching options for all four kinds of modules. | ||
|
||
### Supported vision encoders | ||
All CLIP-style encoders from the [OpenCLIP](https://github.com/mlfoundations/open_clip) library are supported. This includes OpenAI's models. | ||
|
||
### Supported vision tokenizers | ||
* [Perceiver Resampler](https://arxiv.org/abs/2103.03206) | ||
* [Q-former](https://arxiv.org/abs/2301.12597) | ||
* Linear projection | ||
|
||
### Supported fusion methods | ||
Models are further split into those that inherit from `VLMWithCrossAttention` (dense cross attention to fuse vision + language, Flamingo-style) vs. `VLMWithLanguageStream` (insert vision tokens into the language stream, Kosmos-style). | ||
|
||
![A VLM with cross attention and a VLM with language stream represent two methods for fusing the vision and language inputs.](../../docs/xattn_langstream.png) | ||
|
||
### Supported language models | ||
All autoregressive language models from [Huggingface Transformers](https://huggingface.co/models) are supported. | ||
|
||
## Example architectures | ||
Using these modules, the following architectures are implemented as examples. | ||
|
||
|Model|Vision tokenizer|Fusion method|Trainable parameters| | ||
|----|------------|------------|------------| | ||
|[Flamingo](https://arxiv.org/abs/2204.14198)|Perceiver|Cross attention|Added language model embeddings, vision tokenizer| | ||
|[Kosmos](https://arxiv.org/abs/2306.14824)|Perceiver|Language stream|Everything except the vision encoder| | ||
|[BLIP](https://arxiv.org/abs/2301.12597)|Q-former|Language stream|Added language model embeddings, vision tokenizer| | ||
|
||
We welcome contributions! If you'd like to add additional vision tokenizers, fusion methods, or model types, please open a PR. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,9 +5,7 @@ inflection | |
pycocoevalcap | ||
pycocotools | ||
tqdm | ||
|
||
black | ||
mypy | ||
pylint | ||
pytest | ||
requests | ||
requests |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,3 +3,4 @@ braceexpand | |
webdataset | ||
tqdm | ||
wandb | ||
deepspeed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
einops | ||
einops-exts | ||
transformers>=4.28.1 | ||
torch==2.0.1 | ||
torch>=2.0.1 | ||
pillow | ||
open_clip_torch>=2.16.0 | ||
sentencepiece==0.1.98 |