Commit

support deepseekvl
kennymckormick committed Mar 20, 2024
1 parent f38fd35 commit 2b573fb
Showing 4 changed files with 142 additions and 31 deletions.
89 changes: 58 additions & 31 deletions vlmeval/config.py
@@ -9,46 +9,20 @@
OmniLMM_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

models = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
ungrouped = {
'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'flamingov2_fs': partial(OpenFlamingo, name='v2', with_context=True, mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
'idefics_9b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct'),
'idefics_9b_instruct_fs': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct', with_context=True),
'idefics_80b_instruct_fs': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct', with_context=True),
'llava_v1.5_7b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_pth=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-13B'),
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
'cogvlm-grounding-generalist':partial(CogVlm, name='cogvlm-grounding-generalist',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
'cogvlm-chat':partial(CogVlm, name='cogvlm-chat',tokenizer_name ='lmsys/vicuna-7b-v1.5'),
'sharedcaptioner':partial(SharedCaptioner, model_path='Lin-Chen/ShareCaptioner'),
'emu2':partial(Emu, name='emu2'),
'emu2_chat':partial(Emu, name='emu2_chat'),
'monkey':partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat':partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
'Yi_VL_6B':partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B':partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
'MMAlaya':partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
'MiniCPM-V':partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
'OmniLMM_12B':partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
'InternVL-Chat-V1-1':partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
}

api_models = {
@@ -65,22 +39,75 @@
'GeminiProVision': partial(GeminiProVision, temperature=0, retry=10),
'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
# Internal Only
'Step1V': partial(Step1V, temperature=0, retry=10),
# Internal Only
'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10),
'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10),
'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10),
}

xtuner_models = {
xtuner_series = {
'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
}

qwen_series = {
'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
'monkey':partial(Monkey, model_path='echo840/Monkey'),
'monkey-chat':partial(MonkeyChat, model_path='echo840/Monkey-Chat')
}

llava_series = {
'llava_v1.5_7b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-7b'),
'llava_v1.5_13b': partial(LLaVA, model_pth='liuhaotian/llava-v1.5-13b'),
'llava_v1_7b': partial(LLaVA, model_pth=LLAVA_V1_7B_MODEL_PTH),
'sharegpt4v_7b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-7B'),
'sharegpt4v_13b': partial(LLaVA, model_pth='Lin-Chen/ShareGPT4V-13B'),
}

internvl_series = {
'InternVL-Chat-V1-1':partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-1'),
'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2'),
'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-Chinese-V1-2-Plus'),
}

yivl_series = {
'Yi_VL_6B':partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
'Yi_VL_34B':partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}

xcomposer_series = {
'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
}

minigpt4_series = {
'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}

idefics_series = {
'idefics_9b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-9b-instruct'),
'idefics_80b_instruct': partial(IDEFICS, model_pth='HuggingFaceM4/idefics-80b-instruct'),
}

instructblip_series = {
'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}

supported_VLM = {}
for model_set in [models, api_models, xtuner_models]:
    supported_VLM.update(model_set)

model_groups = [
ungrouped, api_models,
xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
xcomposer_series, minigpt4_series, idefics_series, instructblip_series,
]

for grp in model_groups:
    supported_VLM.update(grp)
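
After this regrouping, callers still resolve models through the flat supported_VLM dict, which is rebuilt from the per-series groups above. A minimal usage sketch (the model key is taken from llava_series above; the image path and prompt are illustrative, not part of this diff):

from vlmeval.config import supported_VLM

# Each registry value is a functools.partial that captures the model's paths/options;
# calling it instantiates the corresponding wrapper class.
model = supported_VLM['llava_v1.5_7b']()
answer = model.generate('demo.jpg', 'Describe the image.')  # hypothetical inputs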
13 changes: 13 additions & 0 deletions vlmeval/smp/misc.py
@@ -168,3 +168,16 @@ def load_env():
            os.environ[k] = v
    print(f'API Keys successfully loaded from {pth}')
    return

def pip_install_robust(package):
    import sys
    import subprocess
    retry = 3
    while retry > 0:
        try:
            # Import by the bare module name, stripping any version specifier (e.g. 'pkg==1.0').
            package_base = package.split('=')[0]
            __import__(package_base)
            return True
        except ImportError:
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
            retry -= 1
    return False
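
A short illustrative call pattern for the new helper, assuming it is re-exported from vlmeval.smp (the DeepSeekVL wrapper added below relies on this):

from vlmeval.smp import pip_install_robust

if not pip_install_robust('deepseek_vl'):
    print('deepseek_vl could not be imported or auto-installed; install it from source.')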
1 change: 1 addition & 0 deletions vlmeval/vlm/__init__.py
@@ -24,3 +24,4 @@
from .xcomposer2 import XComposer2
from .yi_vl import Yi_VL
from .internvl_chat import InternVLChat
from .deepseek_vl import DeepSeekVL
70 changes: 70 additions & 0 deletions vlmeval/vlm/deepseek_vl.py
@@ -0,0 +1,70 @@
import sys
import torch
from transformers import AutoModelForCausalLM
import warnings
from vlmeval.smp import isimg, pip_install_robust


class DeepSeekVL:

    INSTALL_REQ = True

    def check_install(self):
        installed = pip_install_robust('deepseek_vl')
        if not installed:
            warnings.warn(
                'Please first install deepseek_vl from the source code at: https://github.com/deepseek-ai/DeepSeek-VL')
            sys.exit(-1)

    def __init__(self, model_path='deepseek-ai/deepseek-vl-7b-chat', **kwargs):
        self.check_install()
        assert model_path is not None
        self.model_path = model_path
        from deepseek_vl.models import VLChatProcessor

        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device='cpu')
        self.model = model.to(torch.bfloat16).cuda().eval()

        torch.cuda.empty_cache()
        default_kwargs = dict(max_new_tokens=512, do_sample=False, use_cache=True)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def prepare_inputs(self, msgs):
        # Interleave text and images: each image path becomes an <image_placeholder> token in the prompt.
        content, images = '', []
        for s in msgs:
            if isimg(s):
                images.append(s)
                content += '<image_placeholder>'
            else:
                content += s
        conversation = [
            dict(role='User', content=content, images=images),
            dict(role='Assistant', content='')
        ]
        return conversation

    def interleave_generate(self, ti_list, dataset=None):
        conversation = self.prepare_inputs(ti_list)
        from deepseek_vl.utils.io import load_pil_images
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True)
        prepare_inputs = prepare_inputs.to(self.model.device)
        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

        outputs = self.model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **self.kwargs)
        answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    def generate(self, image_path, prompt, dataset=None):
        return self.interleave_generate([image_path, prompt], dataset=dataset)
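
A minimal usage sketch of the new wrapper (the model path is the default from __init__; image paths and prompts are illustrative):

model = DeepSeekVL(max_new_tokens=256)  # extra kwargs override the default generation config
print(model.generate('cat.jpg', 'What is shown in this image?'))

# Interleaved text/image prompts can go through interleave_generate directly:
print(model.interleave_generate(['First image: ', 'a.jpg', ' Second image: ', 'b.jpg', ' What changed?']))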
