diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py
index 1c907f2135..961e2cb678 100644
--- a/optimum/intel/openvino/modeling.py
+++ b/optimum/intel/openvino/modeling.py
@@ -139,6 +139,7 @@ def to(self, device: str):
         if isinstance(device, str):
             self._device = device.upper()
             self.request = None
+            self.request_dict.clear()
         else:
             logger.debug(f"device must be of type {str} but got {type(device)} instead")
 
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index 7937deea52..0355a260da 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -88,7 +88,9 @@ def __init__(
         self.output_names = output_names
 
         self.model = model
+        self.compiled_model = None
         self.request = None
+        self.request_dict = {}
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
 
         self._openvino_config = None
@@ -457,11 +459,11 @@ def compile(self):
                 cache_dir = Path(self.model_save_dir).joinpath("model_cache")
                 ov_config["CACHE_DIR"] = str(cache_dir)
                 logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}")
-            self.request = core.compile_model(self.model, self._device, ov_config)
+            self.compiled_model = core.compile_model(self.model, self._device, ov_config)
             # OPENVINO_LOG_LEVEL can be found in https://docs.openvino.ai/2023.2/openvino_docs_OV_UG_supported_plugins_AUTO_debugging.html
             if "OPENVINO_LOG_LEVEL" in os.environ and int(os.environ["OPENVINO_LOG_LEVEL"]) > 2:
                 logger.info(f"{self._device} SUPPORTED_PROPERTIES:")
-                _print_compiled_model_properties(self.request)
+                _print_compiled_model_properties(self.compiled_model)
 
     def _reshape(
         self,
@@ -500,6 +502,7 @@ def reshape(self, batch_size: int, sequence_length: int, height: int = None, wid
         self.is_dynamic = True if batch_size == -1 and sequence_length == -1 else False
         self.model = self._reshape(self.model, batch_size, sequence_length, height, width)
         self.request = None
+        self.request_dict.clear()
         return self
 
     def half(self):
@@ -509,6 +512,7 @@ def half(self):
         apply_moc_transformations(self.model, cf=False)
         compress_model_transformation(self.model)
         self.request = None
+        self.request_dict.clear()
         return self
 
     def eval(self):
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index fe7cf14c17..bc2cd93c83 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -210,6 +210,7 @@ def update_pkv_precision(self, force_fp32=False):
             if self.is_dynamic:
                 self.model = self._reshape(self.model, -1, -1)
             self.request = None
+            self.request_dict.clear()
 
     def _save_pretrained(self, save_directory: Union[str, Path]):
         """
@@ -345,7 +346,7 @@ def normalized_config(self):
     def compile(self):
         if self.request is None:
             super().compile()
-            self.request = self.request.create_infer_request()
+            self.request = self.compiled_model.create_infer_request()
 
     def _make_stateful(self):
         patch_stateful(self.config, self.model)
@@ -424,9 +425,14 @@ def prepare_inputs(
         else:
             # past_key_values are not used explicitly, instead they are handled inside the model
             if past_key_values is None:
+                infer_req = self.request
+                if 'kwargs' in kwargs.keys():
+                    tid = kwargs['kwargs']['tid']
+                    if tid in self.request_dict:
+                        infer_req = self.request_dict[tid]
                 # This is the first iteration in a sequence, reset all states
-                if self.request is not None:
-                    self.request.reset_state()
+                if infer_req is not None:
+                    infer_req.reset_state()
                 # Set initial value for the next beam_idx input that will be used at the current iteration
                 # and will be optionally updated by _reorder_cache at the next iterations if beam_search is used
                 self.next_beam_idx = np.arange(batch_size, dtype=int)
@@ -473,6 +479,17 @@ def forward(
     ) -> CausalLMOutputWithPast:
         self.compile()
 
+        if 'kwargs' in kwargs.keys():
+            tid = kwargs['kwargs']['tid']
+            if tid in self.request_dict:
+                infer_req = self.request_dict[tid]
+            else:
+                infer_req = self.compiled_model.create_infer_request()
+                self.request_dict[tid] = infer_req
+        else:
+            tid = -1
+            infer_req = self.request
+
         inputs = self.prepare_inputs(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -484,9 +501,11 @@ def forward(
         if self._first_iter_beam_search:
             inputs, duplication_indices = self._deduplicate_inputs(inputs)
         # Run inference
-        self.request.start_async(inputs, share_inputs=True)
-        self.request.wait()
-        logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device)
+        print(f'.... {tid} infer start ....\n')
+        infer_req.start_async(inputs, share_inputs=True)
+        infer_req.wait()
+        print(f'..... {tid} infer end .....\n')
+        logits = torch.from_numpy(infer_req.get_tensor("logits").data).to(self.device)
         if self.stateful:
             # Need a marker to differentiate the first generate iteration from the others in
             # the first condition at the function beginning above.
@@ -497,7 +516,7 @@ def forward(
         if not self.stateful:
             if self.use_cache:
                 # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer)
-                past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names)
+                past_key_values = tuple(infer_req.get_tensor(key).data for key in self.key_value_output_names)
                 if self.config.model_type not in MULTI_QUERY_ATTN_MODELS or (
                     self.config.model_type == "falcon" and self.config.new_decoder_architecture
                 ):