# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Gemma 4 multimodal model (image + audio + video support). Adds vision tower, audio tower, and multimodal embedders on top of the text-only Gemma4ForCausalLM. The vision/audio encoders are loaded via AutoModel.from_config and run in eager mode while the language model uses the vLLM-optimized path. Video support: Gemma4 does **not** have a native video tower. Videos are decomposed into timestamped image frames (up to 32 frames at 70 soft tokens each) and fed through the same vision tower as regular images. The processor inserts ``mm:ss`` timestamps between frames so the model can reason about temporal order. """ import math from collections.abc import Iterable, Mapping, Sequence from typing import Annotated, Any, Literal import numpy as np import torch from PIL import Image as PILImage from torch import nn from transformers import AutoModel, BatchFeature from transformers.models.gemma4 import ( Gemma4Config, Gemma4Processor, Gemma4VisionConfig, ) from transformers.models.gemma4.configuration_gemma4 import ( Gemma4AudioConfig, Gemma4TextConfig, ) from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions from vllm.inputs import MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalFieldConfig, MultiModalKwargsItems, VideoItem, ) from vllm.multimodal.parse import ( AudioProcessorItems, ImageProcessorItems, MultiModalDataItems, MultiModalDataParser, ) from vllm.multimodal.processing import BaseDummyInputsBuilder from vllm.multimodal.processing.processor import ( BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, PromptUpdateDetails, ) from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( MultiModalEmbeddings, SupportsEagle3, SupportsLoRA, SupportsMultiModal, SupportsPP, ) from .utils import ( AutoWeightsLoader, WeightsMapper, init_vllm_registered_model, maybe_prefix, )