docs: add canonical tooling corpus (147 files) from Google/HF/frameworks

Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources. - google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report - huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py - inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets - gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma) - fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet. Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 12:24:48 -04:00
parent 5011059f5d
commit eecebe7ef5
149 changed files with 181297 additions and 0 deletions
@@ -0,0 +1,389 @@
+# Gemma4_(E2B)-Multimodal.ipynb — extracted cells
+# Source: https://github.com/huggingface/huggingface-gemma-recipes/blob/main/notebooks/Gemma4_(E2B)-Multimodal.ipynb
+
+# ===== CELL 0 (markdown) =====
+# This notebook has vibe-test examples that exercise the image, text, and audio capabilities of the Gemma-4 model. To get started, let's install the latest stable release of transformers.
+
+# ===== CELL 1 (code) =====
+!pip install -U transformers
+
+# ===== CELL 2 (markdown) =====
+# We can load the model into `AutoModelForMultimodalLM` to make use of all capabilities.
+
+# ===== CELL 3 (code) =====
+import torch
+from PIL import Image
+
+from transformers import AutoModelForMultimodalLM, AutoProcessor
+# Checkpoint ids listed by the notebook author; only `model_id` below is loaded.
+#model_list = ["google/gemma-4-26B-A4B-it", "google/gemma-4-E4B-it",
+#              "google/gemma-4-E2B-it", "google/gemma-4-31B-it"]
+model_id = "google/gemma-4-E2B-it"
+# `model` and `processor` are module-level globals reused by every later cell.
+model = AutoModelForMultimodalLM.from_pretrained(model_id, device_map="auto")
+# The processor is loaded from the same checkpoint id as the model.
+processor = AutoProcessor.from_pretrained(model_id)
+
+# ===== CELL 4 (markdown) =====
+# ## Code completion
+
+# ===== CELL 5 (markdown) =====
+# We give Gemma-4 a website screenshot to reproduce the code.
+
+# ===== CELL 6 (code) =====
+# Single user turn: one image (referenced by URL) followed by a text instruction.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "image",
+                "image": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/landing_page.png",
+            },
+            {"type": "text", "text": "Write HTML code for this page."},
+        ],
+    }
+]
+
+# Render the chat template straight to tensors and move them to the model's device.
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+    enable_thinking=True,
+).to(model.device)
+
+# Large token budget: the reply is expected to be a whole HTML page.
+output = model.generate(**inputs, max_new_tokens=4000)
+
+# ===== CELL 7 (code) =====
+# `output[0]` holds prompt + continuation; slice off the prompt tokens.
+input_len = inputs.input_ids.shape[-1]
+generated_text_ids = output[0][input_len:]
+# NOTE(review): special tokens are stripped before `parse_response`; confirm the
+# parser does not need them to separate thinking output from the final answer.
+generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)
+result = processor.parse_response(generated_text)
+
+# Only the "content" entry of the parsed response is shown.
+print(result["content"])
+
+# ===== CELL 8 (markdown) =====
+# ## Video Inference
+
+# ===== CELL 9 (markdown) =====
+# We test Gemma-4 on video understanding. If you want to run this example with larger models which don't take audio input, disable `load_audio_from_video`.
+
+# ===== CELL 10 (code) =====
+# Single user turn: one video (referenced by URL) followed by a question that
+# also asks about the song, i.e. about the audio track.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "video", "url": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/concert.mp4"},
+            {"type": "text", "text": "What is happening in the video? What is the song about?"},
+        ],
+    },
+]
+# `load_audio_from_video=True` is the flag the notebook text says to disable for
+# models that do not take audio input.
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+    load_audio_from_video=True,
+).to(model.device)
+output = model.generate(**inputs, max_new_tokens=200)
+# Keep only the tokens generated after the prompt, then decode and parse them.
+input_len = inputs.input_ids.shape[-1]
+generated_text_ids = output[0][input_len:]
+generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)
+result = processor.parse_response(generated_text)
+
+# ===== CELL 11 (code) =====
+print(result["content"])
+
+# ===== CELL 12 (markdown) =====
+# ## Multimodal Function Calling
+
+# ===== CELL 13 (code) =====
+import re
+
+# JSON-schema style description of the single tool offered to the model.
+WEATHER_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Gets the current weather for a specific location.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "city": {"type": "string", "description": "The city name"},
+            },
+            "required": ["city"],
+        },
+    },
+}
+tools = [WEATHER_TOOL]
+
+# The user turn pairs an image with a request that needs the tool: the model
+# has to name the city from the picture before it can ask for the weather.
+messages = [
+    {"role": "user", "content": [
+        {"type": "image", "image": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/thailand.jpg"},
+        {"type": "text", "text": "What is the city in this image? Check the weather there right now."},
+    ]},
+]
+
+# Pass the `tools` list defined above instead of rebuilding an identical
+# literal, so there is a single place to add or remove tools.
+inputs = processor.apply_chat_template(
+    messages,
+    tools=tools,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+    enable_thinking=True,
+).to(model.device)
+
+# ===== CELL 14 (code) =====
+# Generate the reply for the tool-enabled inputs built in the previous cell.
+output = model.generate(**inputs, max_new_tokens=1000)
+
+# ===== CELL 15 (code) =====
+# Keep only the tokens generated after the prompt, then decode and parse them.
+input_len = inputs.input_ids.shape[-1]
+generated_text_ids = output[0][input_len:]
+generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)
+result = processor.parse_response(generated_text)
+
+# ===== CELL 16 (code) =====
+# NOTE(review): only "content" is printed; if the model answered with a tool
+# call it may live under a different key of `result` — confirm.
+print(result["content"])
+
+# ===== CELL 17 (markdown) =====
+# ## Any-to-any inference
+
+# ===== CELL 18 (markdown) =====
+# We can also run the model with the `any-to-any` pipeline.
+
+# ===== CELL 19 (code) =====
+from transformers import pipeline
+
+# Reuse `model_id` from the loading cell so the pipeline resolves exactly the
+# same checkpoint id as `model`/`processor` instead of a re-typed, differently
+# cased copy of the string.
+pipe = pipeline("any-to-any", model=model_id)
+
+# A video item must carry its location under "url" (as the video-inference
+# cell earlier in this file does); an "image" key on a "video" item does not
+# match the item's type.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "url": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/rockets.mp4",
+            },
+            {"type": "text", "text": "What is happening in this video?"},
+        ],
+    }
+]
+
+# ===== CELL 20 (code) =====
+# Upstream left `load_audio_from_video=True` commented out on this call.
+pipe(messages)#, load_audio_from_video=True)
+
+# ===== CELL 21 (code) =====
+# Same request as the pipeline cell, run through the processor and model
+# directly. The video location is given under "url" (matching the
+# video-inference cell earlier in this file) rather than under "image", which
+# does not match a "video" item.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "video",
+                "url": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/rockets.mp4",
+            },
+            {"type": "text", "text": "What is happening in this video?"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    return_tensors="pt"
+)
+inputs = inputs.to(model.device)
+
+generated_ids = model.generate(**inputs, max_new_tokens=128)
+# For every sequence in the batch, drop the leading prompt tokens so only the
+# newly generated continuation is decoded.
+generated_ids_trimmed = [
+    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+]
+output_text = processor.batch_decode(
+    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print(output_text)
+
+# ===== CELL 22 (markdown) =====
+# ## Object detection and pointing
+
+# ===== CELL 23 (code) =====
+import re
+import torch
+from transformers.image_utils import load_image
+from PIL import Image
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import json
+
+# ===== CELL 24 (code) =====
+# Sample image for the detection demo.
+# NOTE(review): within the visible file the `image` loaded here is not passed
+# to the helper functions, which load from a URL themselves.
+image_url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bike.png"
+image = load_image(image_url)
+
+# ===== CELL 25 (code) =====
+def resize_to_48_multiple(image):
+    """Crop ``image`` from the top-left so both sides are multiples of 48.
+
+    Despite the name, nothing is rescaled: the right and bottom remainders
+    (fewer than 48 pixels each) are cut off. ``image`` must expose ``size`` as
+    ``(width, height)`` and a ``crop(box)`` method; the result of ``crop`` is
+    returned.
+    """
+    width, height = image.size
+    # Subtracting the remainder rounds each side down to a multiple of 48.
+    right = width - width % 48
+    bottom = height - height % 48
+    return image.crop((0, 0, right, bottom))
+
+# ===== CELL 26 (code) =====
+def inputs_for_object_detection(image, what_object):
+    """Build model-ready inputs asking for the bounding box of ``what_object``.
+
+    A single user turn (the image plus a fixed question naming ``what_object``)
+    is rendered through the module-level ``processor`` with thinking disabled,
+    and the resulting tensors are moved to ``model.device``.
+    """
+    question = f"What's the bounding box for the {what_object} in the image?"
+    user_turn = {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": question},
+        ],
+    }
+
+    encoded = processor.apply_chat_template(
+        [user_turn],
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        enable_thinking=False,
+    )
+    return encoded.to(model.device)
+
+# ===== CELL 27 (code) =====
+def extract_json(text: str):
+    """Return the first JSON object or array found in ``text``.
+
+    A leading and trailing Markdown code fence (``` or ```json) is stripped and
+    the remainder is parsed as JSON. If that fails, the text is scanned for
+    each ``{`` or ``[`` and the first position from which a complete JSON value
+    decodes is used, so prose or stray brackets before or after the JSON do not
+    break parsing.
+
+    Raises:
+        json.JSONDecodeError: an opening ``{``/``[`` was found but no JSON
+            value could be decoded from any of them.
+        ValueError: ``text`` contains no ``{`` or ``[`` at all.
+    """
+    text = text.strip()
+
+    # Remove a wrapping Markdown code fence, if any.
+    text = re.sub(r"^```(?:json)?\s*", "", text)
+    text = re.sub(r"\s*```$", "", text)
+
+    # Try direct parse first
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
+
+    # Fallback: decode from each opener in turn. `raw_decode` stops at the end
+    # of the first complete value, unlike a greedy "first opener to last
+    # closer" match, which swallows any bracketed text that follows the JSON.
+    decoder = json.JSONDecoder()
+    last_error = None
+    for opener in re.finditer(r"[{\[]", text):
+        try:
+            value, _ = decoder.raw_decode(text, opener.start())
+        except json.JSONDecodeError as err:
+            last_error = err
+            continue
+        return value
+
+    if last_error is not None:
+        # JSONDecodeError subclasses ValueError, so callers catching either
+        # exception type keep working.
+        raise last_error
+    raise ValueError("No valid JSON found")
+
+# ===== CELL 28 (code) =====
+def detect_object(image_url, what_object):
+    """Ask the model for the bounding box of ``what_object`` in an image.
+
+    The image is loaded from ``image_url``, cropped with
+    ``resize_to_48_multiple`` and sent through the module-level ``model`` with
+    greedy decoding. The generated text is parsed with ``extract_json``.
+
+    Returns:
+        The parsed detection: the JSON object itself when the model answers
+        with a single object, otherwise element ``0`` of the parsed array.
+
+    Raises:
+        ValueError: no JSON could be extracted from the reply.
+        IndexError: the reply was an empty JSON array.
+    """
+    image = load_image(image_url)
+    image = resize_to_48_multiple(image)
+    inputs = inputs_for_object_detection(image, what_object)
+    input_len = inputs["input_ids"].shape[-1]
+    generated_outputs = model.generate(**inputs, max_new_tokens=1000, do_sample=False)
+    # Decode only the continuation. Special tokens are not skipped here, so
+    # `extract_json` has to cope with any non-JSON text around the payload.
+    generated = processor.decode(generated_outputs[0, input_len:])
+    parsed = extract_json(generated)
+    # `extract_json` may return an object as well as an array; indexing a dict
+    # with [0] would raise KeyError, so hand a lone object back as-is.
+    if isinstance(parsed, dict):
+        return parsed
+    return parsed[0]
+
+# ===== CELL 29 (code) =====
+def draw_pascal_voc_boxes(i, image, box, label, resize_shape=(1000,1000)):
+    """Draw one labelled box on ``image``, save it as ``boxes_{i}.png`` and display it.
+
+    ``box`` is unpacked as ``(ymin, xmin, ymax, xmax)``. Each coordinate is
+    divided by the matching entry of ``resize_shape`` (unpacked as
+    ``(height, width)``) and multiplied by the image's own size; when
+    ``resize_shape`` is ``None`` the image size is used as the reference.
+    ``label`` is drawn just above the box unless it is ``None``.
+    """
+    img_w, img_h = image.size
+    dots_per_inch = 72
+    fig, ax = plt.subplots(
+        1,
+        figsize=[img_w/dots_per_inch, img_h/dots_per_inch],
+        tight_layout={'pad':0},
+    )
+    ax.imshow(image)
+
+    # Reference frame the incoming coordinates are expressed in.
+    if resize_shape is None:
+        ref_h, ref_w = img_h, img_w
+    else:
+        ref_h, ref_w = resize_shape
+
+    # Rescale from the reference frame to image pixels.
+    top, left, bottom, right = box
+    left = (left / ref_w) * img_w
+    top = (top / ref_h) * img_h
+    right = (right / ref_w) * img_w
+    bottom = (bottom / ref_h) * img_h
+
+    outline = patches.Rectangle(
+        (left, top),
+        right - left,
+        bottom - top,
+        linewidth=10,
+        edgecolor="green",
+        facecolor="none",
+    )
+    ax.add_patch(outline)
+
+    if label is not None:
+        ax.text(left, top-25, label, fontsize=24, bbox=dict(facecolor="yellow", alpha=0.5))
+
+    plt.axis("off")
+    plt.savefig(f"boxes_{i}.png")
+    plt.close(fig)
+    # `display` is not imported in this file; presumably the notebook builtin.
+    display(fig)
+
+# ===== CELL 30 (code) =====
+def display_detected_object(image_url, what_object):
+    """Detect ``what_object`` in the image at ``image_url`` and draw its box.
+
+    The image is loaded and cropped with ``resize_to_48_multiple`` for drawing,
+    while ``detect_object`` is given the URL and loads the image itself. The
+    detection's ``"box_2d"`` entry is required; ``"label"`` falls back to
+    ``what_object``. The figure is saved under the fixed index ``"1000"``.
+    """
+    canvas = resize_to_48_multiple(load_image(image_url))
+    found = detect_object(image_url, what_object)
+    draw_pascal_voc_boxes(
+        "1000",
+        canvas,
+        found["box_2d"],
+        found.get("label", f"{what_object}"),
+    )
+
+# ===== CELL 31 (code) =====
+# Run the full detect-and-draw demo on the sample bike image.
+display_detected_object("https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bike.png", "bike")
+
+# ===== CELL 32 (markdown) =====
+# ## Captioning
+
+# ===== CELL 33 (code) =====
+# Single user turn: one image (given under the "url" key) plus the captioning prompt.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bird.png"},
+            {"type": "text", "text": "Write single detailed caption for this image."},
+        ],
+    },
+]
+
+# No `enable_thinking` argument is passed for this example.
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+).to(model.device)
+
+output = model.generate(**inputs, max_new_tokens=512)
+# Keep only the tokens generated after the prompt, then decode and parse them.
+input_len = inputs.input_ids.shape[-1]
+generated_text_ids = output[0][input_len:]
+generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)
+result = processor.parse_response(generated_text)
+print(result["content"])
+
+# ===== CELL 34 (markdown) =====
+# ## Audio Understanding
+
+# ===== CELL 35 (code) =====
+# Single user turn: one audio clip (referenced by URL) plus a description request.
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "audio", "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama_first_45_secs.mp3"},
+            {"type": "text", "text": "Can you describe this audio in detail?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    messages,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+    add_generation_prompt=True,
+).to(model.device)
+
+# Greedy decoding keeps the description reproducible between runs.
+output = model.generate(
+    **inputs,
+    max_new_tokens=1000,
+    do_sample=False,
+)
+
+# `output[0]` holds prompt + continuation. Slice off the prompt, as the other
+# examples in this file do, so only the model's answer is printed instead of
+# the echoed request followed by the answer.
+input_len = inputs.input_ids.shape[-1]
+print(processor.decode(output[0][input_len:], skip_special_tokens=True))
+
@@ -0,0 +1,595 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "source": [
+    "This notebook has vibe-test examples that exercise the image, text, and audio capabilities of the Gemma-4 model. To get started, let's install the latest stable release of transformers."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "!pip install -U transformers"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We can load the model into `AutoModelForMultimodalLM` to make use of all capabilities."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import torch\n",
+    "from PIL import Image\n",
+    "\n",
+    "from transformers import AutoModelForMultimodalLM, AutoProcessor\n",
+    "#model_list = [\"google/gemma-4-26B-A4B-it\", \"google/gemma-4-E4B-it\",\n",
+    "#              \"google/gemma-4-E2B-it\", \"google/gemma-4-31B-it\"]\n",
+    "model_id = \"google/gemma-4-E2B-it\"\n",
+    "model = AutoModelForMultimodalLM.from_pretrained(model_id, device_map=\"auto\")\n",
+    "processor = AutoProcessor.from_pretrained(model_id)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Code completion"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We give Gemma-4 a website screenshot to reproduce the code."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\n",
+    "                \"type\": \"image\",\n",
+    "                \"image\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/landing_page.png\",\n",
+    "            },\n",
+    "            {\"type\": \"text\", \"text\": \"Write HTML code for this page.\"},\n",
+    "        ],\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    add_generation_prompt=True,\n",
+    "    enable_thinking=True,\n",
+    ").to(model.device)\n",
+    "\n",
+    "output = model.generate(**inputs, max_new_tokens=4000)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "input_len = inputs.input_ids.shape[-1]\n",
+    "generated_text_ids = output[0][input_len:]\n",
+    "generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)\n",
+    "result = processor.parse_response(generated_text)\n",
+    "\n",
+    "print(result[\"content\"])"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Video Inference"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We test Gemma-4 on video understanding. If you want to run this example with larger models which don't take audio input, disable `load_audio_from_video`."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"video\", \"url\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/concert.mp4\"},\n",
+    "            {\"type\": \"text\", \"text\": \"What is happening in the video? What is the song about?\"},\n",
+    "        ],\n",
+    "    },\n",
+    "]\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    add_generation_prompt=True,\n",
+    "    load_audio_from_video=True,\n",
+    ").to(model.device)\n",
+    "output = model.generate(**inputs, max_new_tokens=200)\n",
+    "input_len = inputs.input_ids.shape[-1]\n",
+    "generated_text_ids = output[0][input_len:]\n",
+    "generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)\n",
+    "result = processor.parse_response(generated_text)\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "print(result[\"content\"])"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Multimodal Function Calling"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import re\n",
+    "\n",
+    "WEATHER_TOOL = {\n",
+    "    \"type\": \"function\",\n",
+    "    \"function\": {\n",
+    "        \"name\": \"get_weather\",\n",
+    "        \"description\": \"Gets the current weather for a specific location.\",\n",
+    "        \"parameters\": {\n",
+    "            \"type\": \"object\",\n",
+    "            \"properties\": {\n",
+    "                \"city\": {\"type\": \"string\", \"description\": \"The city name\"},\n",
+    "            },\n",
+    "            \"required\": [\"city\"],\n",
+    "        },\n",
+    "    },\n",
+    "}\n",
+    "tools = [WEATHER_TOOL]\n",
+    "\n",
+    "messages = [\n",
+    "    {\"role\": \"user\", \"content\": [\n",
+    "        {\"type\": \"image\", \"image\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/thailand.jpg\"},\n",
+    "        {\"type\": \"text\", \"text\": \"What is the city in this image? Check the weather there right now.\"},\n",
+    "    ]},\n",
+    "]\n",
+    "\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tools=[WEATHER_TOOL],\n",
+    "    tokenize=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    add_generation_prompt=True,\n",
+    "    enable_thinking=True,\n",
+    ").to(model.device)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "output = model.generate(**inputs, max_new_tokens=1000)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "input_len = inputs.input_ids.shape[-1]\n",
+    "generated_text_ids = output[0][input_len:]\n",
+    "generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)\n",
+    "result = processor.parse_response(generated_text)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "print(result[\"content\"])"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Any-to-any inference"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "We can also run the model with the `any-to-any` pipeline."
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "from transformers import pipeline\n",
+    "\n",
+    "pipe = pipeline(\"any-to-any\", model=\"google/gemma-4-e2b-it\")\n",
+    "\n",
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\n",
+    "                \"type\": \"video\",\n",
+    "                \"image\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/rockets.mp4\",\n",
+    "            },\n",
+    "            {\"type\": \"text\", \"text\": \"What is happening in this video?\"},\n",
+    "        ],\n",
+    "    }\n",
+    "]\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "pipe(messages)#, load_audio_from_video=True)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\n",
+    "                \"type\": \"video\",\n",
+    "                \"image\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/rockets.mp4\",\n",
+    "            },\n",
+    "            {\"type\": \"text\", \"text\": \"What is happening in this video?\"},\n",
+    "        ],\n",
+    "    }\n",
+    "]\n",
+    "\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=True,\n",
+    "    add_generation_prompt=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\"\n",
+    ")\n",
+    "inputs = inputs.to(model.device)\n",
+    "\n",
+    "generated_ids = model.generate(**inputs, max_new_tokens=128)\n",
+    "generated_ids_trimmed = [\n",
+    "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+    "]\n",
+    "output_text = processor.batch_decode(\n",
+    "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+    ")\n",
+    "print(output_text)\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Object detection and pointing"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "import re\n",
+    "import torch\n",
+    "from transformers.image_utils import load_image\n",
+    "from PIL import Image\n",
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib.patches as patches\n",
+    "import json"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "image_url = \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bike.png\"\n",
+    "image = load_image(image_url)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def resize_to_48_multiple(image):\n",
+    "    w, h = image.size\n",
+    "    new_w = (w // 48) * 48\n",
+    "    new_h = (h // 48) * 48\n",
+    "    return image.crop((0, 0, new_w, new_h))"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def inputs_for_object_detection(image, what_object):\n",
+    "    messages = [\n",
+    "        {\n",
+    "            \"role\": \"user\", \"content\": [\n",
+    "                {\"type\": \"image\", \"image\": image},\n",
+    "                {\"type\": \"text\", \"text\": f\"What's the bounding box for the {what_object} in the image?\"}\n",
+    "            ]\n",
+    "        }\n",
+    "    ]\n",
+    "\n",
+    "    inputs = processor.apply_chat_template(\n",
+    "        messages,\n",
+    "        tokenize=True,\n",
+    "        add_generation_prompt=True,\n",
+    "        return_dict=True,\n",
+    "        return_tensors=\"pt\",\n",
+    "        enable_thinking=False,\n",
+    "    )\n",
+    "\n",
+    "    return inputs.to(model.device)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def extract_json(text: str):\n",
+    "    text = text.strip()\n",
+    "\n",
+    "    text = re.sub(r\"^```(?:json)?\\s*\", \"\", text)\n",
+    "    text = re.sub(r\"\\s*```$\", \"\", text)\n",
+    "\n",
+    "    # Try direct parse first\n",
+    "    try:\n",
+    "        return json.loads(text)\n",
+    "    except json.JSONDecodeError:\n",
+    "        pass\n",
+    "\n",
+    "    # Fallback: extract first JSON object or array\n",
+    "    match = re.search(r'(\\{.*\\}|\\[.*\\])', text, re.DOTALL)\n",
+    "    if match:\n",
+    "        candidate = match.group(1)\n",
+    "        return json.loads(candidate)\n",
+    "\n",
+    "    raise ValueError(\"No valid JSON found\")"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def detect_object(image_url, what_object):\n",
+    "    image = load_image(image_url)\n",
+    "    image = resize_to_48_multiple(image)\n",
+    "    inputs = inputs_for_object_detection(image, what_object)\n",
+    "    input_len = inputs[\"input_ids\"].shape[-1]\n",
+    "    generated_outputs = model.generate(**inputs, max_new_tokens=1000, do_sample=False)\n",
+    "    generated = processor.decode(generated_outputs[0, input_len:])\n",
+    "    parsed_json = extract_json(generated)[0]\n",
+    "    return parsed_json"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def draw_pascal_voc_boxes(i, image, box, label, resize_shape=(1000,1000)):\n",
+    "    dpi = 72\n",
+    "    width, height = image.size\n",
+    "    fig, ax = plt.subplots(1, figsize=[width/dpi, height/dpi], tight_layout={'pad':0})\n",
+    "\n",
+    "    ax.imshow(image)\n",
+    "\n",
+    "    ymin, xmin, ymax, xmax = box\n",
+    "    re_h, re_w = resize_shape if resize_shape is not None else (height, width)\n",
+    "    xmin = (xmin / re_w) * width\n",
+    "    ymin = (ymin/ re_h) * height\n",
+    "    xmax = (xmax / re_w) * width\n",
+    "    ymax = (ymax/ re_h) * height\n",
+    "\n",
+    "    w = xmax - xmin\n",
+    "    h = ymax - ymin\n",
+    "\n",
+    "    rect = patches.Rectangle(\n",
+    "        (xmin, ymin),\n",
+    "        w,\n",
+    "        h,\n",
+    "        linewidth=10,\n",
+    "        edgecolor=\"green\",\n",
+    "        facecolor=\"none\"\n",
+    "    )\n",
+    "    ax.add_patch(rect)\n",
+    "\n",
+    "    if label is not None:\n",
+    "        ax.text(xmin, ymin-25, label, fontsize=24, bbox=dict(facecolor=\"yellow\", alpha=0.5))\n",
+    "\n",
+    "    plt.axis(\"off\")\n",
+    "    plt.savefig(f\"boxes_{i}.png\")\n",
+    "    plt.close(fig)\n",
+    "    display(fig)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "def display_detected_object(image_url, what_object):\n",
+    "    image = load_image(image_url)\n",
+    "    image = resize_to_48_multiple(image)\n",
+    "    detection = detect_object(image_url, what_object)\n",
+    "    box = detection[\"box_2d\"]\n",
+    "    label = detection.get(\"label\", f\"{what_object}\")\n",
+    "    draw_pascal_voc_boxes(\"1000\", image, box, label)"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "display_detected_object(\"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bike.png\", \"bike\")"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Captioning"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"image\", \"url\": \"https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/bird.png\"},\n",
+    "            {\"type\": \"text\", \"text\": \"Write single detailed caption for this image.\"},\n",
+    "        ],\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    add_generation_prompt=True,\n",
+    ").to(model.device)\n",
+    "\n",
+    "output = model.generate(**inputs, max_new_tokens=512)\n",
+    "input_len = inputs.input_ids.shape[-1]\n",
+    "generated_text_ids = output[0][input_len:]\n",
+    "generated_text = processor.decode(generated_text_ids, skip_special_tokens=True)\n",
+    "result = processor.parse_response(generated_text)\n",
+    "print(result[\"content\"])"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Audio Understanding"
+   ],
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"audio\", \"url\": \"https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama_first_45_secs.mp3\"},\n",
+    "            {\"type\": \"text\", \"text\": \"Can you describe this audio in detail?\"},\n",
+    "        ],\n",
+    "    },\n",
+    "]\n",
+    "\n",
+    "inputs = processor.apply_chat_template(\n",
+    "    messages,\n",
+    "    tokenize=True,\n",
+    "    return_dict=True,\n",
+    "    return_tensors=\"pt\",\n",
+    "    add_generation_prompt=True,\n",
+    ").to(model.device)\n",
+    "\n",
+    "output = model.generate(\n",
+    "    **inputs,\n",
+    "    max_new_tokens=1000,\n",
+    "    do_sample=False,\n",
+    ")\n",
+    "\n",
+    "print(processor.decode(output[0], skip_special_tokens=True))\n"
+   ],
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  }
+ ]
+}