docs: add canonical tooling corpus (147 files) from Google/HF/frameworks
Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources. - google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report - huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py - inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets - gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma) - fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet. Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,298 @@
|
||||
# Copyright 2026 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import warnings
|
||||
from collections.abc import Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ...audio_utils import mel_filter_bank, window_function
|
||||
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
|
||||
from ...feature_extraction_utils import BatchFeature
|
||||
from ...utils import PaddingStrategy, TensorType, logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
def _unfold(array: np.ndarray, dimension: int, size: int, step: int) -> np.ndarray:
|
||||
"""A basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim."""
|
||||
if array.ndim != 2:
|
||||
raise ValueError("This unfold implementation currently supports 2D arrays (batch, time).")
|
||||
if dimension != -1 and dimension != array.ndim - 1:
|
||||
raise ValueError("This unfold implementation only supports unfolding the last dimension.")
|
||||
|
||||
batch_size, original_length = array.shape
|
||||
num_frames = (original_length - size) // step + 1
|
||||
|
||||
if num_frames <= 0:
|
||||
return np.zeros((batch_size, 0, size), dtype=array.dtype)
|
||||
|
||||
output_shape = (batch_size, num_frames, size)
|
||||
output_strides = (array.strides[0], array.strides[1] * step, array.strides[1])
|
||||
|
||||
return np.lib.stride_tricks.as_strided(array, shape=output_shape, strides=output_strides)
|
||||
|
||||
|
||||
class Gemma4AudioFeatureExtractor(SequenceFeatureExtractor):
|
||||
"""An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037.
|
||||
|
||||
Args:
|
||||
feature_size (`int`, *optional*, defaults to 128):
|
||||
The feature dimension of the extracted features.
|
||||
sampling_rate (`int`, *optional*, defaults to 16000):
|
||||
The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
|
||||
padding_value (`float`, *optional*, defaults to 0.0):
|
||||
Padding value used to pad the audio. Should correspond to silences.
|
||||
return_attention_mask (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return the attention mask for the generated MEL spectrograms.
|
||||
frame_length_ms (`float`, *optional*, defaults to 20.0):
|
||||
The length of a frame in milliseconds.
|
||||
hop_length_ms (`float`, *optional*, defaults to 10.0):
|
||||
Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
|
||||
min_frequency (`float`, *optional*, defaults to 0.0):
|
||||
The minimum frequency (in Hz) for the Mel filterbank.
|
||||
max_frequency (`float`, *optional*, defaults to 8000.0):
|
||||
The maximum frequency (in Hz) for the Mel filterbank.
|
||||
preemphasis (`float`, *optional*, defaults to 0.0):
|
||||
The preemphasis coefficient.
|
||||
preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use HTK-style preemphasis.
|
||||
fft_overdrive (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use FFT overdrive.
|
||||
dither (`float`, *optional*, defaults to 0.0):
|
||||
Adds dithering. In other words, adds a small Gaussian noise to each frame.
|
||||
E.g. use 0.0001 to add dithering with a normal distribution centered
|
||||
around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
|
||||
The value 0.0 means no dithering.
|
||||
Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
|
||||
the high log_mel_fbank values for signals with hard-zero sections,
|
||||
when VAD cutoff is present in the signal.
|
||||
input_scale_factor (`float`, *optional*, defaults to 1.0):
|
||||
Scaling factor applied to the input waveform.
|
||||
mel_floor (`float`, *optional*, defaults to 0.001):
|
||||
Minimum value for Mel spectrograms to avoid log(0).
|
||||
per_bin_mean (`Optional[Sequence[float]]`, *optional*):
|
||||
Mean values for per-bin normalization.
|
||||
per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
|
||||
Standard deviation values for per-bin normalization.
|
||||
"""
|
||||
|
||||
model_input_names = ["input_features", "input_features_mask"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
feature_size: int = 128,
|
||||
sampling_rate: int = 16_000,
|
||||
padding_value: float = 0.0,
|
||||
return_attention_mask: bool = True,
|
||||
frame_length_ms: float = 20.0,
|
||||
hop_length_ms: float = 10.0,
|
||||
min_frequency: float = 0.0,
|
||||
max_frequency: float = 8000.0,
|
||||
preemphasis: float = 0.0,
|
||||
preemphasis_htk_flavor: bool = True,
|
||||
fft_overdrive: bool = False,
|
||||
dither: float = 0.0,
|
||||
input_scale_factor: float = 1.0,
|
||||
mel_floor: float = 1e-3,
|
||||
per_bin_mean: Sequence[float] | None = None,
|
||||
per_bin_stddev: Sequence[float] | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(
|
||||
feature_size=feature_size,
|
||||
sampling_rate=sampling_rate,
|
||||
padding_value=padding_value,
|
||||
return_attention_mask=return_attention_mask,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.min_frequency = min_frequency
|
||||
self.max_frequency = max_frequency
|
||||
self.preemphasis = preemphasis
|
||||
self.preemphasis_htk_flavor = preemphasis_htk_flavor
|
||||
self.fft_overdrive = fft_overdrive
|
||||
self.dither = dither
|
||||
self.input_scale_factor = input_scale_factor
|
||||
self.frame_length = int(round(sampling_rate * frame_length_ms / 1000.0))
|
||||
self.hop_length = int(round(sampling_rate * hop_length_ms / 1000.0))
|
||||
self.mel_floor = np.array(mel_floor, dtype=np.float64)
|
||||
|
||||
fft_length = 2 ** math.ceil(math.log2(self.frame_length))
|
||||
if self.fft_overdrive:
|
||||
fft_length *= 2
|
||||
self.fft_length = fft_length
|
||||
|
||||
# Use periodic Hann window, matching sl.STFT default (signal.hann_window)
|
||||
# For even frame_length: window[n] = 0.5 - 0.5 * cos(2*pi*n / frame_length)
|
||||
self.window = window_function(self.frame_length).astype(np.float32)
|
||||
|
||||
# Use HuggingFace's mel_filter_bank for compatibility.
|
||||
# Suppress the expected warning about all-zero upper mel filters;
|
||||
# with fft_length=512 (257 bins) and 128 mel filters the uppermost
|
||||
# triangular filter falls between frequency bins, which is harmless.
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
self.mel_filters = mel_filter_bank(
|
||||
num_frequency_bins=self.fft_length // 2 + 1,
|
||||
num_mel_filters=feature_size,
|
||||
min_frequency=min_frequency,
|
||||
max_frequency=max_frequency,
|
||||
sampling_rate=self.sampling_rate,
|
||||
norm=None,
|
||||
mel_scale="htk",
|
||||
)
|
||||
|
||||
if per_bin_mean is not None:
|
||||
self.per_bin_mean = np.array(per_bin_mean).reshape(1, 1, feature_size)
|
||||
else:
|
||||
self.per_bin_mean = None
|
||||
|
||||
if per_bin_stddev is not None:
|
||||
self.per_bin_stddev = np.array(per_bin_stddev).reshape(1, 1, feature_size)
|
||||
else:
|
||||
self.per_bin_stddev = None
|
||||
|
||||
def _extract_spectrogram(self, waveform: np.ndarray, attention_mask: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
|
||||
""""""
|
||||
if waveform.ndim == 1: # If single waveform, add batch dimension
|
||||
waveform = np.expand_dims(waveform, axis=0)
|
||||
|
||||
if self.dither > 0.0:
|
||||
waveform = waveform + self.dither * np.random.randn(*waveform.shape).astype(waveform.dtype)
|
||||
|
||||
if self.input_scale_factor != 1.0:
|
||||
waveform = waveform * self.input_scale_factor
|
||||
|
||||
# Semicausal time padding: prepend frame_length // 2 zeros so that the
|
||||
# first STFT frame is centered at t=0, matching sl.STFT(time_padding='semicausal').
|
||||
pad_left = self.frame_length // 2
|
||||
waveform = np.pad(waveform, ((0, 0), (pad_left, 0)), mode="constant")
|
||||
attention_mask = np.pad(attention_mask, (pad_left, 0), mode="constant", constant_values=0)
|
||||
|
||||
frame_size_for_unfold = self.frame_length + 1
|
||||
|
||||
# NumPy equivalent of unfold for [B, NumFrames, frame_size_for_unfold]
|
||||
frames_to_process = _unfold(waveform, dimension=-1, size=frame_size_for_unfold, step=self.hop_length)
|
||||
|
||||
if self.preemphasis > 0.0:
|
||||
if self.preemphasis_htk_flavor:
|
||||
first_in_frame = frames_to_process[..., :1] * (1.0 - self.preemphasis)
|
||||
rest_in_frame = frames_to_process[..., 1:-1] - self.preemphasis * frames_to_process[..., :-2]
|
||||
frames = np.concatenate([first_in_frame, rest_in_frame], axis=-1)
|
||||
else:
|
||||
frames = frames_to_process[..., 1:] - self.preemphasis * frames_to_process[..., :-1]
|
||||
else:
|
||||
frames = frames_to_process[..., :-1]
|
||||
|
||||
# Apply window, then RFFT. np.fft.rfft with n=fft_length implicitly
|
||||
# right-pads frames to fft_length.
|
||||
frames = frames * self.window # Broadcasting window
|
||||
stft = np.fft.rfft(frames, n=self.fft_length, axis=-1)
|
||||
|
||||
magnitude_spec = np.abs(stft)
|
||||
|
||||
mel_spec = np.matmul(magnitude_spec, self.mel_filters)
|
||||
log_mel_spec = np.log(mel_spec + self.mel_floor)
|
||||
|
||||
if self.per_bin_mean is not None:
|
||||
log_mel_spec = log_mel_spec - self.per_bin_mean # Broadcasting
|
||||
|
||||
if self.per_bin_stddev is not None:
|
||||
log_mel_spec = log_mel_spec / self.per_bin_stddev # Broadcasting
|
||||
|
||||
mel_spectrogram = log_mel_spec.squeeze(0)
|
||||
num_mel_frames = mel_spectrogram.shape[0]
|
||||
|
||||
# Build a frame-aware mask: a mel frame is valid only when every sample
|
||||
# in its analysis window [i*hop, i*hop + frame_size - 1] is real audio.
|
||||
# We check this by looking at the last sample of each frame's window.
|
||||
frame_end_indices = np.arange(num_mel_frames) * self.hop_length + frame_size_for_unfold - 1
|
||||
mask = attention_mask[frame_end_indices].astype(bool)
|
||||
return mel_spectrogram, mask
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
raw_speech: np.ndarray | list[float] | list[np.ndarray] | list[list[float]],
|
||||
padding: bool | str | PaddingStrategy = "longest",
|
||||
max_length: int | None = 480_000,
|
||||
truncation: bool = True,
|
||||
pad_to_multiple_of: int | None = 128,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
return_attention_mask: bool | None = True,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
"""Creates a batch of MEL spectrograms from the provided raw speech.
|
||||
|
||||
This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
|
||||
`transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
|
||||
carefully when selecting an audio feature extractor, especially with pre-trained models.
|
||||
|
||||
Args:
|
||||
raw_speech:
|
||||
The audio for which MEL spectrograms are created.
|
||||
padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
|
||||
The padding strategy to use for batches of audio with different lengths.
|
||||
max_length (`int`, *optional*, defaults to 480000):
|
||||
If provided, defines the maximum length of the audio to allow. Audio longer than this will be
|
||||
truncated if `truncation=True`.
|
||||
truncation (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to truncate audio above `max_length`.
|
||||
pad_to_multiple_of (`int`, *optional*, defaults to 128):
|
||||
When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
|
||||
return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
|
||||
The type of tensors to return (e.g., NumPy, or Torch).
|
||||
return_attention_mask (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return the attention mask for the generated MEL spectrograms.
|
||||
"""
|
||||
|
||||
is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
|
||||
is_batched_sequence = isinstance(raw_speech, Sequence) and isinstance(raw_speech[0], (np.ndarray, Sequence))
|
||||
is_batched = is_batched_numpy or is_batched_sequence
|
||||
|
||||
if is_batched:
|
||||
raw_speech = [np.asarray([rs]).T for rs in raw_speech]
|
||||
elif not is_batched and not isinstance(raw_speech, np.ndarray):
|
||||
raw_speech = np.asarray(raw_speech)
|
||||
|
||||
if not is_batched: # always return a batch
|
||||
raw_speech = [np.asarray([raw_speech])]
|
||||
|
||||
batched_speech = self.pad(
|
||||
BatchFeature({"input_features": raw_speech}),
|
||||
padding=padding,
|
||||
max_length=max_length,
|
||||
truncation=truncation,
|
||||
pad_to_multiple_of=pad_to_multiple_of,
|
||||
return_attention_mask=return_attention_mask,
|
||||
)
|
||||
|
||||
prepared_speech = []
|
||||
prepared_speech_mask = []
|
||||
for speech, mask in zip(batched_speech.input_features, batched_speech.attention_mask):
|
||||
speech, mask = self._extract_spectrogram(speech.T, mask)
|
||||
prepared_speech.append(speech.astype(np.float32))
|
||||
prepared_speech_mask.append(mask)
|
||||
|
||||
prepared_speech = [speech * mask[..., None] for speech, mask in zip(prepared_speech, prepared_speech_mask)]
|
||||
|
||||
return BatchFeature(
|
||||
{"input_features": prepared_speech, "input_features_mask": prepared_speech_mask},
|
||||
tensor_type=return_tensors,
|
||||
)
|
||||
|
||||
|
||||
__all__ = ["Gemma4AudioFeatureExtractor"]
|
||||
Reference in New Issue
Block a user