zhoukz committed on Sep 1

Commit

2f0baa1

verified ·

1 Parent(s): cc937c5

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

.gitattributes +1 -0
added_tokens.json +41 -0
chat_template.jinja +7 -0
config.json +118 -0
configuration_midashenglm.py +80 -0
generation_config.json +9 -0
merges.txt +0 -0
model-00001-of-00002.safetensors +3 -0
model-00002-of-00002.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_midashenglm.py +660 -0
preprocessor_config.json +13 -0
processing_midashenglm.py +277 -0
processor_config.json +10 -0
special_tokens_map.json +42 -0
tokenizer.json +3 -0
tokenizer_config.json +365 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

added_tokens.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|AUDIO|>": 151646,
+  "<|IMAGE|>": 151655,
+  "<|VIDEO|>": 151656,
+  "<|ar|>": 151679,
+  "<|audio_bos|>": 151647,
+  "<|audio_eos|>": 151648,
+  "<|box_end|>": 151649,
+  "<|de|>": 151667,
+  "<|endoftext|>": 151643,
+  "<|en|>": 151665,
+  "<|es|>": 151668,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|fr|>": 151669,
+  "<|hi|>": 151670,
+  "<|id|>": 151676,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|it|>": 151678,
+  "<|jp|>": 151680,
+  "<|kr|>": 151666,
+  "<|nl|>": 151674,
+  "<|pt|>": 151675,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|ru|>": 151677,
+  "<|th|>": 151672,
+  "<|uk|>": 151671,
+  "<|unknown|>": 151681,
+  "<|vision_bos|>": 151652,
+  "<|vision_eos|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vi|>": 151673
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,7 @@

+{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}

config.json ADDED Viewed

	@@ -0,0 +1,118 @@

+{
+  "architectures": [
+    "MiDashengLMModel"
+  ],
+  "audio_encoder_config": {
+    "attn_drop_rate": 0.0,
+    "center": true,
+    "depth": 32,
+    "drop_rate": 0.0,
+    "embed_dim": 1280,
+    "f_max": 8000.0,
+    "f_min": 0.0,
+    "hop_length": 160,
+    "init_values": null,
+    "input_channels": 1,
+    "mlp_ratio": 4.0,
+    "model_type": "midashenglm_dasheng_encoder",
+    "n_fft": 512,
+    "n_mels": 64,
+    "num_heads": 16,
+    "outputdim": 527,
+    "patch_size": [
+      64,
+      4
+    ],
+    "patch_stride": [
+      64,
+      4
+    ],
+    "qkv_bias": true,
+    "sample_rate": 16000,
+    "target_length": 1008,
+    "win_length": 512
+  },
+  "audio_token_id": 151646,
+  "auto_map": {
+    "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
+    "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
+  },
+  "model_type": "midashenglm",
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
+  "subsample_factor": 5,
+  "text_config": {
+    "attention_dropout": 0.0,
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "init_std": 0.02,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "max_window_layers": 28,
+    "model_type": "qwen2_5_omni_text",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_section": [
+        16,
+        24,
+        24
+      ],
+      "rope_type": "default",
+      "type": "default"
+    },
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.54.0"
+}

configuration_midashenglm.py ADDED Viewed

	@@ -0,0 +1,80 @@

+from typing import Dict, Optional, Tuple, Union
+from transformers import PretrainedConfig
+from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
+    Qwen2_5OmniTextConfig,
+)
+class DashengConfig(PretrainedConfig):
+    model_type = "midashenglm_dasheng_encoder"
+    def __init__(
+        self,
+        embed_dim: int = 768,
+        outputdim: int = 527,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        patch_stride: Union[int, Tuple[int, int]] = 16,
+        input_channels: int = 1,
+        target_length: int = 1012,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        init_values: Optional[float] = None,
+        drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        f_min: float = 0.0,
+        f_max: float = 8000.0,
+        center: bool = True,
+        win_length: int = 512,
+        hop_length: int = 160,
+        sample_rate: int = 16000,
+        n_fft: int = 512,
+        n_mels: int = 64,
+        **kwargs,
+    ):
+        self.embed_dim = embed_dim
+        self.outputdim = outputdim
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.input_channels = input_channels
+        self.target_length = target_length
+        self.depth = depth
+        self.num_heads = num_heads
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.init_values = init_values
+        self.drop_rate = drop_rate
+        self.attn_drop_rate = attn_drop_rate
+        self.f_min = f_min
+        self.f_max = f_max
+        self.center = center
+        self.win_length = win_length
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+        self.n_fft = n_fft
+        self.n_mels = n_mels
+        super().__init__(**kwargs)
+class MiDashengLMConfig(PretrainedConfig):
+    model_type = "midashenglm"
+    def __init__(
+        self,
+        audio_encoder_config: Dict = {},
+        subsample_factor: int = 5,
+        text_config: Dict = {},
+        audio_token_id: Optional[int] = None,
+        **kwargs,
+    ):
+        self.audio_encoder_config = DashengConfig(**audio_encoder_config)
+        self.subsample_factor = subsample_factor
+        self.text_config = (
+            Qwen2_5OmniTextConfig(**text_config)
+            if text_config
+            else Qwen2_5OmniTextConfig()
+        )
+        self.audio_token_id = audio_token_id
+        super().__init__(**kwargs)

generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "eos_token_id": [
+    151643,
+    151645
+  ],
+  "max_length": 32768,
+  "pad_token_id": 151643,
+  "transformers_version": "4.54.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9f59b51dba695e593883438352090cfccc61ca527595511215dca21b22ab6b7
+size 4968475550

model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45f8c2d13ba443773868202a754c4e37d1a519aa4fa556875d7fc9d9cbd705ae
+size 1259325658

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_midashenglm.py ADDED Viewed

	@@ -0,0 +1,660 @@

+import collections
+import collections.abc
+from dataclasses import dataclass
+from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Union, cast
+import torch
+import torch.nn as nn
+import torchaudio.functional as F
+from torch import Tensor
+from transformers import GenerationMixin, PreTrainedModel
+from transformers.cache_utils import Cache
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import (
+    Qwen2_5OmniTextConfig,
+)
+from transformers.models.qwen2_5_omni.modeling_qwen2_5_omni import (
+    Qwen2_5OmniThinkerTextModel,
+)
+from transformers.utils import can_return_tuple
+from .configuration_midashenglm import DashengConfig, MiDashengLMConfig
+_Tuple2 = Union[int, Tuple[int, int], Sequence[int]]
+def _resolve_tuple2(x: _Tuple2) -> Tuple[int, int]:
+    if isinstance(x, collections.abc.Sequence):
+        assert len(x) == 2, (
+            f"Expected a sequence of length 2, got {x} with length {len(x)}"
+        )
+        return cast(Tuple[int, int], tuple(x))
+    return (x, x)
+class AudioPatchEmbed(nn.Module):
+    def __init__(
+        self,
+        input_size: _Tuple2 = 64,
+        patch_size: _Tuple2 = 16,
+        patch_stride: _Tuple2 = 16,
+        in_chans: int = 1,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten: bool = False,
+    ):
+        super().__init__()
+        self.input_size = _resolve_tuple2(input_size)
+        self.patch_size = _resolve_tuple2(patch_size)
+        self.patch_stride = _resolve_tuple2(patch_stride)
+        self.grid_size = (
+            self.input_size[0] // self.patch_stride[0],
+            self.input_size[1] // self.patch_stride[1],
+        )
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.flatten = flatten
+        self.proj = nn.Conv2d(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_stride,
+        )
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        if self.flatten:
+            x = torch.permute(
+                torch.flatten(x, 2, 3), (0, 2, 1)
+            )  # rearrange(x, "b c f t -> b (f t) c")
+        x = self.norm(x)
+        return x
+class LayerScale(nn.Module):
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+class DashengMlp(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        drop: float = 0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.drop(x)
+        return x
+class DashengAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        causal: bool = False,
+    ):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.causal = causal
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
+        B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        # if mask is not None:
+        # # Mask is a tensor of shape [B, T, T]
+        # # Different from self.causal == True, the mask might be something like:
+        # # [False, False, True]
+        # # [False, False, True]
+        # # [True, True, True]
+        # # We use -inf to pad here, since if we would pad by any number, the entries at rows only containing
+        # # [True, True, True] would lead to weights such as: [0.33,0.33,0.33], which is not correct
+        if self.causal:
+            mask_value = -torch.finfo(attn.dtype).max
+            i, j = attn.shape[-2:]
+            mask = torch.ones(i, j, device=q.device, dtype=torch.bool).triu(j - i + 1)
+            attn = attn.masked_fill(mask, mask_value)
+        if mask is not None:
+            # mask value as the lowest possible value in fp32
+            mask_value = torch.finfo(attn.dtype).min
+            # Mask is of shape [1, SRC_LEN]
+            attn_mask = mask[:, None, None, :].expand(B, 1, N, N)
+            # Mask should be of shape
+            # [B,1,Target_len, Source_len]
+            attn = attn.masked_fill(attn_mask, mask_value)
+        attn = attn.softmax(dim=-1)
+        attn = torch.nan_to_num(attn)
+        # Only for the case that a mask with all True entries on a row is passed.
+        # attn = torch.nan_to_num(attn)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class DashengBlock(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values: Optional[float] = None,
+    ):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
+        self.attn = DashengAttention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
+        self.mlp = DashengMlp(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            drop=drop,
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+    # Kwargs usually has a mask parameter that is passed to Attention
+    def forward(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        x = x + self.ls1(self.attn(self.norm1(x), mask))
+        x = x + self.ls2(self.mlp(self.norm2(x)))
+        return x
+class DashengFrontend(nn.Module):
+    def __init__(self, config: DashengConfig):
+        super().__init__()
+        self.config = config
+        spectrogram_window = torch.hann_window(self.config.win_length)
+        self.register_buffer(
+            "spectrogram_window",
+            spectrogram_window,
+            persistent=False,
+        )
+        self.spectrogram_window: torch.Tensor
+        melscale_fbanks = F.melscale_fbanks(
+            n_freqs=self.config.n_fft // 2 + 1,
+            f_min=self.config.f_min,
+            f_max=self.config.f_max,
+            n_mels=self.config.n_mels,
+            sample_rate=self.config.sample_rate,
+        )
+        self.register_buffer("melscale_fbanks", melscale_fbanks, persistent=False)
+        self.melscale_fbanks: torch.Tensor
+    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
+        spectrogram = F.spectrogram(
+            waveform=waveform.to(torch.float32),
+            pad=0,
+            window=self.spectrogram_window,
+            n_fft=self.config.n_fft,
+            hop_length=self.config.hop_length,
+            win_length=self.config.win_length,
+            power=2,
+            normalized=False,
+            center=self.config.center,
+        )
+        mel_spectrogram = (spectrogram.mT @ self.melscale_fbanks.to(torch.float32)).mT
+        # x has shape [batch, freq, time].
+        # F.amplitude_to_DB accepts inputs shaped as:
+        #   - [freq, time]
+        #   - [channel, freq, time]
+        #   - [..., channel, freq, time]
+        # Here we insert a channel dimension of size 1 before calling it,
+        # then remove that extra dimension afterward.
+        log_mel_spectrogram = F.amplitude_to_DB(
+            mel_spectrogram.unsqueeze(1),
+            multiplier=10,
+            amin=1e-10,
+            db_multiplier=0,
+            top_db=120,
+        ).squeeze(1)
+        return log_mel_spectrogram.to(waveform.dtype)
+class DashengAudioTransformer(PreTrainedModel):
+    config_class = DashengConfig
+    supports_gradient_checkpointing = True
+    def __init__(self, config: DashengConfig):
+        super().__init__(config)
+        self.target_length = config.target_length
+        self.embed_dim = config.embed_dim
+        self.hop_length = config.hop_length
+        self.gradient_checkpointing = False
+        self.front_end = DashengFrontend(config)
+        self.init_bn = nn.BatchNorm2d(config.n_mels, momentum=0.01)
+        self.patch_embed = AudioPatchEmbed(
+            input_size=(config.n_mels, config.target_length),
+            embed_dim=config.embed_dim,
+            in_chans=config.input_channels,
+            patch_size=config.patch_size,
+            flatten=False,
+            patch_stride=config.patch_stride,
+        )
+        self.time_pos_embed = nn.Parameter(
+            torch.randn(1, config.embed_dim, 1, self.patch_embed.grid_size[1]) * 0.02
+        )
+        self.freq_pos_embed = nn.Parameter(
+            torch.randn(1, config.embed_dim, self.patch_embed.grid_size[0], 1) * 0.02
+        )
+        self.pos_drop = nn.Dropout(p=config.drop_rate)
+        self.blocks = nn.ModuleList(
+            DashengBlock(
+                dim=config.embed_dim,
+                num_heads=config.num_heads,
+                mlp_ratio=config.mlp_ratio,
+                qkv_bias=config.qkv_bias,
+                init_values=config.init_values,
+                drop=config.drop_rate,
+                attn_drop=config.attn_drop_rate,
+            )
+            for _ in range(config.depth)
+        )
+        self.norm = nn.LayerNorm(config.embed_dim, eps=1e-6)
+        self.post_init()
+    def forward_features(
+        self,
+        x: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        t = x.shape[-1]
+        x = x + self.time_pos_embed[:, :, :, :t]
+        x = (
+            x + self.freq_pos_embed[:, :, :, :]
+        )  # Just to support __getitem__ in posembed
+        x = torch.permute(
+            torch.flatten(x, 2, 3), (0, 2, 1)
+        )  # rearrange(x, "b c f t -> b (f t) c")
+        x = self.pos_drop(x)
+        for block in self.blocks:
+            if self.gradient_checkpointing and self.training:
+                x = self._gradient_checkpointing_func(block, x, mask)
+            else:
+                x = block(x, mask)
+        x = self.norm(x)
+        return x
+    def _to_mask(self, lengths: torch.Tensor, max_length: int) -> torch.Tensor:
+        batch_size = len(lengths)
+        idx = torch.arange(max_length, device=lengths.device)
+        idx = idx.repeat(batch_size).view(batch_size, max_length)
+        mask = (idx < lengths.unsqueeze(-1)).bool()
+        return mask
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_length: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+        x = self.front_end(x)
+        target_length_in_patches = self.target_length // 4
+        x = x.unsqueeze(1)
+        x = torch.permute(x, (0, 2, 1, 3))
+        x = self.init_bn(x)
+        x = torch.permute(x, (0, 2, 1, 3))
+        x = self.patch_embed(x)
+        t = x.shape[-1]
+        input_splits = x.split(target_length_in_patches, dim=-1)
+        if x_length is not None:
+            assert len(x_length) == len(x), (
+                "batchsizes of input x and x_length need to be same"
+            )
+            assert x_length.ndim == 1, "Lengths are of size (B,)"
+            scaled_lengths = (x_length / (self.hop_length * 4)).long()
+            mask = self._to_mask(max_length=t, lengths=scaled_lengths)
+            split_masks = mask.logical_not().split(target_length_in_patches, dim=-1)
+        else:
+            mask = None
+            split_masks = [None] * len(input_splits)
+        outputs = []
+        for split_x, split_mask in zip(input_splits, split_masks):
+            forward_kwargs = {}
+            forward_kwargs["mask"] = split_mask
+            split_x = self.forward_features(split_x, **forward_kwargs)
+            outputs.append(split_x)
+        x = torch.cat(outputs, dim=1)
+        return x, mask
+class AudioProjectorSubsample(nn.Module):
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        downsample_rate=5,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.k = downsample_rate
+        self.net = nn.Sequential(
+            nn.Linear(in_dim * self.k, out_dim, dtype=dtype),
+            nn.GELU(),
+            nn.Linear(out_dim, out_dim, dtype=dtype),
+        )
+    def forward(self, x, mask=None):
+        batch_size, seq_len, dim = x.shape
+        num_frames_to_discard = seq_len % self.k
+        if num_frames_to_discard > 0:
+            x = x[:, :-num_frames_to_discard, :]
+            if mask is not None:
+                mask = mask[:, :-num_frames_to_discard]
+        if mask is None:
+            mask = torch.ones(x.shape[:-1], dtype=torch.long, device=x.device)
+        x = x.reshape(
+            batch_size, -1, self.k * dim
+        )  # rearrange(x, "b (s k) d -> b s (k d)", k=self.k)
+        x = self.net(x)
+        mask = mask.reshape(
+            batch_size, -1, self.k
+        )  # rearrange(mask, "b (s k) -> b s k", k=self.k)
+        mask = mask.any(dim=-1).long()
+        return x, mask
+@dataclass
+class Qwen25OmniTextModelOutput(ModelOutput):
+    """Output container returned by ``Qwen25OmniThinkerTextOnlyDecoder.forward``."""
+    # Result of the model's ``loss_function``; ``None`` when no labels were given.
+    loss: Optional[torch.FloatTensor] = None
+    # Output of the decoder's ``lm_head`` applied to the last hidden state.
+    logits: Optional[torch.FloatTensor] = None
+    # The three fields below are copied from the wrapped text model's output.
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+class Qwen25OmniThinkerTextOnlyDecoder(PreTrainedModel, GenerationMixin):
+    config_class = Qwen2_5OmniTextConfig
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_static_cache = True
+    def __init__(self, config: Qwen2_5OmniTextConfig):
+        super().__init__(config)
+        self.model = Qwen2_5OmniThinkerTextModel._from_config(config)
+        self.lm_head = nn.Linear(
+            config.hidden_size,
+            config.vocab_size,
+            bias=False,
+        )
+        self.post_init()
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[Tuple, Qwen25OmniTextModelOutput]:
+        if attention_mask is not None and position_ids is None:
+            position_ids = (
+                attention_mask.long()
+                .cumsum(dim=-1)
+                .masked_fill_(attention_mask == 0, 1)
+                - 1
+            )
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=True,
+        )
+        hidden_states = outputs.last_hidden_state
+        logits = self.lm_head(hidden_states)
+        loss = (
+            self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+            if labels is not None
+            else None
+        )
+        return Qwen25OmniTextModelOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+class MiDashengLMModel(PreTrainedModel):
+    config_class = MiDashengLMConfig
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
+    _supports_static_cache = True
+    supports_gradient_checkpointing = True
+    def __init__(self, config: MiDashengLMConfig):
+        super().__init__(config)
+        self.audio_token_id = config.audio_token_id
+        self.audio_encoder = DashengAudioTransformer._from_config(
+            config.audio_encoder_config,
+        )
+        self.audio_projector = AudioProjectorSubsample(
+            self.audio_encoder.embed_dim,
+            config.text_config.hidden_size,
+            config.subsample_factor,
+        )
+        self.decoder = Qwen25OmniThinkerTextOnlyDecoder._from_config(
+            config.text_config,
+            attn_implementation=config._attn_implementation,
+        )
+        self.post_init()
+    def get_input_embeddings(self):
+        return self.decoder.model.embed_tokens
+    def get_output_embeddings(self):
+        return self.decoder.lm_head
+    def _forward_audio_encoder(
+        self,
+        audios: torch.Tensor,
+        audio_length: Optional[Iterable[int]],
+    ) -> torch.Tensor:
+        encoder_out, encoder_atts = self.audio_encoder(audios, audio_length)
+        # audio projector
+        encoder_out, encoder_atts = self.audio_projector(encoder_out, encoder_atts)
+        return encoder_out
+    def _prepare_inputs_embeds(
+        self,
+        input_ids: Optional[torch.Tensor],
+        input_values: Optional[torch.Tensor],
+        inputs_embeds: Optional[torch.Tensor],
+        audio_length: Optional[Iterable[int]] = None,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            if inputs_embeds is not None:
+                raise ValueError(
+                    "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
+                )
+            inputs_embeds = cast(
+                torch.Tensor, self.decoder.model.embed_tokens(input_ids)
+            )
+            if input_values is not None:
+                if self.audio_token_id is None:
+                    raise ValueError(
+                        "Audio input is provided, but `audio_token_id` is not configured."
+                    )
+                audio_embeddings = self._forward_audio_encoder(
+                    input_values,
+                    audio_length=audio_length,
+                ).to(inputs_embeds.dtype)
+                audio_mask = (input_ids == self.audio_token_id).flatten()
+                diff = torch.diff(
+                    audio_mask.long(),
+                    prepend=torch.zeros(
+                        (1,),
+                        dtype=torch.long,
+                        device=audio_mask.device,
+                    ),
+                )
+                audio_span_starts = (diff == 1).nonzero()
+                audio_span_ends = (diff == -1).nonzero()
+                embeds_view = inputs_embeds.view(-1, inputs_embeds.shape[-1])
+                for span_start, span_end, audio in zip(
+                    audio_span_starts,
+                    audio_span_ends,
+                    audio_embeddings,
+                    strict=True,
+                ):
+                    embeds_view[span_start:span_end] = audio[: span_end - span_start]
+        else:
+            if inputs_embeds is None:
+                raise ValueError(
+                    "Either `input_ids` or `inputs_embeds` must be passed."
+                )
+            if input_values is not None:
+                raise ValueError(
+                    "Cannot pass `input_values` when `inputs_embeds` is provided."
+                )
+        return inputs_embeds
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        input_values: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        audio_length: Optional[Iterable[int]] = None,
+        attention_mask: Optional[Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ):
+        """Build the input embeddings, then delegate to the wrapped decoder.
+
+        ``input_ids``, ``input_values``, ``inputs_embeds`` and ``audio_length``
+        are handed to ``self._prepare_inputs_embeds``. The embeddings it
+        returns are what the decoder receives; the decoder's own ``input_ids``
+        argument is always ``None``. ``attention_mask``, ``position_ids``,
+        ``labels`` and any extra ``kwargs`` are forwarded unchanged.
+
+        Returns:
+            Whatever ``self.decoder(...)`` returns.
+        """
+        embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+        )
+        # The decoder only ever sees embeddings, never raw token ids.
+        decoder_inputs = dict(
+            input_ids=None,
+            inputs_embeds=embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            labels=labels,
+        )
+        return self.decoder(**decoder_inputs, **kwargs)
+    def generate(
+        self,
+        input_ids: Optional[Tensor] = None,
+        input_values: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        audio_length: Optional[Iterable[int]] = None,
+        **kwargs,
+    ):
+        """Build the input embeddings and run the decoder's ``generate``.
+
+        The four named arguments are passed to ``self._prepare_inputs_embeds``.
+        A ``generation_config`` entry in ``kwargs`` takes precedence over
+        ``self.generation_config``; all remaining ``kwargs`` are forwarded to
+        ``self.decoder.generate`` unchanged.
+
+        Returns:
+            Whatever ``self.decoder.generate(...)`` returns.
+        """
+        embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+        )
+        # Pull the config out of kwargs so it is not passed twice below.
+        generation_config = kwargs.pop("generation_config", self.generation_config)
+        return self.decoder.generate(
+            inputs_embeds=embeds,
+            generation_config=generation_config,
+            **kwargs,
+        )

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "auto_map": {
+    "AutoProcessor": "processing_midashenglm.MiDashengLMProcessor"
+  },
+  "do_normalize": false,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "MiDashengLMProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}

processing_midashenglm.py ADDED Viewed

	@@ -0,0 +1,277 @@

+from typing import Dict, List, Optional, Union, cast
+import numpy as np
+import torch
+from transformers import Qwen2Tokenizer, Qwen2TokenizerFast, Wav2Vec2FeatureExtractor
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin
+from typing_extensions import Unpack
+class MiDashengLMProcessorKwargs(ProcessingKwargs):
+    """Default per-modality keyword arguments for ``MiDashengLMProcessor``."""
+    # Text is padded by default, on the left; audio has no defaults here.
+    _defaults = dict(  # type: ignore
+        text_kwargs=dict(padding=True, padding_side="left"),
+        audio_kwargs=dict(),
+    )
+def calculate_mel_frames_dasheng(
+    audio_length_samples: int,
+    n_fft: int = 512,
+    hop_size: int = 160,
+    dasheng_subsampling: int = 4,
+    center: bool = True,
+    model_subsampling: int = 5,
+) -> int:
+    """Return the subsampled frame count for a clip of the given length.
+
+    The clip is framed with window ``n_fft`` and hop ``hop_size``; the frame
+    count is then floor-divided by ``dasheng_subsampling`` and by
+    ``model_subsampling``.
+
+    Args:
+        audio_length_samples: Clip length in samples.
+        n_fft: Analysis window length in samples.
+        hop_size: Step between consecutive frames in samples.
+        dasheng_subsampling: First floor-division factor.
+        center: If true, the clip is treated as ``n_fft`` samples longer
+            before framing. NOTE(review): presumably mirrors centred-STFT
+            padding in the audio encoder; confirm against the model code.
+        model_subsampling: Second floor-division factor.
+
+    Returns:
+        The number of frames after both subsampling steps; never negative,
+        even when the clip is shorter than one window and ``center`` is false.
+    """
+    if center:
+        audio_length_samples = audio_length_samples + n_fft
+    # Integer floor division avoids float rounding; ``int`` keeps the result an
+    # integer even if a float length is passed in.
+    num_frames = 1 + int((audio_length_samples - n_fft) // hop_size)
+    # A clip shorter than one window yields zero frames, not a negative count.
+    num_frames = max(num_frames, 0)
+    return num_frames // dasheng_subsampling // model_subsampling
+class MiDashengLMProcessor(ProcessorMixin):
+    """Combine a ``Wav2Vec2FeatureExtractor`` and a Qwen2 tokenizer.
+
+    ``__call__`` tokenizes text and, when audio is given, pads the raw
+    waveforms with the feature extractor and expands each audio token in the
+    text to as many copies as ``calculate_mel_frames_dasheng`` reports for the
+    matching clip.
+    """
+    attributes = ["feature_extractor", "tokenizer"]
+    valid_kwargs = [
+        "chat_template",
+        "audio_token",
+        "audio_bos_token",
+        "audio_eos_token",
+    ]
+    feature_extractor_class = "Wav2Vec2FeatureExtractor"
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+    def __init__(
+        self,
+        feature_extractor: Wav2Vec2FeatureExtractor,
+        tokenizer: Union[Qwen2Tokenizer, Qwen2TokenizerFast],
+        model_subsampling: int = 5,
+        chat_template: Optional[Union[str, Dict[str, str]]] = None,
+        audio_token: Optional[str] = None,
+        audio_bos_token: Optional[str] = None,
+        audio_eos_token: Optional[str] = None,
+    ):
+        """Store the special audio tokens and the wrapped components.
+
+        Args:
+            feature_extractor: Audio feature extractor; must have
+                ``do_normalize`` disabled.
+            tokenizer: Tokenizer; supplies any audio token that is not passed
+                explicitly, and the chat template when none is given.
+            model_subsampling: Forwarded to ``calculate_mel_frames_dasheng``
+                when audio tokens are expanded.
+            chat_template: Chat template; defaults to
+                ``tokenizer.chat_template``.
+            audio_token: Token marking an audio clip in the text.
+            audio_bos_token: Token placed before an expanded audio span.
+            audio_eos_token: Token placed after an expanded audio span.
+
+        Raises:
+            AssertionError: If a token is neither passed nor an attribute of
+                the tokenizer, or if the feature extractor normalizes input.
+            ValueError: If a token must be read from the tokenizer but the
+                tokenizer lacks it.
+            TypeError: If a token read from the tokenizer is not a string.
+        """
+        assert audio_token is not None or hasattr(tokenizer, "audio_token"), (
+            "Either `audio_token` must be provided or tokenizer must have `audio_token` attribute."
+        )
+        assert audio_bos_token is not None or hasattr(tokenizer, "audio_bos_token"), (
+            "Either `audio_bos_token` must be provided or tokenizer must have `audio_bos_token` attribute."
+        )
+        assert audio_eos_token is not None or hasattr(tokenizer, "audio_eos_token"), (
+            "Either `audio_eos_token` must be provided or tokenizer must have `audio_eos_token` attribute."
+        )
+        assert not feature_extractor.do_normalize, (
+            "This model does not use normalization. Please set `do_normalize=False` in the feature extractor."
+        )
+        if chat_template is None:
+            chat_template = tokenizer.chat_template
+        def get_token(token_name: str) -> str:
+            # Read a special token from the tokenizer and check it is a string.
+            if not hasattr(tokenizer, token_name):
+                raise ValueError(
+                    f"Tokenizer does not have attribute `{token_name}`. "
+                    "Please provide it as an argument to the processor."
+                )
+            token = getattr(tokenizer, token_name)
+            if not isinstance(token, str):
+                raise TypeError(
+                    f"Expected token {token_name} to be a string, but got {type(token)}."
+                )
+            return token
+        # Explicit arguments win; the tokenizer is only consulted as fallback.
+        self.audio_token = audio_token or get_token("audio_token")
+        self.audio_bos_token = audio_bos_token or get_token("audio_bos_token")
+        self.audio_eos_token = audio_eos_token or get_token("audio_eos_token")
+        self.audio_token_id = cast(
+            int, tokenizer.convert_tokens_to_ids(self.audio_token)
+        )
+        self.model_subsampling = model_subsampling
+        self.sampling_rate = feature_extractor.sampling_rate
+        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
+        # Annotation-only declarations for type checkers; no runtime effect.
+        self.feature_extractor: Wav2Vec2FeatureExtractor
+        self.tokenizer: Union[Qwen2Tokenizer, Qwen2TokenizerFast]
+        self.chat_template: Optional[Union[str, Dict[str, str]]]
+    def _process_messages_for_chat_template(
+        self,
+        conversation,
+        batch_images,
+        batch_videos,
+        batch_video_metadata,
+        **mm_load_kwargs,
+    ):
+        """Check the requested sampling rate, then defer to the parent class.
+
+        Raises:
+            ValueError: If ``mm_load_kwargs`` carries a ``sampling_rate`` that
+                differs from the feature extractor's sampling rate.
+        """
+        if (sr := mm_load_kwargs.get("sampling_rate", None)) is not None:
+            if sr != self.sampling_rate:
+                raise ValueError(
+                    f"This model is trained with a sampling rate of {self.sampling_rate}, "
+                    f"but the sampling rate {sr} is used to load audio."
+                )
+        return super()._process_messages_for_chat_template(
+            conversation,
+            batch_images,
+            batch_videos,
+            batch_video_metadata,
+            **mm_load_kwargs,
+        )
+    @classmethod
+    def _validate_audio_sample(
+        cls,
+        sample: Union[np.ndarray, torch.Tensor],
+    ) -> np.ndarray:
+        """Return one audio clip as a 1D numpy array.
+
+        Raises:
+            ValueError: If the array or tensor is not one-dimensional.
+            TypeError: If ``sample`` is a string or any other unsupported type.
+        """
+        if isinstance(sample, torch.Tensor):
+            if sample.ndim != 1:
+                raise ValueError("Audio tensor must be 1D.")
+            # ``Tensor.numpy()`` alone raises for tensors that require grad or
+            # live on a non-CPU device; detach and move to CPU first.
+            return sample.detach().cpu().numpy()
+        if isinstance(sample, np.ndarray):
+            if sample.ndim != 1:
+                raise ValueError("Audio array must be 1D.")
+            return sample
+        if isinstance(sample, str):
+            # When passing audio paths through `apply_chat_template`, transformers
+            # will attempt to load the audio file, but only succeeds if the path
+            # is a valid URL (starting with http:// or https://) or an existing local
+            # file. Otherwise, the string is passed as-is. This captures that case and
+            # raises an error to inform the user.
+            raise TypeError(
+                "Expected audio to be a numpy array or torch tensor, but got a string. "
+                "If you passed audios through `apply_chat_template`, "
+                "make sure the audio paths are valid URLs starting with http:// or https://, "
+                "or existing local files."
+            )
+        raise TypeError(
+            f"Expected audio to be a numpy array, torch tensor, or string, but got {type(sample)}."
+        )
+    def _expand_audio_tokens(self, sample: str, audio_lengths: List[int]) -> str:
+        """Expand every audio token in ``sample`` to its per-clip frame count.
+
+        ``audio_lengths`` holds one clip length per audio token in ``sample``,
+        in order of appearance. An expanded span is wrapped in the audio
+        bos/eos tokens only when neither of them already adjoins the token.
+        """
+        # Splitting on the token avoids a sentinel string that could collide
+        # with text the user wrote.
+        segments = sample.split(self.audio_token)
+        pieces = [segments[0]]
+        for audio_length, before, after in zip(
+            audio_lengths, segments[:-1], segments[1:]
+        ):
+            num_audio_tokens = calculate_mel_frames_dasheng(
+                audio_length, model_subsampling=self.model_subsampling
+            )
+            expanded_audio_token = self.audio_token * num_audio_tokens
+            has_bos = before.endswith(self.audio_bos_token)
+            has_eos = after.startswith(self.audio_eos_token)
+            if not has_bos and not has_eos:
+                expanded_audio_token = (
+                    self.audio_bos_token
+                    + expanded_audio_token
+                    + self.audio_eos_token
+                )
+            pieces.append(expanded_audio_token)
+            pieces.append(after)
+        return "".join(pieces)
+    def __call__(
+        self,
+        text: Optional[Union[str, List[str]]] = None,
+        audio: Optional[
+            Union[np.ndarray, torch.Tensor, List[np.ndarray], List[torch.Tensor]]
+        ] = None,
+        **kwargs: Unpack[MiDashengLMProcessorKwargs],
+    ) -> BatchFeature:
+        """Tokenize ``text`` and, if given, prepare ``audio`` for the model.
+
+        Args:
+            text: A prompt or a list of prompts. With audio, the prompts must
+                contain exactly one audio token per clip.
+            audio: One 1D waveform, or a sequence of 1D waveforms (numpy arrays
+                or torch tensors), matched to audio tokens in order.
+            **kwargs: Processing options merged with the defaults of
+                ``MiDashengLMProcessorKwargs``.
+
+        Returns:
+            A ``BatchFeature`` with the tokenizer outputs and, when audio is
+            given, the feature extractor outputs (without ``attention_mask``)
+            plus ``audio_length``. ``return_tensors`` defaults to ``"pt"``.
+
+        Raises:
+            ValueError: If ``text`` is missing or not a string or list of
+                strings, if images or videos are passed, or if the number of
+                audio tokens differs from the number of clips.
+            TypeError: If an audio sample has an unsupported type.
+        """
+        if text is None:
+            raise ValueError("You need to specify `text` input to process.")
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, (list, tuple)) or not all(
+            isinstance(sample, str) for sample in text
+        ):
+            raise ValueError(
+                "Invalid input text. Please provide a string, or a list of strings"
+            )
+        if (
+            kwargs.get("images", None) is not None
+            or kwargs.get("videos", None) is not None
+        ):
+            raise ValueError("This model does not support images or videos.")
+        output_kwargs = self._merge_kwargs(
+            MiDashengLMProcessorKwargs,  # type: ignore # Bad type hint in transformers
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if audio is not None:
+            # A bare 1D waveform is a single clip; wrap it so it is not
+            # iterated element by element below.
+            if isinstance(audio, (np.ndarray, torch.Tensor)) and audio.ndim == 1:
+                audio = [audio]
+            audio = [self._validate_audio_sample(sample) for sample in audio]
+            # Ensure there are as many audio clips as audio tokens.
+            audios_per_sample = [sample.count(self.audio_token) for sample in text]
+            num_audio_tokens = sum(audios_per_sample)
+            num_audios = len(audio)
+            if num_audio_tokens != num_audios:
+                raise ValueError(
+                    f"Found {num_audio_tokens} {self.audio_token} token{'s' if num_audio_tokens > 1 else ''} in provided text but received {num_audios} audio{'s' if num_audios > 1 else ''}"
+                )
+            # These kwargs are forced: the attention mask is needed to recover
+            # the clip lengths used to expand the audio tokens below.
+            output_kwargs["audio_kwargs"]["return_attention_mask"] = True
+            output_kwargs["audio_kwargs"]["padding"] = True
+            output_kwargs["audio_kwargs"]["return_tensors"] = "pt"
+            audio_inputs = self.feature_extractor(
+                audio,
+                sampling_rate=self.sampling_rate,
+                **output_kwargs["audio_kwargs"],
+            )
+            # Remove the attention mask; the model takes per-clip lengths.
+            audio_feature_mask = audio_inputs.pop("attention_mask")
+            audio_lengths = audio_feature_mask.sum(-1).tolist()
+            audio_inputs["audio_length"] = torch.tensor(audio_lengths).long()
+            expanded_text = []
+            offset = 0
+            for sample, num_clips in zip(text, audios_per_sample):
+                expanded_text.append(
+                    self._expand_audio_tokens(
+                        sample, audio_lengths[offset : offset + num_clips]
+                    )
+                )
+                offset += num_clips
+            text = expanded_text
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", "pt")
+        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(
+            text,
+            BatchFeature(inputs),  # type: ignore
+            modalities=["audio"],
+        )
+        if audio is not None:
+            inputs.update(audio_inputs)
+        return BatchFeature(data={**inputs}, tensor_type=return_tensors)
+    @property
+    def model_input_names(self):
+        """Tokenizer and feature extractor input names plus ``audio_length``.
+
+        Duplicates are dropped while the first-seen order is kept.
+        """
+        tokenizer_input_names = self.tokenizer.model_input_names
+        feature_extractor_input_names = self.feature_extractor.model_input_names
+        return list(
+            dict.fromkeys(
+                tokenizer_input_names + feature_extractor_input_names + ["audio_length"]
+            )
+        )

processor_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "audio_bos_token": "<|audio_bos|>",
+  "audio_eos_token": "<|audio_eos|>",
+  "audio_token": "<|AUDIO|>",
+  "auto_map": {
+    "AutoProcessor": "processing_midashenglm.MiDashengLMProcessor"
+  },
+  "model_subsampling": 5,
+  "processor_class": "MiDashengLMProcessor"
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|en|>",
+    "<|kr|>",
+    "<|de|>",
+    "<|es|>",
+    "<|fr|>",
+    "<|hi|>",
+    "<|uk|>",
+    "<|th|>",
+    "<|vi|>",
+    "<|nl|>",
+    "<|pt|>",
+    "<|id|>",
+    "<|ru|>",
+    "<|it|>",
+    "<|ar|>",
+    "<|jp|>",
+    "<|unknown|>"
+  ],
+  "audio_bos_token": "<|audio_bos|>",
+  "audio_eos_token": "<|audio_eos|>",
+  "audio_token": "<|AUDIO|>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<|IMAGE|>",
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "video_token": "<|VIDEO|>",
+  "vision_bos_token": "<|vision_bos|>",
+  "vision_eos_token": "<|vision_eos|>"
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c40343a9d670f4fadbe6415ed2cff441055f663e51d813f2315c3368399914d5
+size 11424986

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,365 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|AUDIO|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|audio_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|audio_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_bos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_eos|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|IMAGE|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|VIDEO|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<|en|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "<|kr|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<|de|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<|es|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "<|fr|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<|hi|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "<|uk|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<|th|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "<|vi|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<|nl|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151675": {
+      "content": "<|pt|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151676": {
+      "content": "<|id|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151677": {
+      "content": "<|ru|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151678": {
+      "content": "<|it|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151679": {
+      "content": "<|ar|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151680": {
+      "content": "<|jp|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151681": {
+      "content": "<|unknown|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|en|>",
+    "<|kr|>",
+    "<|de|>",
+    "<|es|>",
+    "<|fr|>",
+    "<|hi|>",
+    "<|uk|>",
+    "<|th|>",
+    "<|vi|>",
+    "<|nl|>",
+    "<|pt|>",
+    "<|id|>",
+    "<|ru|>",
+    "<|it|>",
+    "<|ar|>",
+    "<|jp|>",
+    "<|unknown|>"
+  ],
+  "audio_bos_token": "<|audio_bos|>",
+  "audio_eos_token": "<|audio_eos|>",
+  "audio_token": "<|AUDIO|>",
+  "auto_map": {
+    "AutoProcessor": "processing_midashenglm.MiDashengLMProcessor"
+  },
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "audio_bos_token": "<|audio_bos|>",
+    "audio_eos_token": "<|audio_eos|>",
+    "audio_token": "<|AUDIO|>",
+    "image_token": "<|IMAGE|>",
+    "video_token": "<|VIDEO|>",
+    "vision_bos_token": "<|vision_bos|>",
+    "vision_eos_token": "<|vision_eos|>"
+  },
+  "image_token": "<|IMAGE|>",
+  "model_max_length": 32768,
+  "pad_token": "<|endoftext|>",
+  "processor_class": "MiDashengLMProcessor",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<|VIDEO|>",
+  "vision_bos_token": "<|vision_bos|>",
+  "vision_eos_token": "<|vision_eos|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff