Upload folder using huggingface_hub

- .gitattributes +1 -0
- chat_template.jinja +171 -0
- config.json +55 -0
- configuration_rwkv079qwen3.py +238 -0
- generation_config.json +10 -0
- modeling_rwkv079qwen3.py +1063 -0
- special_tokens_map.json +23 -0
- tokenization_rwkv079qwen3.py +4 -0
- tokenization_rwkv079qwen3_fast.py +4 -0
- tokenizer.json +3 -0
- tokenizer_config.json +1035 -0
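
For reference, a minimal sketch of pulling this same snapshot locally with huggingface_hub (the repo id below is a placeholder, since the commit page does not show it):

from huggingface_hub import snapshot_download

# Placeholder repo id; substitute the actual model repository.
local_dir = snapshot_download("your-org/rwkv079qwen3-checkpoint")
# local_dir now contains config.json, modeling_rwkv079qwen3.py, tokenizer.json, ...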
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,171 @@
{# ---------- special token variables ---------- #}
{%- set bos_token = '<seed:bos>' -%}
{%- set eos_token = '<seed:eos>' -%}
{%- set pad_token = '<seed:pad>' -%}
{%- set toolcall_begin_token = '<seed:tool_call>' -%}
{%- set toolcall_end_token = '</seed:tool_call>' -%}
{%- set think_begin_token = '<seed:think>' -%}
{%- set think_end_token = '</seed:think>' -%}
{%- set budget_begin_token = '<seed:cot_budget_reflect>'-%}
{%- set budget_end_token = '</seed:cot_budget_reflect>'-%}
{# -------------- reflection-interval lookup -------------- #}
{%- if not thinking_budget is defined %}
{%- set thinking_budget = -1 -%}
{%- endif -%}
{%- set budget_reflections_v05 = {
    0: 0,
    512: 128,
    1024: 256,
    2048: 512,
    4096: 512,
    8192: 1024,
    16384: 1024
} -%}
{# Find the first budget tier that is greater than or equal to the thinking_budget. #}
{%- set ns = namespace(interval = None) -%}
{%- for k, v in budget_reflections_v05 | dictsort -%}
{%- if ns.interval is none and thinking_budget <= k -%}
{%- set ns.interval = v -%}
{%- endif -%}
{%- endfor -%}
{# If the budget exceeds the largest tier, use the value of the last tier. #}
{%- if ns.interval is none -%}
{%- set ns.interval = budget_reflections_v05[16384] -%}
{%- endif -%}
{# ---------- Preprocess the system message ---------- #}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}
{# ---------- Ensure tools exist ---------- #}
{%- if not tools is defined or tools is none %}
{%- set tools = [] %}
{%- endif %}
{# tools2doc.jinja #}
{%- macro py_type(t) -%}
{%- if t == "string" -%}str
{%- elif t in ("number", "integer") -%}int
{%- elif t == "boolean" -%}bool
{%- elif t == "array" -%}list
{%- else -%}Any{%- endif -%}
{%- endmacro -%}
{# ---------- Output the system block ---------- #}
{%- if system_message is defined %}
{{ bos_token + "system\n" + system_message }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{ bos_token + "system\nYou are Doubao, a helpful AI assistant. You may call one or more functions to assist with the user query." }}
{%- endif %}
{%- endif %}
{%- if use_json_tooldef is defined and use_json_tooldef %}

{{"Tool List:\nYou are authorized to use the following tools (described in JSON Schema format). Before performing any task, you must decide how to call them based on the descriptions and parameters of these tools."}}
{{ tools | tojson(ensure_ascii=False) }}
{%- else %}
{%- for item in tools if item.type == "function" %}


Function:
def {{ item.function.name }}(
{%- for name, spec in item.function.parameters.properties.items() %}
{{- name }}: {{ py_type(spec.type) }}{% if not loop.last %},{% endif %}
{%- endfor %}):
"""
{{ item.function.description | trim }}

{# ---------- Args ---------- #}
{%- if item.function.parameters.properties %}
Args:
{%- for name, spec in item.function.parameters.properties.items() %}

- {{ name }} ({{ py_type(spec.type) }})
{%- if name in item.function.parameters.required %} [必填]{% else %} [选填]{% endif %}:
{{- " " ~ (spec.description or "") }}
{%- endfor %}
{%- endif %}

{# ---------- Returns ---------- #}
{%- if item.function.returns is defined
    and item.function.returns.properties is defined
    and item.function.returns.properties %}
Returns:
{%- for name, spec in item.function.returns.properties.items() %}

- {{ name }} ({{ py_type(spec.type) }}):
{{- " " ~ (spec.description or "") }}
{%- endfor %}
{%- endif %}

"""
{%- endfor %}
{%- endif %}
{%- if tools is iterable and tools | length > 0 %}

{{"工具调用请遵循如下格式:\n<seed:tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>This is the value for the second parameter\nthat can span\nmultiple lines</parameter>\n</function>\n</seed:tool_call>\n"}}
{%- endif %}
{# End the system block line #}
{%- if system_message is defined or tools is iterable and tools | length > 0 %}
{{ eos_token }}
{%- endif %}
{# ---------- Thinking Budget ---------- #}
{%- if thinking_budget is defined %}
{%- if thinking_budget == 0 %}
{{ bos_token+"system" }}
{{ "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions." }}
{{ eos_token }}
{%- elif not thinking_budget == -1 %}
{{ bos_token+"system" }}
{{ "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "}}{{thinking_budget}}{{". That is, you need to complete your thinking within "}}{{thinking_budget}}{{" tokens and start answering the user's questions. You will reflect on your thinking process every "}}{{ns.interval}}{{" tokens, stating how many tokens have been used and how many are left."}}
{{ eos_token }}
{%- endif %}
{%- endif %}
{# ---------- List the historical messages one by one ---------- #}
{%- for message in loop_messages %}
{%- if message.role == "assistant"
    and message.tool_calls is defined
    and message.tool_calls is iterable
    and message.tool_calls | length > 0 %}
{{ bos_token + message.role }}
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
{{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
{%- endif %}
{%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
{{ "\n" + message.content | trim + "\n" }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}{% set tool_call = tool_call.function %}{% endif %}
{{ "\n" + toolcall_begin_token + "\n<function=" + tool_call.name + ">\n" }}
{%- if tool_call.arguments is defined %}
{%- for arg_name, arg_value in tool_call.arguments | items %}
{{ "<parameter=" + arg_name + ">" }}
{%- set arg_value = arg_value if arg_value is string else arg_value | string %}
{{ arg_value+"</parameter>\n" }}
{%- endfor %}
{%- endif %}
{{ "</function>\n" + toolcall_end_token }}
{%- endfor %}
{{ eos_token }}
{%- elif message.role in ["user", "system"] %}
{{ bos_token + message.role + "\n" + message.content + eos_token }}
{%- elif message.role == "assistant" %}
{{ bos_token + message.role }}
{%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
{{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
{%- endif %}
{%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
{{ "\n" + message.content | trim + eos_token }}
{%- endif %}
{# Include the tool role #}
{%- else %}
{{ bos_token + message.role + "\n" + message.content + eos_token }}
{%- endif %}
{%- endfor %}
{# ---------- Control the model to start continuation ---------- #}
{%- if add_generation_prompt %}
{{ bos_token+"assistant\n" }}
{%- if thinking_budget == 0 %}
{{ think_begin_token + "\n" + budget_begin_token + "The current thinking budget is 0, so I will directly start answering the question." + budget_end_token + "\n" + think_end_token }}
{%- endif %}
{%- endif %}
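
A few notes on the template above: the Chinese literals render into the prompt as written, with [必填] marking a tool parameter as required and [选填] as optional, and the 工具调用请遵循如下格式 block telling the model "tool calls must follow the following format". The thinking_budget variable is not a standard chat-template input; with recent transformers versions, extra keyword arguments to apply_chat_template are made available to the template, so a sketch of exercising the budget logic (placeholder repo id) could look like:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-org/rwkv079qwen3-checkpoint", trust_remote_code=True)  # placeholder repo id
messages = [{"role": "user", "content": "What is 17 * 24?"}]
# Extra kwargs such as thinking_budget are visible inside the Jinja template.
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    thinking_budget=1024,  # the lookup table above maps 1024 to a 256-token reflection interval
)
print(prompt)  # includes the extra <seed:bos>system ... budget block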
config.json ADDED
@@ -0,0 +1,55 @@
{
  "architectures": [
    "RWKV079Qwen3ForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_rwkv079qwen3.RWKV079Qwen3Config",
    "AutoModelForCausalLM": "modeling_rwkv079qwen3.RWKV079Qwen3ForCausalLM"
  },
  "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
  "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
  "model_revision": "alpha",
  "transformer_layers": [5, 11, 17, 23, 28, 33, 38, 43, 48, 53, 58, 63],
  "rwkv_layers": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 24, 25, 26, 27, 29, 30, 31, 32, 34, 35, 36, 37, 39, 40, 41, 42, 44, 45, 46, 47, 49, 50, 51, 52, 54, 55, 56, 57, 59, 60, 61, 62],
  "rwkv_architecture": "hxa079",
  "enable_qk_norm": false,
  "nope_in_transformer": true,
  "nope_in_rwkv": false,
  "lora_rank_decay": 448,
  "lora_rank_iclr": 192,
  "lora_rank_value_residual_mix": 128,
  "lora_rank_key_residual_mix": 128,
  "lora_rank_gate": 576,
  "use_rope": true,
  "attention_bias": true,
  "attention_dropout": 0.1,
  "attention_out_bias": false,
  "bos_token_id": 0,
  "pad_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 27648,
  "max_position_embeddings": 524288,
  "mlp_bias": false,
  "model_type": "rwkv079qwen3",
  "num_attention_heads": 80,
  "num_hidden_layers": 64,
  "num_key_value_heads": 8,
  "residual_dropout": 0.1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "rope_type": "default"
  },
  "rope_theta": 10000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "vocab_size": 155136
}
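
Because auto_map routes AutoConfig/AutoModelForCausalLM to the custom modules uploaded alongside, loading requires trust_remote_code=True. Of the 64 layers, the 12 listed in transformer_layers use softmax attention and the remaining 52 take the RWKV hxa079 path. A minimal loading sketch (placeholder repo id):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "your-org/rwkv079qwen3-checkpoint"  # placeholder repo id
tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,      # required so auto_map can import the custom classes
)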
configuration_rwkv079qwen3.py ADDED
@@ -0,0 +1,238 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""RWKV079Qwen3 model configuration"""

from transformers.configuration_utils import PretrainedConfig, layer_type_validation
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RWKV079Qwen3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`RWKV079Qwen3Model`]. It is used to instantiate a
    RWKV079Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    Qwen3-7B-beta [Qwen/Qwen3-7B-beta](https://huggingface.co/Qwen/Qwen3-7B-beta).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 151936):
            Vocabulary size of the RWKV079Qwen3 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`RWKV079Qwen3Model`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22016):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 32):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
        lora_rank_decay (`int`, *optional*):
            The rank of the lora used to generate decay.
        lora_rank_iclr (`int`, *optional*):
            The rank of the lora used to generate the in-context learning rate.
        lora_rank_value_residual_mix (`int`, *optional*):
            The rank of the lora used to generate the value residual mix amount.
        lora_rank_gate (`int`, *optional*):
            The rank of the lora used to generate the gate.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`List[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        use_sliding_window (`bool`, *optional*, defaults to `False`):
            Whether to use sliding window attention.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
        max_window_layers (`int`, *optional*, defaults to 28):
            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import RWKV079Qwen3Model, RWKV079Qwen3Config

    >>> # Initializing a RWKV079Qwen3 style configuration
    >>> configuration = RWKV079Qwen3Config()

    >>> # Initializing a model from the RWKV079Qwen3-7B style configuration
    >>> model = RWKV079Qwen3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "rwkv079qwen3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        lora_rank_tokenshift=None,
        lora_rank_decay=None,
        lora_rank_iclr=None,
        lora_rank_value_residual_mix=None,
        lora_rank_value_key_mix=None,
        lora_rank_gate=None,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        use_rope=True,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        first_attention_layer=9999,
        first_post_attention_layer=9999,
        attention_striping=1,
        last_striping_layer=99999,
        layer_types=None,
        attention_dropout=0.0,
        attention_bias=True,
        attention_output_bias=False,
        gate_rank_type=2,
        balance_state=True,
        groupnorm_att=False,
        use_tokenshift=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers
        self.first_attention_layer = first_attention_layer
        self.first_post_attention_layer = first_post_attention_layer
        self.attention_striping = attention_striping
        self.last_striping_layer = last_striping_layer

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.lora_rank_tokenshift = lora_rank_tokenshift
        self.lora_rank_decay = lora_rank_decay
        self.lora_rank_iclr = lora_rank_iclr
        self.lora_rank_value_residual_mix = lora_rank_value_residual_mix
        self.lora_rank_gate = lora_rank_gate
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.use_rope = use_rope
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.layer_types = layer_types
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        self.attention_bias = attention_bias
        self.attention_output_bias = attention_output_bias
        self.gate_rank_type = gate_rank_type
        self.balance_state = balance_state
        self.groupnorm_att = groupnorm_att
        self.use_tokenshift = use_tokenshift

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
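
Note that the hybrid-routing keys in config.json (transformer_layers, rwkv_layers, rwkv_architecture, and the extra lora ranks) are not declared parameters of this class; they travel through **kwargs into PretrainedConfig, which stores unrecognized keys as plain attributes, so the modeling code can still read config.transformer_layers. A small sketch of that behavior:

# Sketch: undeclared config.json keys survive as attributes via the **kwargs passthrough.
cfg = RWKV079Qwen3Config(
    num_hidden_layers=64,
    transformer_layers=[5, 11, 17, 23, 28, 33, 38, 43, 48, 53, 58, 63],
)
print(cfg.transformer_layers[:3])  # [5, 11, 17]
print(cfg.layer_types[0])          # "full_attention" (sliding window is off by default)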
generation_config.json ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "bos_token_id": 0,
  "pad_token_id": 1,
  "eos_token_id": 2,
  "transformers_version": "4.55.0",
  "temperature": 1.1,
  "top_p": 0.95
}
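
One caveat: do_sample is not set here and defaults to false in transformers, so the temperature/top_p values above only take effect when a caller opts into sampling. Continuing the loading sketch above:

inputs = tok(prompt, return_tensors="pt").to(model.device)
out = model.generate(
    **inputs,
    max_new_tokens=256,
    do_sample=True,  # without this, generation is greedy and temperature/top_p are ignored
)
print(tok.decode(out[0], skip_special_tokens=True))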
modeling_rwkv079qwen3.py ADDED
@@ -0,0 +1,1063 @@
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
PyTorch RWKV079Qwen3 model.
base code from SmerkyG @ recursal.ai, featherless.ai
hxa079 implementation RWKV079 + NoPE Hybrid Attention

"""

import math
import inspect
from typing import List, Optional, Tuple, Union, Dict, Any

import torch
import torch.utils.checkpoint
from torch import nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, CacheLayerMixin
from transformers.generation import GenerationMixin
from transformers.integrations import use_kernel_forward_from_hub
from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_layers import (
    GenericForQuestionAnswering,
    GenericForSequenceClassification,
    GenericForTokenClassification,
    GradientCheckpointingLayer,
)
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
from transformers.utils.generic import check_model_inputs

from .configuration_rwkv079qwen3 import RWKV079Qwen3Config

from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3MLP, Qwen3RMSNorm, Qwen3Attention

class RWKV079State():
    def __init__(self) -> None:
        #super().__init__()
        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
        self.layer_kv_states: List[torch.Tensor] = []
        self.layer_shift_states: List[torch.Tensor] = []
        self.cumulative_scores: List[torch.Tensor] = []
        self.sin: List[torch.Tensor] = []
        self.cos: List[torch.Tensor] = []

    def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
        sequence length.
        """
        if layer_idx < len(self):
            return (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
        else:
            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")

    def __iter__(self):
        """
        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
        keys and values
        """
        for layer_idx in range(len(self)):
            yield (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
        to the number of layers in the model.
        """
        return len(self.layer_kv_states)

    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
        """Given the sequence length of the new inputs, returns the usable length of the cache."""
        # Linear Attention variants do not have a maximum length
        return new_seq_length

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        raise NotImplementedError('Cannot reorder Linear Attention state')

    def get_seq_length(self, layer_idx: int = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        return self._seen_tokens

    def get_max_cache_shape(self) -> Optional[int]:
        """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
        return None

    def get_max_length(self) -> Optional[int]:
        """
        Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length.
        """
        return None

    def crop(self, max_length: int):
        # can't implement this for linear attention variants
        return

    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
        """Return the length and offset of the cache, used to generate the mask"""
        kv_offset = 0
        query_length = cache_position.shape[0]
        past_seen_tokens = self.get_seq_length()
        kv_length = query_length + past_seen_tokens
        return kv_length, kv_offset

    @property
    def is_compileable(self) -> bool:
        """Return whether the cache is compileable"""
        return True #all(layer.is_compileable for layer in self.layers)

    @torch.no_grad
    def update(
        self,
        kv_state: torch.Tensor,
        shift_state: torch.Tensor,
        layer_idx: int,
        token_count: int = 0,
        is_attention_layer: bool = True,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Update the number of seen tokens
        if layer_idx == 0:
            if is_attention_layer:
                token_count = kv_state.size(-2)
            self._seen_tokens += token_count

        #print(f'self._seen_tokens = {self._seen_tokens} layer_idx = {layer_idx} is_attention_layer = {is_attention_layer} kv_state.size(-2) = {kv_state.size(-2)}')

        # Update the cache
        if kv_state is not None:
            # There may be skipped layers, fill them with empty lists
            if layer_idx >= len(self.layer_kv_states):
                for _ in range(len(self.layer_kv_states), layer_idx):
                    if is_attention_layer:
                        self.layer_kv_states.append(torch.tensor([], dtype=kv_state.dtype, device=kv_state.device))  # acts as key_cache
                        self.layer_shift_states.append(torch.tensor([], dtype=shift_state.dtype, device=shift_state.device))  # acts as value_cache
                    else:
                        self.layer_kv_states.append(torch.zeros_like(kv_state).requires_grad_(False))
                        self.layer_shift_states.append(torch.zeros_like(shift_state).requires_grad_(False))
                self.layer_kv_states.append(kv_state)  # acts as key_cache
                self.layer_shift_states.append(shift_state)  # acts as value_cache
            else:
                if is_attention_layer:
                    self.layer_kv_states[layer_idx] = torch.cat([self.layer_kv_states[layer_idx], kv_state], dim=-2)  # acts as key_cache
                    self.layer_shift_states[layer_idx] = torch.cat([self.layer_shift_states[layer_idx], shift_state], dim=-2)  # acts as value_cache
                else:
                    self.layer_kv_states[layer_idx].copy_(kv_state)
                    self.layer_shift_states[layer_idx].copy_(shift_state)

        return self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx]

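# [Annotation, not part of the uploaded file.] RWKV079State serves both layer kinds at
# once: for attention layers, update() concatenates kv_state/shift_state along the
# sequence dimension like an ordinary KV cache, while for RWKV layers it copy_()s a
# fixed-size recurrent state in place, so those layers use constant memory regardless
# of sequence length. _seen_tokens is only advanced at layer 0, once per forward pass.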
try:
    from fla.ops.rwkv7.chunk import chunk_rwkv7
    from fla.ops.rwkv7.fused_recurrent import fused_recurrent_rwkv7
except ImportError:
    print("Required module is not installed. Please install it using the following commands:")
    print("pip install --no-use-pep517 flash-linear-attention")
    print("Additionally, ensure you have at least version 2.2.0 of Triton installed:")
    print("pip install triton>=2.2.0")

# def is_layer_attention(config, layer_id):
#     return layer_id >= config.first_attention_layer and layer_id < config.first_post_attention_layer and (layer_id > min(config.num_hidden_layers, config.last_striping_layer) or (min(config.num_hidden_layers-1, config.last_striping_layer) - layer_id) % config.attention_striping == 0)

def is_layer_attention(config, layer_id):
    return layer_id in config.transformer_layers

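# [Annotation, not part of the uploaded file.] With the config.json above,
# is_layer_attention(config, 5) is True and is_layer_attention(config, 0) is False, so
# layers 5, 11, 17, ... run softmax attention while every other layer runs the RWKV
# hxa079 path. The commented-out variant derived the same routing from the striping
# parameters (first_attention_layer, attention_striping, ...) instead of an explicit list.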
| 191 |
+
def repeat_kv_rwkv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
| 192 |
+
"""
|
| 193 |
+
Repeat KV heads along the head dimension (GQA).
|
| 194 |
+
Input: (B, T, H_kv, D)
|
| 195 |
+
Output: (B, T, H_kv * n_rep, D)
|
| 196 |
+
"""
|
| 197 |
+
B, T, H_kv, D = hidden_states.shape
|
| 198 |
+
if n_rep == 1:
|
| 199 |
+
return hidden_states
|
| 200 |
+
# Expand head dim
|
| 201 |
+
hidden_states = hidden_states[:, :, :, None, :] # (B, T, H_kv, 1, D)
|
| 202 |
+
hidden_states = hidden_states.expand(B, T, H_kv, n_rep, D) # (B, T, H_kv, n_rep, D)
|
| 203 |
+
return hidden_states.reshape(B, T, H_kv * n_rep, D).contiguous()
|
| 204 |
+
|
| 205 |
+
def T5RMSNorm(hidden_states,weight,variance_epsilon:float=1e-6):
|
| 206 |
+
input_dtype = hidden_states.dtype
|
| 207 |
+
hidden_states = hidden_states.to(torch.float32)
|
| 208 |
+
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
| 209 |
+
hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
|
| 210 |
+
return (weight * hidden_states).to(input_dtype)
|
| 211 |
+
|
| 212 |
+
def compute_qwen3_rope_cache(seq_len, rotary_dim, device, dtype, rope_theta):
|
| 213 |
+
half_dim = rotary_dim // 2
|
| 214 |
+
freq_seq = torch.arange(half_dim, dtype=dtype, device=device)
|
| 215 |
+
inv_freq = 1.0 / (rope_theta ** (freq_seq / half_dim))
|
| 216 |
+
positions = torch.arange(seq_len, dtype=dtype, device=device)
|
| 217 |
+
freqs = torch.einsum("i,j->ij", positions, inv_freq)
|
| 218 |
+
emb = torch.cat([freqs, freqs], dim=-1)
|
| 219 |
+
cos = emb.cos()
|
| 220 |
+
sin = emb.sin()
|
| 221 |
+
return cos.unsqueeze(0), sin.unsqueeze(0), inv_freq
|
| 222 |
+
|
| 223 |
+
# def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
| 224 |
+
# """Applies Rotary Position Embedding to the query and key tensors.
|
| 225 |
+
|
| 226 |
+
# Args:
|
| 227 |
+
# q (`torch.Tensor`): The query tensor.
|
| 228 |
+
# k (`torch.Tensor`): The key tensor.
|
| 229 |
+
# cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
| 230 |
+
# sin (`torch.Tensor`): The sine part of the rotary embedding.
|
| 231 |
+
# position_ids (`torch.Tensor`, *optional*):
|
| 232 |
+
# Deprecated and unused.
|
| 233 |
+
# unsqueeze_dim (`int`, *optional*, defaults to 1):
|
| 234 |
+
# The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
| 235 |
+
# sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
| 236 |
+
# that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
| 237 |
+
# k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
| 238 |
+
# cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
| 239 |
+
# the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
| 240 |
+
# Returns:
|
| 241 |
+
# `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
| 242 |
+
# """
|
| 243 |
+
# cos = cos.unsqueeze(unsqueeze_dim)
|
| 244 |
+
# sin = sin.unsqueeze(unsqueeze_dim)
|
| 245 |
+
# q_embed = (q * cos) + (rotate_half(q) * sin)
|
| 246 |
+
# k_embed = (k * cos) + (rotate_half(k) * sin)
|
| 247 |
+
# return q_embed, k_embed
|
| 248 |
+
|
| 249 |
+
class Qwen3RotaryEmbedding(nn.Module):
|
| 250 |
+
def __init__(self, config: RWKV079Qwen3Config, device=None):
|
| 251 |
+
super().__init__()
|
| 252 |
+
# BC: "rope_type" was originally "type"
|
| 253 |
+
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
|
| 254 |
+
self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
|
| 255 |
+
else:
|
| 256 |
+
self.rope_type = "default"
|
| 257 |
+
self.max_seq_len_cached = config.max_position_embeddings
|
| 258 |
+
self.original_max_seq_len = config.max_position_embeddings
|
| 259 |
+
|
| 260 |
+
self.config = config
|
| 261 |
+
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
|
| 262 |
+
|
| 263 |
+
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 264 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 265 |
+
self.original_inv_freq = self.inv_freq
|
| 266 |
+
|
| 267 |
+
def _dynamic_frequency_update(self, position_ids, device):
|
| 268 |
+
"""
|
| 269 |
+
dynamic RoPE layers should recompute `inv_freq` in the following situations:
|
| 270 |
+
1 - growing beyond the cached sequence length (allow scaling)
|
| 271 |
+
2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
|
| 272 |
+
"""
|
| 273 |
+
seq_len = torch.max(position_ids) + 1
|
| 274 |
+
if seq_len > self.max_seq_len_cached: # growth
|
| 275 |
+
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
|
| 276 |
+
self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
|
| 277 |
+
self.max_seq_len_cached = seq_len
|
| 278 |
+
|
| 279 |
+
if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
|
| 280 |
+
# This .to() is needed if the model has been moved to a device after being initialized (because
|
| 281 |
+
# the buffer is automatically moved, but not the original copy)
|
| 282 |
+
self.original_inv_freq = self.original_inv_freq.to(device)
|
| 283 |
+
self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
|
| 284 |
+
self.max_seq_len_cached = self.original_max_seq_len
|
| 285 |
+
|
| 286 |
+
@torch.no_grad()
|
| 287 |
+
def forward(self, x, position_ids):
|
| 288 |
+
if "dynamic" in self.rope_type:
|
| 289 |
+
self._dynamic_frequency_update(position_ids, device=x.device)
|
| 290 |
+
|
| 291 |
+
# Core RoPE block
|
| 292 |
+
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
| 293 |
+
position_ids_expanded = position_ids[:, None, :].float()
|
| 294 |
+
# Force float32 (see https://github.com/huggingface/transformers/pull/29285)
|
| 295 |
+
device_type = x.device.type
|
| 296 |
+
device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
|
| 297 |
+
with torch.autocast(device_type=device_type, enabled=False):
|
| 298 |
+
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
| 299 |
+
emb = torch.cat((freqs, freqs), dim=-1)
|
| 300 |
+
cos = emb.cos()
|
| 301 |
+
sin = emb.sin()
|
| 302 |
+
|
| 303 |
+
# Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
|
| 304 |
+
cos = cos * self.attention_scaling
|
| 305 |
+
sin = sin * self.attention_scaling
|
| 306 |
+
|
| 307 |
+
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 308 |
+
|
| 309 |
+
def rms_norm(hidden_states, eps = 1e-6):
|
| 310 |
+
#print('ugyuugyu')
|
| 311 |
+
input_dtype = hidden_states.dtype
|
| 312 |
+
hidden_states = hidden_states.to(torch.float32)
|
| 313 |
+
variance = hidden_states.pow(2).mean(-1, keepdim=True)
|
| 314 |
+
hidden_states = hidden_states * torch.rsqrt(variance + eps)
|
| 315 |
+
return hidden_states.to(input_dtype)
|
| 316 |
+
|
| 317 |
+
def generate_rotary_embedding(max_seqlen:int, dim:int, theta:float = 10000.0, scale:float = 1):
|
| 318 |
+
#inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float).to(device) / dim))
|
| 319 |
+
|
| 320 |
+
angular_velocity = theta ** -(torch.arange(0, dim, 2, dtype=torch.float) / dim) / scale # frequencies from 1.0 ... 1/theta
|
| 321 |
+
angles = torch.outer(torch.arange(max_seqlen), angular_velocity)
|
| 322 |
+
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
| 323 |
+
emb = torch.cat((angles, angles), dim=-1)
|
| 324 |
+
return torch.stack([emb.cos(), emb.sin()], dim=0)
|
| 325 |
+
#return torch.polar(torch.ones_like(angles), angles)
|
| 326 |
+
|
| 327 |
+
# Copied from transformers.models.llama.modeling_llama.rotate_half
|
| 328 |
+
def rotate_half(x):
|
| 329 |
+
"""Rotates half the hidden dims of the input."""
|
| 330 |
+
x1 = x[..., : x.shape[-1] // 2]
|
| 331 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
| 332 |
+
return torch.cat((-x2, x1), dim=-1)
|
| 333 |
+
|
| 334 |
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
| 335 |
+
"""Applies Rotary Position Embedding to the query and key tensors.
|
| 336 |
+
|
| 337 |
+
Args:
|
| 338 |
+
q (`torch.Tensor`): The query tensor.
|
| 339 |
+
k (`torch.Tensor`): The key tensor.
|
| 340 |
+
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
| 341 |
+
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
| 342 |
+
position_ids (`torch.Tensor`, *optional*):
|
| 343 |
+
Deprecated and unused.
|
| 344 |
+
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
| 345 |
+
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
| 346 |
+
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
| 347 |
+
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
| 348 |
+
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
| 349 |
+
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
| 350 |
+
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
| 351 |
+
Returns:
|
| 352 |
+
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
| 353 |
+
"""
|
| 354 |
+
cos = cos.unsqueeze(unsqueeze_dim)
|
| 355 |
+
sin = sin.unsqueeze(unsqueeze_dim)
|
| 356 |
+
q_embed = (q * cos) + (rotate_half(q) * sin)
|
| 357 |
+
k_embed = (k * cos) + (rotate_half(k) * sin)
|
| 358 |
+
return q_embed, k_embed
|
| 359 |
+
|
| 360 |
+
def apply_rotary_pos_emb_single(x, cos, sin, unsqueeze_dim=1):
|
| 361 |
+
return (x * cos.unsqueeze(unsqueeze_dim)) + (rotate_half(x) * sin.unsqueeze(unsqueeze_dim))
|
| 362 |
+
|
| 363 |
+
from typing import Callable, Optional, Tuple, Union
|
| 364 |
+
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
| 365 |
+
from transformers.processing_utils import Unpack
|
| 366 |
+
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
| 367 |
+
|
| 368 |
+
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
| 369 |
+
"""
|
| 370 |
+
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
|
| 371 |
+
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
|
| 372 |
+
"""
|
| 373 |
+
batch, num_key_value_heads, slen, head_dim = hidden_states.shape
|
| 374 |
+
if n_rep == 1:
|
| 375 |
+
return hidden_states
|
| 376 |
+
hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
|
| 377 |
+
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
| 378 |
+
|
| 379 |
+
def eager_attention_forward(
|
| 380 |
+
module: nn.Module,
|
| 381 |
+
query: torch.Tensor,
|
| 382 |
+
key: torch.Tensor,
|
| 383 |
+
value: torch.Tensor,
|
| 384 |
+
attention_mask: Optional[torch.Tensor],
|
| 385 |
+
scaling: float,
|
| 386 |
+
dropout: float = 0.0,
|
| 387 |
+
**kwargs,
|
| 388 |
+
):
|
| 389 |
+
key_states = repeat_kv(key, module.num_key_value_groups)
|
| 390 |
+
value_states = repeat_kv(value, module.num_key_value_groups)
|
| 391 |
+
|
| 392 |
+
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
|
| 393 |
+
if attention_mask is not None:
|
| 394 |
+
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
|
| 395 |
+
attn_weights = attn_weights + causal_mask
|
| 396 |
+
|
| 397 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
|
| 398 |
+
attn_weights = attn_weights.masked_fill(attn_weights.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
|
| 399 |
+
attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
|
| 400 |
+
attn_output = torch.matmul(attn_weights, value_states)
|
| 401 |
+
attn_output = attn_output.transpose(1, 2).contiguous()
|
| 402 |
+
|
| 403 |
+
return attn_output, attn_weights
|
| 404 |
+
|
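# Editorial sketch (illustrative, not called anywhere): why the NaN masking in
# eager_attention_forward matters. A row that is fully masked to -inf (e.g.
# pure left padding in a batch) softmaxes to NaN; masked_fill resets it to 0.
def _nan_row_sketch():
    scores = torch.full((1, 3), float("-inf"))
    probs = torch.softmax(scores, dim=-1)
    assert probs.isnan().all()
    probs = probs.masked_fill(probs.isnan(), 0)
    assert (probs == 0).all()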
| 405 |
+
from torch.nn.attention.flex_attention import create_block_mask, flex_attention, create_mask
|
| 406 |
+
from functools import lru_cache
|
| 407 |
+
|
| 408 |
+
block_mask = None
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
|
| 413 |
+
is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
|
| 414 |
+
L, S = query.size(-2), key.size(-2)
|
| 415 |
+
scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
|
| 416 |
+
attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
|
| 417 |
+
if is_causal:
|
| 418 |
+
assert attn_mask is None
|
| 419 |
+
temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
|
| 420 |
+
attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
|
| 421 |
+
attn_bias = attn_bias.to(query.dtype)  # .to() is not in-place; assign the result
|
| 422 |
+
|
| 423 |
+
if attn_mask is not None:
|
| 424 |
+
if attn_mask.dtype == torch.bool:
|
| 425 |
+
attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
|
| 426 |
+
else:
|
| 427 |
+
attn_bias = attn_mask + attn_bias
|
| 428 |
+
|
| 429 |
+
if enable_gqa:
|
| 430 |
+
key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
|
| 431 |
+
value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
|
| 432 |
+
|
| 433 |
+
attn_weight = query.float() @ key.float().transpose(-2, -1) * scale_factor
|
| 434 |
+
attn_weight += attn_bias.float()
|
| 435 |
+
#attn_weight = stable_softmax(attn_weight, dim=-1)
|
| 436 |
+
attn_weight = torch.softmax(attn_weight, dim=-1)
|
| 437 |
+
attn_weight = attn_weight.masked_fill(attn_weight.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
|
| 438 |
+
#attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
|
| 439 |
+
return attn_weight @ value.float()
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
|
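# Editorial sketch (illustrative, not called anywhere): the float32 reference
# implementation above should agree with torch's fused SDPA on plain causal
# inputs; tensor sizes are assumptions for illustration.
def _sdpa_reference_sketch():
    q, k, v = (torch.randn(1, 2, 4, 8) for _ in range(3))
    ref = scaled_dot_product_attention(q, k, v, is_causal=True)
    fused = nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
    assert torch.allclose(ref, fused, atol=1e-5)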
| 443 |
+
class Qwen3AttentionNoPE_Causal(Qwen3Attention):
|
| 444 |
+
def forward(
|
| 445 |
+
self,
|
| 446 |
+
hidden_states: torch.Tensor,
|
| 447 |
+
frozen_residual: torch.Tensor,
|
| 448 |
+
v_first: Optional[torch.Tensor] = None,
|
| 449 |
+
k_first: Optional[torch.Tensor] = None,
|
| 450 |
+
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 451 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 452 |
+
past_key_values: Optional[Cache] = None,
|
| 453 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 454 |
+
**kwargs: Unpack[FlashAttentionKwargs],
|
| 455 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
|
| 456 |
+
x = hidden_states
|
| 457 |
+
|
| 458 |
+
B, L, D = x.size()
|
| 459 |
+
|
| 460 |
+
input_shape = x.shape[:-1]
|
| 461 |
+
hidden_shape = (*input_shape, -1, self.head_dim)
|
| 462 |
+
|
| 463 |
+
q = self.q_proj(x).view(hidden_shape).transpose(1, 2)
|
| 464 |
+
k = self.k_proj(x).view(hidden_shape).transpose(1, 2)
|
| 465 |
+
v = self.v_proj(x).view(hidden_shape).transpose(1, 2)
|
| 466 |
+
|
| 467 |
+
if past_key_values is not None:
|
| 468 |
+
# cache_position is needed for the static cache (this NoPE variant passes no sin/cos)
|
| 469 |
+
cache_kwargs = {"cache_position": cache_position}
|
| 470 |
+
k, v = past_key_values.update(k, v, self.layer_idx, cache_kwargs)
|
| 471 |
+
|
| 472 |
+
# repeat k/v heads if n_kv_heads < n_heads
|
| 473 |
+
k = repeat_kv(k, self.num_key_value_groups)
|
| 474 |
+
v = repeat_kv(v, self.num_key_value_groups)
|
| 475 |
+
|
| 476 |
+
S = k.size(-2)
|
| 477 |
+
|
| 478 |
+
y = nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=0.0, attn_mask=attention_mask, is_causal=attention_mask is None and L==S)
|
| 479 |
+
y = y.transpose(1,2)
|
| 480 |
+
y = y.reshape(*input_shape, -1)#.contiguous()
|
| 481 |
+
y = self.o_proj(y)
|
| 482 |
+
|
| 483 |
+
attn_weights = None
|
| 484 |
+
|
| 485 |
+
return y, v_first, k_first
|
| 486 |
+
|
| 487 |
+
|
| 488 |
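# Note on the NoPE attention above: during cached decoding L == 1 while
# S == past_len + 1, so `is_causal` evaluates to False and no mask is passed;
# a single query attending over the entire cache is still causal for the
# newest token.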
+
class RWKV079Attention(nn.Module):
|
| 489 |
+
def __init__(self, config, layer_idx: Optional[int] = None):
|
| 490 |
+
super().__init__()
|
| 491 |
+
self.config = config
|
| 492 |
+
self.layer_idx = layer_idx
|
| 493 |
+
C = self.hidden_size = config.hidden_size
|
| 494 |
+
H = self.num_heads = config.num_attention_heads
|
| 495 |
+
H_kv = config.num_key_value_heads
|
| 496 |
+
N = self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
|
| 497 |
+
self.num_key_value_heads = config.num_key_value_heads
|
| 498 |
+
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
|
| 499 |
+
self.attention_dropout = config.attention_dropout
|
| 500 |
+
|
| 501 |
+
if self.hidden_size % self.num_heads != 0:
|
| 502 |
+
raise ValueError(
|
| 503 |
+
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
|
| 504 |
+
f" and `num_heads`: {self.num_heads})."
|
| 505 |
+
)
|
| 506 |
+
self.receptance = nn.Linear(
|
| 507 |
+
config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
|
| 508 |
+
)
|
| 509 |
+
self.key = nn.Linear(
|
| 510 |
+
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
| 511 |
+
)
|
| 512 |
+
self.value = nn.Linear(
|
| 513 |
+
config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
|
| 514 |
+
)
|
| 515 |
+
self.output = nn.Linear(
|
| 516 |
+
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
| 517 |
+
)
|
| 518 |
+
#self.r_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
|
| 519 |
+
#self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
lora_rank_decay = config.lora_rank_decay
|
| 523 |
+
lora_rank_iclr = config.lora_rank_iclr
|
| 524 |
+
lora_rank_value_residual_mix = config.lora_rank_value_residual_mix
|
| 525 |
+
lora_rank_key_residual_mix = config.lora_rank_key_residual_mix
|
| 526 |
+
lora_rank_gate = config.lora_rank_gate
|
| 527 |
+
|
| 528 |
+
print(f"lora_rank_value_residual_mix = {lora_rank_value_residual_mix} lora_rank_key_residual_mix={lora_rank_key_residual_mix}")
|
| 529 |
+
|
| 530 |
+
|
| 531 |
+
self.w0 = nn.Parameter(torch.empty(1,1,H*N))
|
| 532 |
+
self.w1 = nn.Parameter(torch.empty(C, lora_rank_decay))
|
| 533 |
+
self.w2 = nn.Parameter(torch.empty(lora_rank_decay, H*N))
|
| 534 |
+
|
| 535 |
+
self.a0 = nn.Parameter(torch.empty(1,1,H*N))
|
| 536 |
+
self.a1 = nn.Parameter(torch.empty(C, lora_rank_iclr))
|
| 537 |
+
self.a2 = nn.Parameter(torch.empty(lora_rank_iclr, H*N))
|
| 538 |
+
|
| 539 |
+
#if layer_idx > 0:
|
| 540 |
+
self.v0 = nn.Parameter(torch.empty(1,1,H_kv*N))
|
| 541 |
+
self.v1 = nn.Parameter(torch.empty(C, lora_rank_value_residual_mix))
|
| 542 |
+
self.v2 = nn.Parameter(torch.empty(lora_rank_value_residual_mix, H_kv*N))
|
| 543 |
+
|
| 544 |
+
self.k0 = nn.Parameter(torch.empty(1,1,H_kv*N))
|
| 545 |
+
self.k1 = nn.Parameter(torch.empty(C, lora_rank_key_residual_mix))
|
| 546 |
+
self.k2 = nn.Parameter(torch.empty(lora_rank_key_residual_mix, H_kv*N))
|
| 547 |
+
|
| 548 |
+
|
| 549 |
+
self.g1 = nn.Parameter(torch.empty(C, lora_rank_gate))
|
| 550 |
+
self.g2 = nn.Parameter(torch.empty(lora_rank_gate, H*N))
|
| 551 |
+
|
| 552 |
+
self.r_k = nn.Parameter(torch.empty(H,N))
|
| 553 |
+
|
| 554 |
+
|
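# Editorial summary of the low-rank ("LoRA") parameter triples declared above:
#   w0/w1/w2 -> per-channel decay, a0/a1/a2 -> in-context learning rate,
#   v0/v1/v2 and k0/k1/k2 -> mixing weights for the layer-0 value/key residual,
#   g1/g2 -> output gate, r_k -> per-head bonus scale applied after the kernel.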
| 555 |
+
def forward(
|
| 556 |
+
self,
|
| 557 |
+
hidden_states: torch.Tensor,
|
| 558 |
+
frozen_residual: torch.Tensor,
|
| 559 |
+
v_first: Optional[torch.Tensor] = None,
|
| 560 |
+
k_first: Optional[torch.Tensor] = None,
|
| 561 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 562 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 563 |
+
past_key_values: Optional[RWKV079State] = None,
|
| 564 |
+
output_attentions: bool = False,
|
| 565 |
+
use_cache: bool = False,
|
| 566 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 567 |
+
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 568 |
+
**kwargs,
|
| 569 |
+
):
|
| 570 |
+
if attention_mask is not None:
|
| 571 |
+
assert len(attention_mask.shape) in (2, 4)
|
| 572 |
+
|
| 573 |
+
output_shift_state = hidden_states[:, -1:].detach().clone()
|
| 574 |
+
|
| 575 |
+
x = hidden_states
|
| 576 |
+
|
| 577 |
+
B, T, C = hidden_states.shape
|
| 578 |
+
H = self.num_heads
|
| 579 |
+
N = self.head_dim
|
| 580 |
+
|
| 581 |
+
q_len = T
|
| 582 |
+
|
| 583 |
+
if use_cache and past_key_values is not None and len(past_key_values) > self.layer_idx:
|
| 584 |
+
#print(f'use past state layer {self.layer_idx}')
|
| 585 |
+
input_vk_state, input_shift_state = past_key_values[self.layer_idx]
|
| 586 |
+
else:
|
| 587 |
+
input_vk_state, input_shift_state = torch.zeros(B, H, N, N, dtype=torch.bfloat16, device=x.device), torch.zeros_like(x[:, -1:])
|
| 588 |
+
|
| 589 |
+
xr = xw = xk = xv = xa = xg = x
|
| 590 |
+
|
| 591 |
+
r = self.receptance(xr).view(B,T,-1,N)
|
| 592 |
+
w = -F.softplus(-(self.w0 + torch.tanh(xw @ self.w1) @ self.w2)) - 0.5 # log-decay via LoRA, in (-inf, -0.5)
|
| 593 |
+
k = self.key(xk).view(B,T,-1,N)
|
| 594 |
+
v = self.value(xv).view(B,T,-1,N)
|
| 595 |
+
a = torch.sigmoid(self.a0 + (xa @ self.a1) @ self.a2) # in-context learning rate, in (0, 1)
|
| 596 |
+
g = torch.sigmoid(xg @ self.g1) @ self.g2 # low-rank output gate
|
| 597 |
+
|
| 598 |
+
if position_embeddings is not None:
|
| 599 |
+
cos, sin = position_embeddings
|
| 600 |
+
r, k = apply_rotary_pos_emb(r, k, cos, sin, unsqueeze_dim=2)
|
| 601 |
+
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
if self.layer_idx == 0:
|
| 605 |
+
v_first = v # store the v of the first layer
|
| 606 |
+
k_first = k # store the k of the first layer
|
| 607 |
+
else:
|
| 608 |
+
v = v + (v_first - v) * torch.sigmoid(self.v0 + (x @ self.v1) @ self.v2).view(B,T,self.num_key_value_heads,-1) # add value residual
|
| 609 |
+
k = k + (k_first - k) * torch.sigmoid(self.k0 + (x @ self.k1) @ self.k2).view(B,T,self.num_key_value_heads,-1) # add key residual
|
| 610 |
+
|
| 611 |
+
# dealing with left-padding
|
| 612 |
+
# if attention_mask is not None:
|
| 613 |
+
# if len(attention_mask.shape) == 2:
|
| 614 |
+
# v = v * attention_mask[:, -v.shape[-2]:, None]
|
| 615 |
+
# elif len(attention_mask.shape) == 4:
|
| 616 |
+
# v = v * attention_mask[:, -1, -1, -v.shape[-2]:].view(B, T, 1)
|
| 617 |
+
# #v = v * attention_mask[:, :, -1, -v.shape[-2]:, None]
|
| 618 |
+
if attention_mask is not None:
|
| 619 |
+
    if attention_mask.ndim == 2:
|
| 620 |
+
        # [B, S]
|
| 621 |
+
        mask = attention_mask[:, -T:] # [B, T]
|
| 622 |
+
        v = v * mask[:, :, None, None] # broadcast mask to [B, T, 1, 1] and zero out padded positions
|
| 623 |
+
    elif attention_mask.ndim == 4:
|
| 624 |
+
        # [B, 1, L, S]
|
| 625 |
+
        mask = attention_mask[:, 0, -1, -T:] # [B, T]
|
| 626 |
+
        v = v * mask[:, :, None, None] # same as above
|
| 627 |
+
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
# repeat k/v heads if n_kv_heads < n_heads
|
| 631 |
+
# k = k.view(B, T, -1, 1, self.head_dim).expand(-1, -1, -1, self.num_key_value_groups, -1).reshape(B, T, -1)
|
| 632 |
+
# v = v.view(B, T, -1, 1, self.head_dim).expand(-1, -1, -1, self.num_key_value_groups, -1).reshape(B, T, -1)
|
| 633 |
+
k = repeat_kv_rwkv(k, self.num_key_value_groups).view(B, T, -1)
|
| 634 |
+
v = repeat_kv_rwkv(v, self.num_key_value_groups).view(B, T, -1)
|
| 635 |
+
dropout_rate = 0.0 if not self.training else self.attention_dropout # note: currently unused below
|
| 636 |
+
|
| 637 |
+
kk = F.normalize(k.view(B,T,H,-1), dim=-1, p=2.0).view(B,T,-1) # unit-norm key direction used by the delta rule
|
| 638 |
+
k = k * (1.0 - w + a)
|
| 639 |
+
|
| 640 |
+
aa = -kk # removal direction passed to the RWKV-7 kernel
|
| 641 |
+
bb = kk * a # replacement direction, scaled by the in-context learning rate
|
| 642 |
+
w = -w.exp() # log of the per-channel decay expected by the kernel
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
r_,w_,k_,v_,aa_,bb_ = [i.view(B,T,H,N) for i in [r,w,k,v,aa,bb]]
|
| 647 |
+
|
| 648 |
+
#print(f'r shape = {r_.shape}')
|
| 649 |
+
|
| 650 |
+
# if self.layer_idx == 0:
|
| 651 |
+
# print(f'input_vk_state sum = {torch.sum(input_vk_state)}')
|
| 652 |
+
|
| 653 |
+
#x, output_vk_state = fused_recurrent_rwkv7(r_, w_, k_, v_, aa_, bb_, initial_state=input_vk_state, output_final_state=use_cache)
|
| 654 |
+
x, output_vk_state = fused_recurrent_rwkv7(r_, w_, k_, v_, aa_, bb_, scale=1.0, initial_state=input_vk_state, output_final_state=True, head_first=False)
|
| 655 |
+
|
| 656 |
+
# if self.layer_idx == 0:
|
| 657 |
+
# print(f'output_vk_state sum = {torch.sum(output_vk_state)}')
|
| 658 |
+
|
| 659 |
+
x = x.view(B,T,-1) * (float(N) ** -0.5) # 1/sqrt(head_dim) scaling (kernel was run with scale=1.0)
|
| 660 |
+
|
| 661 |
+
x = x + ((r.view(B,T,H,-1)*k.view(B,T,H,-1)*self.r_k).sum(dim=-1, keepdim=True) * v.view(B,T,H,-1)).view(B,T,-1) # per-head bonus: (r.k) gated by r_k, added as an immediate v contribution
|
| 662 |
+
|
| 663 |
+
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
x = x * g
|
| 667 |
+
x = self.output(x)
|
| 668 |
+
|
| 669 |
+
if past_key_values is not None:
|
| 670 |
+
past_key_values.update(output_vk_state, output_shift_state, self.layer_idx, q_len, is_layer_attention(self.config, self.layer_idx))
|
| 671 |
+
|
| 672 |
+
return x, v_first, k_first
|
| 673 |
+
|
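# Editorial sketch (illustrative, not called anywhere): the value/key residual
# update `v + (v_first - v) * mix` used above is an element-wise linear
# interpolation toward the first layer's tensors.
def _residual_mix_sketch():
    v, v_first = torch.randn(2, 3), torch.randn(2, 3)
    mix = torch.rand(2, 3)
    assert torch.allclose(v + (v_first - v) * mix, torch.lerp(v, v_first, mix))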
| 674 |
+
class RWKV079Qwen3DecoderLayer(nn.Module):
|
| 675 |
+
def __init__(self, config: RWKV079Qwen3Config, layer_idx: int):
|
| 676 |
+
nn.Module.__init__(self)
|
| 677 |
+
self.hidden_size = config.hidden_size
|
| 678 |
+
self.layer_idx = layer_idx
|
| 679 |
+
|
| 680 |
+
if is_layer_attention(config, layer_idx):
|
| 681 |
+
print(f'layer {layer_idx} : attention')
|
| 682 |
+
att_fn = Qwen3AttentionNoPE_Causal # alternatives tried: Qwen3KeyQuant, Qwen3SWAPrefill, Qwen3DropoutSWASink, Qwen3AttentionNoPE, Qwen3MOBA, Qwen3AttentionVerticalSparse, Qwen3DoubleAttention, Qwen3SymPow, Qwen3Chunk, Qwen3Power, Qwen3Attention, Qwen3NewAttention, Qwen3AttentionAdapted
|
| 683 |
+
else:
|
| 684 |
+
print(f'layer {layer_idx} : rwkv')
|
| 685 |
+
att_fn = RWKV079Attention
|
| 686 |
+
|
| 687 |
+
self.self_attn = att_fn(config, layer_idx)
|
| 688 |
+
|
| 689 |
+
self.mlp = Qwen3MLP(config)
|
| 690 |
+
self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 691 |
+
self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 692 |
+
self.attention_type = config.layer_types[layer_idx]
|
| 693 |
+
|
| 694 |
+
def forward(
|
| 695 |
+
self,
|
| 696 |
+
hidden_states: torch.Tensor,
|
| 697 |
+
frozen_residual: torch.Tensor,
|
| 698 |
+
v_first: Optional[torch.Tensor],
|
| 699 |
+
k_first: Optional[torch.Tensor],
|
| 700 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 701 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 702 |
+
past_key_values: Optional[Cache] = None,
|
| 703 |
+
output_attentions: Optional[bool] = False,
|
| 704 |
+
use_cache: Optional[bool] = False,
|
| 705 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 706 |
+
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
| 707 |
+
**kwargs,
|
| 708 |
+
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
| 709 |
+
residual = hidden_states
|
| 710 |
+
|
| 711 |
+
hidden_states = self.input_layernorm(hidden_states)
|
| 712 |
+
|
| 713 |
+
# Self Attention
|
| 714 |
+
hidden_states, v_first, k_first = self.self_attn(
|
| 715 |
+
hidden_states=hidden_states,
|
| 716 |
+
frozen_residual=frozen_residual,
|
| 717 |
+
v_first=v_first,
|
| 718 |
+
k_first=k_first,
|
| 719 |
+
attention_mask=attention_mask,
|
| 720 |
+
position_ids=position_ids,
|
| 721 |
+
past_key_values=past_key_values,
|
| 722 |
+
output_attentions=output_attentions,
|
| 723 |
+
use_cache=use_cache,
|
| 724 |
+
cache_position=cache_position,
|
| 725 |
+
position_embeddings=position_embeddings,
|
| 726 |
+
#is_causal=True,
|
| 727 |
+
)
|
| 728 |
+
|
| 729 |
+
hidden_states = residual + hidden_states
|
| 730 |
+
|
| 731 |
+
# Fully Connected
|
| 732 |
+
residual = hidden_states
|
| 733 |
+
hidden_states = self.post_attention_layernorm(hidden_states)
|
| 734 |
+
hidden_states = self.mlp(hidden_states)
|
| 735 |
+
hidden_states = residual + hidden_states
|
| 736 |
+
|
| 737 |
+
outputs = (hidden_states, v_first,k_first,)
|
| 738 |
+
|
| 739 |
+
if output_attentions:
|
| 740 |
+
outputs += (None,) # attention weights are not materialized by these attention variants
|
| 741 |
+
|
| 742 |
+
return outputs
|
| 743 |
+
|
| 744 |
+
|
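# Both sub-blocks above follow the standard pre-norm residual pattern,
#   x = x + SelfAttn(LN_in(x));  x = x + MLP(LN_post(x)),
# with v_first/k_first threaded through so later layers can mix against the
# first layer's values and keys.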
| 745 |
+
@auto_docstring
|
| 746 |
+
class RWKV079Qwen3PreTrainedModel(PreTrainedModel):
|
| 747 |
+
config: RWKV079Qwen3Config
|
| 748 |
+
config_class = RWKV079Qwen3Config
|
| 749 |
+
base_model_prefix = "model"
|
| 750 |
+
supports_gradient_checkpointing = True
|
| 751 |
+
_no_split_modules = ["RWKV079Qwen3DecoderLayer"]
|
| 752 |
+
_skip_keys_device_placement = "past_key_values"
|
| 753 |
+
_supports_flash_attn_2 = True
|
| 754 |
+
_supports_sdpa = True
|
| 755 |
+
_supports_flex_attn = True
|
| 756 |
+
|
| 757 |
+
_supports_cache_class = True
|
| 758 |
+
_supports_quantized_cache = True
|
| 759 |
+
_supports_static_cache = True
|
| 760 |
+
|
| 761 |
+
# def _init_weights(self, module):
|
| 762 |
+
# std = self.config.initializer_range
|
| 763 |
+
# if isinstance(module, nn.Linear):
|
| 764 |
+
# module.weight.data.normal_(mean=0.0, std=std)
|
| 765 |
+
# if module.bias is not None:
|
| 766 |
+
# module.bias.data.zero_()
|
| 767 |
+
# elif isinstance(module, nn.Embedding):
|
| 768 |
+
# module.weight.data.normal_(mean=0.0, std=std)
|
| 769 |
+
# if module.padding_idx is not None:
|
| 770 |
+
# module.weight.data[module.padding_idx].zero_()
|
| 771 |
+
|
| 772 |
+
@auto_docstring
|
| 773 |
+
class RWKV079Qwen3Model(RWKV079Qwen3PreTrainedModel):
|
| 774 |
+
"""
|
| 775 |
+
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen3DecoderLayer`]
|
| 776 |
+
|
| 777 |
+
Args:
|
| 778 |
+
config: RWKV079Qwen3Config
|
| 779 |
+
"""
|
| 780 |
+
|
| 781 |
+
def __init__(self, config: RWKV079Qwen3Config):
|
| 782 |
+
super().__init__(config)
|
| 783 |
+
self.padding_idx = config.pad_token_id
|
| 784 |
+
self.vocab_size = config.vocab_size
|
| 785 |
+
|
| 786 |
+
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
| 787 |
+
self.layers = nn.ModuleList(
|
| 788 |
+
[RWKV079Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
| 789 |
+
)
|
| 790 |
+
self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 791 |
+
self.rotary_emb = Qwen3RotaryEmbedding(config=config)
|
| 792 |
+
self.gradient_checkpointing = False
|
| 793 |
+
self.has_sliding_layers = "sliding_attention" in self.config.layer_types
|
| 794 |
+
|
| 795 |
+
# Initialize weights and apply final processing
|
| 796 |
+
self.post_init()
|
| 797 |
+
|
| 798 |
+
#@check_model_inputs
|
| 799 |
+
@auto_docstring
|
| 800 |
+
def forward(
|
| 801 |
+
self,
|
| 802 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 803 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 804 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 805 |
+
past_key_values: Optional[Cache] = None,
|
| 806 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 807 |
+
use_cache: Optional[bool] = None,
|
| 808 |
+
output_attentions: Optional[bool] = None,
|
| 809 |
+
output_hidden_states: Optional[bool] = None,
|
| 810 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 811 |
+
**kwargs: Unpack[TransformersKwargs],
|
| 812 |
+
) -> Union[Tuple, BaseModelOutputWithPast]:
|
| 813 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 814 |
+
output_hidden_states = (
|
| 815 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 816 |
+
)
|
| 817 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 818 |
+
|
| 819 |
+
if (input_ids is None) ^ (inputs_embeds is not None):
|
| 820 |
+
raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
|
| 821 |
+
|
| 822 |
+
if self.gradient_checkpointing and self.training and use_cache:
|
| 823 |
+
logger.warning_once(
|
| 824 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
| 825 |
+
)
|
| 826 |
+
use_cache = False
|
| 827 |
+
|
| 828 |
+
if inputs_embeds is None:
|
| 829 |
+
inputs_embeds = self.embed_tokens(input_ids)
|
| 830 |
+
|
| 831 |
+
if use_cache and not isinstance(past_key_values, RWKV079State):
|
| 832 |
+
past_key_values = RWKV079State()
|
| 833 |
+
|
| 834 |
+
if cache_position is None:
|
| 835 |
+
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
| 836 |
+
cache_position = torch.arange(
|
| 837 |
+
past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
|
| 838 |
+
)
|
| 839 |
+
|
| 840 |
+
if position_ids is None:
|
| 841 |
+
position_ids = cache_position.unsqueeze(0)
|
| 842 |
+
|
| 843 |
+
# It may already have been prepared by e.g. `generate`
|
| 844 |
+
if not isinstance(causal_mask_mapping := attention_mask, dict):
|
| 845 |
+
# Prepare mask arguments
|
| 846 |
+
mask_kwargs = {
|
| 847 |
+
"config": self.config,
|
| 848 |
+
"input_embeds": inputs_embeds,
|
| 849 |
+
"attention_mask": attention_mask,
|
| 850 |
+
"cache_position": cache_position,
|
| 851 |
+
"past_key_values": past_key_values,
|
| 852 |
+
"position_ids": position_ids,
|
| 853 |
+
}
|
| 854 |
+
# Create the masks
|
| 855 |
+
causal_mask_mapping = {
|
| 856 |
+
"full_attention": create_causal_mask(**mask_kwargs),
|
| 857 |
+
}
|
| 858 |
+
# The sliding window alternating layers are not always activated depending on the config
|
| 859 |
+
if self.has_sliding_layers:
|
| 860 |
+
causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
|
| 861 |
+
|
| 862 |
+
hidden_states = inputs_embeds
|
| 863 |
+
|
| 864 |
+
# create position embeddings to be shared across the decoder layers
|
| 865 |
+
if self.config.use_rope:
|
| 866 |
+
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 867 |
+
else:
|
| 868 |
+
position_embeddings = None
|
| 869 |
+
|
| 870 |
+
# decoder layers
|
| 871 |
+
all_hidden_states = () if output_hidden_states else None
|
| 872 |
+
all_self_attns = () if output_attentions else None
|
| 873 |
+
next_decoder_cache = None
|
| 874 |
+
v_first = None
|
| 875 |
+
k_first = None
|
| 876 |
+
frozen_residual = None
|
| 877 |
+
|
| 878 |
+
for decoder_layer in self.layers:
|
| 879 |
+
if not is_layer_attention(self.config, decoder_layer.layer_idx):
|
| 880 |
+
frozen_residual = hidden_states # rms_norm(hidden_states)
|
| 881 |
+
if output_hidden_states:
|
| 882 |
+
all_hidden_states += (hidden_states,)
|
| 883 |
+
|
| 884 |
+
attention_mask = causal_mask_mapping[decoder_layer.attention_type]
|
| 885 |
+
if attention_mask is not None and attention_mask.ndim == 1:
|
| 886 |
+
attention_mask = None
|
| 887 |
+
#attention_mask = None
|
| 888 |
+
|
| 889 |
+
layer_outputs = decoder_layer(
|
| 890 |
+
hidden_states,
|
| 891 |
+
frozen_residual=frozen_residual,
|
| 892 |
+
attention_mask=attention_mask,
|
| 893 |
+
position_ids=position_ids,
|
| 894 |
+
past_key_values=past_key_values,
|
| 895 |
+
output_attentions=output_attentions,
|
| 896 |
+
use_cache=use_cache,
|
| 897 |
+
cache_position=cache_position,
|
| 898 |
+
position_embeddings=position_embeddings,
|
| 899 |
+
v_first=v_first,
|
| 900 |
+
k_first=k_first
|
| 901 |
+
)
|
| 902 |
+
|
| 903 |
+
hidden_states = layer_outputs[0]
|
| 904 |
+
v_first = layer_outputs[1]
|
| 905 |
+
k_first = layer_outputs[2]
|
| 906 |
+
|
| 907 |
+
if output_attentions:
|
| 908 |
+
all_self_attns += (layer_outputs[3],) # index 2 is k_first; the weights (or None) are appended last
|
| 909 |
+
|
| 910 |
+
hidden_states = self.norm(hidden_states)
|
| 911 |
+
|
| 912 |
+
# add hidden states from the last decoder layer
|
| 913 |
+
if output_hidden_states:
|
| 914 |
+
all_hidden_states += (hidden_states,)
|
| 915 |
+
|
| 916 |
+
#if return_legacy_cache:
|
| 917 |
+
# next_cache = next_cache.to_legacy_cache()
|
| 918 |
+
|
| 919 |
+
return BaseModelOutputWithPast(
|
| 920 |
+
last_hidden_state=hidden_states,
|
| 921 |
+
past_key_values=past_key_values if use_cache else None,
|
| 922 |
+
hidden_states=all_hidden_states,
|
| 923 |
+
attentions=all_self_attns,
|
| 924 |
+
)
|
| 925 |
+
|
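# Editorial sketch (illustrative, not called anywhere): feeding tokens one at
# a time while carrying the recurrent RWKV079State cache between calls;
# `model` (this model or the causal-LM head below) and `prompt_ids` are
# assumptions for illustration.
def _stepwise_decode_sketch(model, prompt_ids):
    state = None
    for tok in prompt_ids.split(1, dim=1):
        out = model(input_ids=tok, past_key_values=state, use_cache=True)
        state = out.past_key_values  # holds per-layer (vk_state, shift_state)
    return state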
| 926 |
+
class RWKV079Qwen3ForCausalLM(RWKV079Qwen3PreTrainedModel, GenerationMixin):
|
| 927 |
+
_tied_weights_keys = ["lm_head.weight"]
|
| 928 |
+
|
| 929 |
+
def __init__(self, config):
|
| 930 |
+
super().__init__(config)
|
| 931 |
+
self.model = RWKV079Qwen3Model(config)
|
| 932 |
+
self.vocab_size = config.vocab_size
|
| 933 |
+
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 934 |
+
|
| 935 |
+
# Initialize weights and apply final processing
|
| 936 |
+
self.post_init()
|
| 937 |
+
|
| 938 |
+
@can_return_tuple
|
| 939 |
+
@auto_docstring
|
| 940 |
+
def forward(
|
| 941 |
+
self,
|
| 942 |
+
input_ids: torch.LongTensor = None,
|
| 943 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 944 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 945 |
+
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| 946 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 947 |
+
labels: Optional[torch.LongTensor] = None,
|
| 948 |
+
use_cache: Optional[bool] = None,
|
| 949 |
+
output_attentions: Optional[bool] = None,
|
| 950 |
+
output_hidden_states: Optional[bool] = None,
|
| 951 |
+
cache_position: Optional[torch.LongTensor] = None,
|
| 952 |
+
logits_to_keep: Union[int, torch.Tensor] = 0,
|
| 953 |
+
**loss_kwargs,
|
| 954 |
+
) -> Union[Tuple, CausalLMOutputWithPast]:
|
| 955 |
+
r"""
|
| 956 |
+
Args:
|
| 957 |
+
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 958 |
+
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
| 959 |
+
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
| 960 |
+
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
|
| 961 |
+
|
| 962 |
+
logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0):
|
| 963 |
+
Calculate logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
|
| 964 |
+
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
|
| 965 |
+
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
|
| 966 |
+
|
| 967 |
+
Returns:
|
| 968 |
+
|
| 969 |
+
Example:
|
| 970 |
+
|
| 971 |
+
```python
|
| 972 |
+
>>> from transformers import AutoTokenizer, RWKV079Qwen3ForCausalLM
|
| 973 |
+
|
| 974 |
+
>>> model = RWKV079Qwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
|
| 975 |
+
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
| 976 |
+
|
| 977 |
+
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
| 978 |
+
>>> inputs = tokenizer(prompt, return_tensors="pt")
|
| 979 |
+
|
| 980 |
+
>>> # Generate
|
| 981 |
+
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
|
| 982 |
+
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
| 983 |
+
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
|
| 984 |
+
```"""
|
| 985 |
+
|
| 986 |
+
# # run the prefill only up to the last token, then run one more for the actual result
|
| 987 |
+
# # we do this so that called code doesn't have to handle the dichotomy specially and can just check for L==1
|
| 988 |
+
# for i in range(2):
|
| 989 |
+
# all_but_one = max(1, input_ids.size(-1)-1)
|
| 990 |
+
# iid = input_ids[..., i*all_but_one:(i+1)*all_but_one]
|
| 991 |
+
# if iid.size(-1) == 0:
|
| 992 |
+
# continue
|
| 993 |
+
# pids = position_ids
|
| 994 |
+
# if pids is not None:
|
| 995 |
+
# pids = position_ids[..., i*all_but_one:(i+1)*all_but_one]
|
| 996 |
+
# cp = cache_position
|
| 997 |
+
# if cp is not None:
|
| 998 |
+
# cp = cache_position[..., i*all_but_one:(i+1)*all_but_one]
|
| 999 |
+
# rv = self.forward_inner(iid, attention_mask=attention_mask, position_ids=pids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, cache_position=cp, num_logits_to_keep=num_logits_to_keep, **loss_kwargs)
|
| 1000 |
+
# past_key_values = rv.past_key_values
|
| 1001 |
+
# return rv
|
| 1002 |
+
|
| 1003 |
+
# def forward_inner(
|
| 1004 |
+
# self,
|
| 1005 |
+
# input_ids: torch.LongTensor = None,
|
| 1006 |
+
# attention_mask: Optional[torch.Tensor] = None,
|
| 1007 |
+
# position_ids: Optional[torch.LongTensor] = None,
|
| 1008 |
+
# past_key_values: Optional[List[torch.FloatTensor]] = None,
|
| 1009 |
+
# inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 1010 |
+
# labels: Optional[torch.LongTensor] = None,
|
| 1011 |
+
# use_cache: Optional[bool] = None,
|
| 1012 |
+
# output_attentions: Optional[bool] = None,
|
| 1013 |
+
# output_hidden_states: Optional[bool] = None,
|
| 1014 |
+
# cache_position: Optional[torch.LongTensor] = None,
|
| 1015 |
+
# num_logits_to_keep: int = 0,
|
| 1016 |
+
# **loss_kwargs,
|
| 1017 |
+
# ) -> Union[Tuple, CausalLMOutputWithPast]:
|
| 1018 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1019 |
+
output_hidden_states = (
|
| 1020 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 1021 |
+
)
|
| 1022 |
+
|
| 1023 |
+
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
| 1024 |
+
outputs = self.model(
|
| 1025 |
+
input_ids=input_ids,
|
| 1026 |
+
attention_mask=attention_mask,
|
| 1027 |
+
position_ids=position_ids,
|
| 1028 |
+
past_key_values=past_key_values,
|
| 1029 |
+
inputs_embeds=inputs_embeds,
|
| 1030 |
+
use_cache=use_cache,
|
| 1031 |
+
output_attentions=output_attentions,
|
| 1032 |
+
output_hidden_states=output_hidden_states,
|
| 1033 |
+
cache_position=cache_position,
|
| 1034 |
+
)
|
| 1035 |
+
|
| 1036 |
+
hidden_states = outputs.last_hidden_state
|
| 1037 |
+
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
| 1038 |
+
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
|
| 1039 |
+
logits = self.lm_head(hidden_states[:, slice_indices, :])
|
| 1040 |
+
|
| 1041 |
+
loss = None
|
| 1042 |
+
if labels is not None:
|
| 1043 |
+
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
|
| 1044 |
+
|
| 1045 |
+
return CausalLMOutputWithPast(
|
| 1046 |
+
loss=loss,
|
| 1047 |
+
logits=logits,
|
| 1048 |
+
past_key_values=outputs.past_key_values,
|
| 1049 |
+
hidden_states=outputs.hidden_states,
|
| 1050 |
+
attentions=outputs.attentions,
|
| 1051 |
+
)
|
| 1052 |
+
|
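# Editorial sketch (illustrative, not called anywhere): generation needs only
# the final position's logits, so logits_to_keep=1 slices hidden_states before
# the lm_head projection and keeps memory flat for long prompts; `model` and
# `input_ids` are assumptions for illustration.
def _last_logits_sketch(model, input_ids):
    out = model(input_ids, logits_to_keep=1)
    assert out.logits.shape[1] == 1
    return out.logits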
| 1053 |
+
@auto_docstring
|
| 1054 |
+
class RWKV079Qwen3ForSequenceClassification(RWKV079Qwen3PreTrainedModel):
|
| 1055 |
+
pass
|
| 1056 |
+
|
| 1057 |
+
@auto_docstring
|
| 1058 |
+
class RWKV079Qwen3ForTokenClassification(RWKV079Qwen3PreTrainedModel):
|
| 1059 |
+
pass
|
| 1060 |
+
|
| 1061 |
+
@auto_docstring
|
| 1062 |
+
class RWKV079Qwen3ForQuestionAnswering(RWKV079Qwen3PreTrainedModel):
|
| 1063 |
+
base_model_prefix = "transformer" # For BC, where `transformer` was used instead of `model`
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,23 @@
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<seed:bos>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<seed:eos>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": {
|
| 17 |
+
"content": "<seed:pad>",
|
| 18 |
+
"lstrip": false,
|
| 19 |
+
"normalized": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"single_word": false
|
| 22 |
+
}
|
| 23 |
+
}
|
tokenization_rwkv079qwen3.py
ADDED
|
@@ -0,0 +1,4 @@
|
| 1 |
+
from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer  # Qwen3 reuses the Qwen2 tokenizer implementation in transformers
|
| 2 |
+
|
| 3 |
+
class RWKV6Qwen3Tokenizer(Qwen2Tokenizer):
|
| 4 |
+
pass
|
tokenization_rwkv079qwen3_fast.py
ADDED
|
@@ -0,0 +1,4 @@
|
| 1 |
+
from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast  # Qwen3 reuses the Qwen2 fast tokenizer in transformers
|
| 2 |
+
|
| 3 |
+
class RWKV6Qwen3TokenizerFast(Qwen2TokenizerFast):
|
| 4 |
+
pass
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6bd848f52451824a3033a9f1e67eea5b399a13c90f845a332d3a29537e05827
|
| 3 |
+
size 11883696
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,1035 @@
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "<seed:bos>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"1": {
|
| 12 |
+
"content": "<seed:pad>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"2": {
|
| 20 |
+
"content": "<seed:eos>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"3": {
|
| 28 |
+
"content": "<seed:think>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": false
|
| 34 |
+
},
|
| 35 |
+
"4": {
|
| 36 |
+
"content": "</seed:think>",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": false
|
| 42 |
+
},
|
| 43 |
+
"5": {
|
| 44 |
+
"content": "<seed:cot_budget_reflect>",
|
| 45 |
+
"lstrip": false,
|
| 46 |
+
"normalized": false,
|
| 47 |
+
"rstrip": false,
|
| 48 |
+
"single_word": false,
|
| 49 |
+
"special": false
|
| 50 |
+
},
|
| 51 |
+
"6": {
|
| 52 |
+
"content": "</seed:cot_budget_reflect>",
|
| 53 |
+
"lstrip": false,
|
| 54 |
+
"normalized": false,
|
| 55 |
+
"rstrip": false,
|
| 56 |
+
"single_word": false,
|
| 57 |
+
"special": false
|
| 58 |
+
},
|
| 59 |
+
"7": {
|
| 60 |
+
"content": "<seed:tool_call>",
|
| 61 |
+
"lstrip": false,
|
| 62 |
+
"normalized": false,
|
| 63 |
+
"rstrip": false,
|
| 64 |
+
"single_word": false,
|
| 65 |
+
"special": false
|
| 66 |
+
},
|
| 67 |
+
"8": {
|
| 68 |
+
"content": "</seed:tool_call>",
|
| 69 |
+
"lstrip": false,
|
| 70 |
+
"normalized": false,
|
| 71 |
+
"rstrip": false,
|
| 72 |
+
"single_word": false,
|
| 73 |
+
"special": false
|
| 74 |
+
},
|
| 75 |
+
"9": {
|
| 76 |
+
"content": "<[PLHD9_never_used]>",
|
| 77 |
+
"lstrip": false,
|
| 78 |
+
"normalized": false,
|
| 79 |
+
"rstrip": false,
|
| 80 |
+
"single_word": false,
|
| 81 |
+
"special": true
|
| 82 |
+
},
|
| 83 |
+
"10": {
|
| 84 |
+
"content": "<[PLHD10_never_used]>",
|
| 85 |
+
"lstrip": false,
|
| 86 |
+
"normalized": false,
|
| 87 |
+
"rstrip": false,
|
| 88 |
+
"single_word": false,
|
| 89 |
+
"special": true
|
| 90 |
+
},
|
| 91 |
+
"11": {
|
| 92 |
+
"content": "<[PLHD11_never_used]>",
|
| 93 |
+
"lstrip": false,
|
| 94 |
+
"normalized": false,
|
| 95 |
+
"rstrip": false,
|
| 96 |
+
"single_word": false,
|
| 97 |
+
"special": true
|
| 98 |
+
},
|
| 99 |
+
"12": {
|
| 100 |
+
"content": "<[PLHD12_never_used]>",
|
| 101 |
+
"lstrip": false,
|
| 102 |
+
"normalized": false,
|
| 103 |
+
"rstrip": false,
|
| 104 |
+
"single_word": false,
|
| 105 |
+
"special": true
|
| 106 |
+
},
|
| 107 |
+
"13": {
|
| 108 |
+
"content": "<[PLHD13_never_used]>",
|
| 109 |
+
"lstrip": false,
|
| 110 |
+
"normalized": false,
|
| 111 |
+
"rstrip": false,
|
| 112 |
+
"single_word": false,
|
| 113 |
+
"special": true
|
| 114 |
+
},
|
| 115 |
+
"14": {
|
| 116 |
+
"content": "<[PLHD14_never_used]>",
|
| 117 |
+
"lstrip": false,
|
| 118 |
+
"normalized": false,
|
| 119 |
+
"rstrip": false,
|
| 120 |
+
"single_word": false,
|
| 121 |
+
"special": true
|
| 122 |
+
},
|
| 123 |
+
"15": {
|
| 124 |
+
"content": "<[PLHD15_never_used]>",
|
| 125 |
+
"lstrip": false,
|
| 126 |
+
"normalized": false,
|
| 127 |
+
"rstrip": false,
|
| 128 |
+
"single_word": false,
|
| 129 |
+
"special": true
|
| 130 |
+
},
|
| 131 |
+
"16": {
|
| 132 |
+
"content": "<[PLHD16_never_used]>",
|
| 133 |
+
"lstrip": false,
|
| 134 |
+
"normalized": false,
|
| 135 |
+
"rstrip": false,
|
| 136 |
+
"single_word": false,
|
| 137 |
+
"special": true
|
| 138 |
+
},
|
| 139 |
+
"17": {
|
| 140 |
+
"content": "<[PLHD17_never_used]>",
|
| 141 |
+
"lstrip": false,
|
| 142 |
+
"normalized": false,
|
| 143 |
+
"rstrip": false,
|
| 144 |
+
"single_word": false,
|
| 145 |
+
"special": true
|
| 146 |
+
},
|
| 147 |
+
"18": {
|
| 148 |
+
"content": "<[PLHD18_never_used]>",
|
| 149 |
+
"lstrip": false,
|
| 150 |
+
"normalized": false,
|
| 151 |
+
"rstrip": false,
|
| 152 |
+
"single_word": false,
|
| 153 |
+
"special": true
|
| 154 |
+
},
|
| 155 |
+
"19": {
|
| 156 |
+
"content": "<[PLHD19_never_used]>",
|
| 157 |
+
"lstrip": false,
|
| 158 |
+
"normalized": false,
|
| 159 |
+
"rstrip": false,
|
| 160 |
+
"single_word": false,
|
| 161 |
+
"special": true
|
| 162 |
+
},
|
| 163 |
+
"20": {
|
| 164 |
+
"content": "<[PLHD20_never_used]>",
|
| 165 |
+
"lstrip": false,
|
| 166 |
+
"normalized": false,
|
| 167 |
+
"rstrip": false,
|
| 168 |
+
"single_word": false,
|
| 169 |
+
"special": true
|
| 170 |
+
},
|
| 171 |
+
"21": {
|
| 172 |
+
"content": "<[PLHD21_never_used]>",
|
| 173 |
+
"lstrip": false,
|
| 174 |
+
"normalized": false,
|
| 175 |
+
"rstrip": false,
|
| 176 |
+
"single_word": false,
|
| 177 |
+
"special": true
|
| 178 |
+
},
|
| 179 |
+
"22": {
|
| 180 |
+
"content": "<[PLHD22_never_used]>",
|
| 181 |
+
"lstrip": false,
|
| 182 |
+
"normalized": false,
|
| 183 |
+
"rstrip": false,
|
| 184 |
+
"single_word": false,
|
| 185 |
+
"special": true
|
| 186 |
+
},
|
| 187 |
+
"23": {
|
| 188 |
+
"content": "<[PLHD23_never_used]>",
|
| 189 |
+
"lstrip": false,
|
| 190 |
+
"normalized": false,
|
| 191 |
+
"rstrip": false,
|
| 192 |
+
"single_word": false,
|
| 193 |
+
"special": true
|
| 194 |
+
},
|
| 195 |
+
"24": {
|
| 196 |
+
"content": "<[PLHD24_never_used]>",
|
| 197 |
+
"lstrip": false,
|
| 198 |
+
"normalized": false,
|
| 199 |
+
"rstrip": false,
|
| 200 |
+
"single_word": false,
|
| 201 |
+
"special": true
|
| 202 |
+
},
|
| 203 |
+
"25": {
|
| 204 |
+
"content": "<[PLHD25_never_used]>",
|
| 205 |
+
"lstrip": false,
|
| 206 |
+
"normalized": false,
|
| 207 |
+
"rstrip": false,
|
| 208 |
+
"single_word": false,
|
| 209 |
+
"special": true
|
| 210 |
+
},
|
| 211 |
+
"26": {
|
| 212 |
+
"content": "<[PLHD26_never_used]>",
|
| 213 |
+
"lstrip": false,
|
| 214 |
+
"normalized": false,
|
| 215 |
+
"rstrip": false,
|
| 216 |
+
"single_word": false,
|
| 217 |
+
"special": true
|
| 218 |
+
},
|
| 219 |
+
"27": {
|
| 220 |
+
"content": "<[PLHD27_never_used]>",
|
| 221 |
+
"lstrip": false,
|
| 222 |
+
"normalized": false,
|
| 223 |
+
"rstrip": false,
|
| 224 |
+
"single_word": false,
|
| 225 |
+
"special": true
|
| 226 |
+
},
|
| 227 |
+
"28": {
|
| 228 |
+
"content": "<[PLHD28_never_used]>",
|
| 229 |
+
"lstrip": false,
|
| 230 |
+
"normalized": false,
|
| 231 |
+
"rstrip": false,
|
| 232 |
+
"single_word": false,
|
| 233 |
+
"special": true
|
| 234 |
+
},
|
| 235 |
+
"29": {
|
| 236 |
+
"content": "<[PLHD29_never_used]>",
|
| 237 |
+
"lstrip": false,
|
| 238 |
+
"normalized": false,
|
| 239 |
+
"rstrip": false,
|
| 240 |
+
"single_word": false,
|
| 241 |
+
"special": true
|
| 242 |
+
},
|
| 243 |
+
"30": {
|
| 244 |
+
"content": "<[PLHD30_never_used]>",
|
| 245 |
+
"lstrip": false,
|
| 246 |
+
"normalized": false,
|
| 247 |
+
"rstrip": false,
|
| 248 |
+
"single_word": false,
|
| 249 |
+
"special": true
|
| 250 |
+
},
|
| 251 |
+
"31": {
|
| 252 |
+
"content": "<[PLHD31_never_used]>",
|
| 253 |
+
"lstrip": false,
|
| 254 |
+
"normalized": false,
|
| 255 |
+
"rstrip": false,
|
| 256 |
+
"single_word": false,
|
| 257 |
+
"special": true
|
| 258 |
+
},
|
| 259 |
+
"32": {
|
| 260 |
+
"content": "<[PLHD32_never_used]>",
|
| 261 |
+
"lstrip": false,
|
| 262 |
+
"normalized": false,
|
| 263 |
+
"rstrip": false,
|
| 264 |
+
"single_word": false,
|
| 265 |
+
"special": true
|
| 266 |
+
},
|
| 267 |
+
"33": {
|
| 268 |
+
"content": "<[PLHD33_never_used]>",
|
| 269 |
+
"lstrip": false,
|
| 270 |
+
"normalized": false,
|
| 271 |
+
"rstrip": false,
|
| 272 |
+
"single_word": false,
|
| 273 |
+
"special": true
|
| 274 |
+
},
|
| 275 |
+
"34": {
|
| 276 |
+
"content": "<[PLHD34_never_used]>",
|
| 277 |
+
"lstrip": false,
|
| 278 |
+
"normalized": false,
|
| 279 |
+
"rstrip": false,
|
| 280 |
+
"single_word": false,
|
| 281 |
+
"special": true
|
| 282 |
+
},
|
| 283 |
+
"35": {
|
| 284 |
+
"content": "<[PLHD35_never_used]>",
|
| 285 |
+
"lstrip": false,
|
| 286 |
+
"normalized": false,
|
| 287 |
+
"rstrip": false,
|
| 288 |
+
"single_word": false,
|
| 289 |
+
"special": true
|
| 290 |
+
},
|
| 291 |
+
"36": {
|
| 292 |
+
"content": "<[PLHD36_never_used]>",
|
| 293 |
+
"lstrip": false,
|
| 294 |
+
"normalized": false,
|
| 295 |
+
"rstrip": false,
|
| 296 |
+
"single_word": false,
|
| 297 |
+
"special": true
|
| 298 |
+
},
|
| 299 |
+
"37": {
|
| 300 |
+
"content": "<[PLHD37_never_used]>",
|
| 301 |
+
"lstrip": false,
|
| 302 |
+
"normalized": false,
|
| 303 |
+
"rstrip": false,
|
| 304 |
+
"single_word": false,
|
| 305 |
+
"special": true
|
| 306 |
+
},
|
| 307 |
+
"38": {
|
| 308 |
+
"content": "<[PLHD38_never_used]>",
|
| 309 |
+
"lstrip": false,
|
| 310 |
+
"normalized": false,
|
| 311 |
+
"rstrip": false,
|
| 312 |
+
"single_word": false,
|
| 313 |
+
"special": true
|
| 314 |
+
},
|
| 315 |
+
"39": {
|
| 316 |
+
"content": "<[PLHD39_never_used]>",
|
| 317 |
+
"lstrip": false,
|
| 318 |
+
"normalized": false,
|
| 319 |
+
"rstrip": false,
|
| 320 |
+
"single_word": false,
|
| 321 |
+
"special": true
|
| 322 |
+
},
|
| 323 |
+
"40": {
|
| 324 |
+
"content": "<[PLHD40_never_used]>",
|
| 325 |
+
"lstrip": false,
|
| 326 |
+
"normalized": false,
|
| 327 |
+
"rstrip": false,
|
| 328 |
+
"single_word": false,
|
| 329 |
+
"special": true
|
| 330 |
+
},
|
| 331 |
+
"41": {
|
| 332 |
+
"content": "<[PLHD41_never_used]>",
|
| 333 |
+
"lstrip": false,
|
| 334 |
+
"normalized": false,
|
| 335 |
+
"rstrip": false,
|
| 336 |
+
"single_word": false,
|
| 337 |
+
"special": true
|
| 338 |
+
},
|
| 339 |
+
"42": {
|
| 340 |
+
"content": "<[PLHD42_never_used]>",
|
| 341 |
+
"lstrip": false,
|
| 342 |
+
"normalized": false,
|
| 343 |
+
"rstrip": false,
|
| 344 |
+
"single_word": false,
|
| 345 |
+
"special": true
|
| 346 |
+
},
|
| 347 |
+
"43": {
|
| 348 |
+
"content": "<[PLHD43_never_used]>",
|
| 349 |
+
"lstrip": false,
|
| 350 |
+
"normalized": false,
|
| 351 |
+
"rstrip": false,
|
| 352 |
+
"single_word": false,
|
| 353 |
+
"special": true
|
| 354 |
+
},
|
| 355 |
+
"44": {
|
| 356 |
+
"content": "<[PLHD44_never_used]>",
|
| 357 |
+
"lstrip": false,
|
| 358 |
+
"normalized": false,
|
| 359 |
+
"rstrip": false,
|
| 360 |
+
"single_word": false,
|
| 361 |
+
"special": true
|
| 362 |
+
},
|
| 363 |
+
"45": {
|
| 364 |
+
"content": "<[PLHD45_never_used]>",
|
| 365 |
+
"lstrip": false,
|
| 366 |
+
"normalized": false,
|
| 367 |
+
"rstrip": false,
|
| 368 |
+
"single_word": false,
|
| 369 |
+
"special": true
|
| 370 |
+
},
|
| 371 |
+
"46": {
|
| 372 |
+
"content": "<[PLHD46_never_used]>",
|
| 373 |
+
"lstrip": false,
|
| 374 |
+
"normalized": false,
|
| 375 |
+
"rstrip": false,
|
| 376 |
+
"single_word": false,
|
| 377 |
+
"special": true
|
| 378 |
+
},
|
| 379 |
+
"47": {
|
| 380 |
+
"content": "<[PLHD47_never_used]>",
|
| 381 |
+
"lstrip": false,
|
| 382 |
+
"normalized": false,
|
| 383 |
+
"rstrip": false,
|
| 384 |
+
"single_word": false,
|
| 385 |
+
"special": true
|
| 386 |
+
},
|
| 387 |
+
"48": {
|
| 388 |
+
"content": "<[PLHD48_never_used]>",
|
| 389 |
+
"lstrip": false,
|
| 390 |
+
"normalized": false,
|
| 391 |
+
"rstrip": false,
|
| 392 |
+
"single_word": false,
|
| 393 |
+
"special": true
|
| 394 |
+
},
|
| 395 |
+
"49": {
|
| 396 |
+
"content": "<[PLHD49_never_used]>",
|
| 397 |
+
"lstrip": false,
|
| 398 |
+
"normalized": false,
|
| 399 |
+
"rstrip": false,
|
| 400 |
+
"single_word": false,
|
| 401 |
+
"special": true
|
| 402 |
+
},
|
| 403 |
+
"50": {
|
| 404 |
+
"content": "<[PLHD50_never_used]>",
|
| 405 |
+
"lstrip": false,
|
| 406 |
+
"normalized": false,
|
| 407 |
+
"rstrip": false,
|
| 408 |
+
"single_word": false,
|
| 409 |
+
"special": true
|
| 410 |
+
},
|
| 411 |
+
"51": {
|
| 412 |
+
"content": "<[PLHD51_never_used]>",
|
| 413 |
+
"lstrip": false,
|
| 414 |
+
"normalized": false,
|
| 415 |
+
"rstrip": false,
|
| 416 |
+
"single_word": false,
|
| 417 |
+
"special": true
|
| 418 |
+
},
|
| 419 |
+
"52": {
|
| 420 |
+
"content": "<[PLHD52_never_used]>",
|
| 421 |
+
"lstrip": false,
|
| 422 |
+
"normalized": false,
|
| 423 |
+
"rstrip": false,
|
| 424 |
+
"single_word": false,
|
| 425 |
+
"special": true
|
| 426 |
+
},
|
| 427 |
+
"53": {
|
| 428 |
+
"content": "<[PLHD53_never_used]>",
|
| 429 |
+
"lstrip": false,
|
| 430 |
+
"normalized": false,
|
| 431 |
+
"rstrip": false,
|
| 432 |
+
"single_word": false,
|
| 433 |
+
"special": true
|
| 434 |
+
},
|
| 435 |
+
"54": {
|
| 436 |
+
"content": "<[PLHD54_never_used]>",
|
| 437 |
+
"lstrip": false,
|
| 438 |
+
"normalized": false,
|
| 439 |
+
"rstrip": false,
|
| 440 |
+
"single_word": false,
|
| 441 |
+
"special": true
|
| 442 |
+
},
|
| 443 |
+
"55": {
|
| 444 |
+
"content": "<[PLHD55_never_used]>",
|
| 445 |
+
"lstrip": false,
|
| 446 |
+
"normalized": false,
|
| 447 |
+
"rstrip": false,
|
| 448 |
+
"single_word": false,
|
| 449 |
+
"special": true
|
| 450 |
+
},
|
| 451 |
+
"56": {
|
| 452 |
+
"content": "<[PLHD56_never_used]>",
|
| 453 |
+
"lstrip": false,
|
| 454 |
+
"normalized": false,
|
| 455 |
+
"rstrip": false,
|
| 456 |
+
"single_word": false,
|
| 457 |
+
"special": true
|
| 458 |
+
},
|
| 459 |
+
"57": {
|
| 460 |
+
"content": "<[PLHD57_never_used]>",
|
| 461 |
+
"lstrip": false,
|
| 462 |
+
"normalized": false,
|
| 463 |
+
"rstrip": false,
|
| 464 |
+
"single_word": false,
|
| 465 |
+
"special": true
|
| 466 |
+
},
|
| 467 |
+
"58": {
|
| 468 |
+
"content": "<[PLHD58_never_used]>",
|
| 469 |
+
"lstrip": false,
|
| 470 |
+
"normalized": false,
|
| 471 |
+
"rstrip": false,
|
| 472 |
+
"single_word": false,
|
| 473 |
+
"special": true
|
| 474 |
+
},
|
| 475 |
+
"59": {
|
| 476 |
+
"content": "<[PLHD59_never_used]>",
|
| 477 |
+
"lstrip": false,
|
| 478 |
+
"normalized": false,
|
| 479 |
+
"rstrip": false,
|
| 480 |
+
"single_word": false,
|
| 481 |
+
"special": true
|
| 482 |
+
},
|
| 483 |
+
"60": {
|
| 484 |
+
"content": "<[PLHD60_never_used]>",
|
| 485 |
+
"lstrip": false,
|
| 486 |
+
"normalized": false,
|
| 487 |
+
"rstrip": false,
|
| 488 |
+
"single_word": false,
|
| 489 |
+
"special": true
|
| 490 |
+
},
|
| 491 |
+
"61": {
|
| 492 |
+
"content": "<[PLHD61_never_used]>",
|
| 493 |
+
"lstrip": false,
|
| 494 |
+
"normalized": false,
|
| 495 |
+
"rstrip": false,
|
| 496 |
+
"single_word": false,
|
| 497 |
+
"special": true
|
| 498 |
+
},
|
| 499 |
+
"62": {
|
| 500 |
+
"content": "<[PLHD62_never_used]>",
|
| 501 |
+
"lstrip": false,
|
| 502 |
+
"normalized": false,
|
| 503 |
+
"rstrip": false,
|
| 504 |
+
"single_word": false,
|
| 505 |
+
"special": true
|
| 506 |
+
},
|
| 507 |
+
"63": {
|
| 508 |
+
"content": "<[PLHD63_never_used]>",
|
| 509 |
+
"lstrip": false,
|
| 510 |
+
"normalized": false,
|
| 511 |
+
"rstrip": false,
|
| 512 |
+
"single_word": false,
|
| 513 |
+
"special": true
|
| 514 |
+
},
|
| 515 |
+
"64": {
|
| 516 |
+
"content": "<[PLHD64_never_used]>",
|
| 517 |
+
"lstrip": false,
|
| 518 |
+
"normalized": false,
|
| 519 |
+
"rstrip": false,
|
| 520 |
+
"single_word": false,
|
| 521 |
+
"special": true
|
| 522 |
+
},
|
| 523 |
+
"65": {
|
| 524 |
+
"content": "<[PLHD65_never_used]>",
|
| 525 |
+
"lstrip": false,
|
| 526 |
+
"normalized": false,
|
| 527 |
+
"rstrip": false,
|
| 528 |
+
"single_word": false,
|
| 529 |
+
"special": true
|
| 530 |
+
},
|
| 531 |
+
"66": {
|
| 532 |
+
"content": "<[PLHD66_never_used]>",
|
| 533 |
+
"lstrip": false,
|
| 534 |
+
"normalized": false,
|
| 535 |
+
"rstrip": false,
|
| 536 |
+
"single_word": false,
|
| 537 |
+
"special": true
|
| 538 |
+
},
|
| 539 |
+
"67": {
|
| 540 |
+
"content": "<[PLHD67_never_used]>",
|
| 541 |
+
"lstrip": false,
|
| 542 |
+
"normalized": false,
|
| 543 |
+
"rstrip": false,
|
| 544 |
+
"single_word": false,
|
| 545 |
+
"special": true
|
| 546 |
+
},
|
| 547 |
+
"68": {
|
| 548 |
+
"content": "<[PLHD68_never_used]>",
|
| 549 |
+
"lstrip": false,
|
| 550 |
+
"normalized": false,
|
| 551 |
+
"rstrip": false,
|
| 552 |
+
"single_word": false,
|
| 553 |
+
"special": true
|
| 554 |
+
},
|
| 555 |
+
"69": {
|
| 556 |
+
"content": "<[PLHD69_never_used]>",
|
| 557 |
+
"lstrip": false,
|
| 558 |
+
"normalized": false,
|
| 559 |
+
"rstrip": false,
|
| 560 |
+
"single_word": false,
|
| 561 |
+
"special": true
|
| 562 |
+
},
|
| 563 |
+
"70": {
|
| 564 |
+
"content": "<[PLHD70_never_used]>",
|
| 565 |
+
"lstrip": false,
|
| 566 |
+
"normalized": false,
|
| 567 |
+
"rstrip": false,
|
| 568 |
+
"single_word": false,
|
| 569 |
+
"special": true
|
| 570 |
+
},
|
| 571 |
+
"71": {
|
| 572 |
+
"content": "<[PLHD71_never_used]>",
|
| 573 |
+
"lstrip": false,
|
| 574 |
+
"normalized": false,
|
| 575 |
+
"rstrip": false,
|
| 576 |
+
"single_word": false,
|
| 577 |
+
"special": true
|
| 578 |
+
},
|
| 579 |
+
"72": {
|
| 580 |
+
"content": "<[PLHD72_never_used]>",
|
| 581 |
+
"lstrip": false,
|
| 582 |
+
"normalized": false,
|
| 583 |
+
"rstrip": false,
|
| 584 |
+
"single_word": false,
|
| 585 |
+
"special": true
|
| 586 |
+
},
|
| 587 |
+
"73": {
|
| 588 |
+
"content": "<[PLHD73_never_used]>",
|
| 589 |
+
"lstrip": false,
|
| 590 |
+
"normalized": false,
|
| 591 |
+
"rstrip": false,
|
| 592 |
+
"single_word": false,
|
| 593 |
+
"special": true
|
| 594 |
+
},
|
| 595 |
+
"74": {
|
| 596 |
+
"content": "<[PLHD74_never_used]>",
|
| 597 |
+
"lstrip": false,
|
| 598 |
+
"normalized": false,
|
| 599 |
+
"rstrip": false,
|
| 600 |
+
"single_word": false,
|
| 601 |
+
"special": true
|
| 602 |
+
},
|
| 603 |
+
"75": {
|
| 604 |
+
"content": "<[PLHD75_never_used]>",
|
| 605 |
+
"lstrip": false,
|
| 606 |
+
"normalized": false,
|
| 607 |
+
"rstrip": false,
|
| 608 |
+
"single_word": false,
|
| 609 |
+
"special": true
|
| 610 |
+
},
|
| 611 |
+
"76": {
|
| 612 |
+
"content": "<[PLHD76_never_used]>",
|
| 613 |
+
"lstrip": false,
|
| 614 |
+
"normalized": false,
|
| 615 |
+
"rstrip": false,
|
| 616 |
+
"single_word": false,
|
| 617 |
+
"special": true
|
| 618 |
+
},
|
| 619 |
+
"77": {
|
| 620 |
+
"content": "<[PLHD77_never_used]>",
|
| 621 |
+
"lstrip": false,
|
| 622 |
+
"normalized": false,
|
| 623 |
+
"rstrip": false,
|
| 624 |
+
"single_word": false,
|
| 625 |
+
"special": true
|
| 626 |
+
},
|
| 627 |
+
"78": {
|
| 628 |
+
"content": "<[PLHD78_never_used]>",
|
| 629 |
+
"lstrip": false,
|
| 630 |
+
"normalized": false,
|
| 631 |
+
"rstrip": false,
|
| 632 |
+
"single_word": false,
|
| 633 |
+
"special": true
|
| 634 |
+
},
|
| 635 |
+
"79": {
|
| 636 |
+
"content": "<[PLHD79_never_used]>",
|
| 637 |
+
"lstrip": false,
|
| 638 |
+
"normalized": false,
|
| 639 |
+
"rstrip": false,
|
| 640 |
+
"single_word": false,
|
| 641 |
+
"special": true
|
| 642 |
+
},
|
| 643 |
+
"80": {
|
| 644 |
+
"content": "<[PLHD80_never_used]>",
|
| 645 |
+
"lstrip": false,
|
| 646 |
+
"normalized": false,
|
| 647 |
+
"rstrip": false,
|
| 648 |
+
"single_word": false,
|
| 649 |
+
"special": true
|
| 650 |
+
},
|
| 651 |
+
"81": {
|
| 652 |
+
"content": "<[PLHD81_never_used]>",
|
| 653 |
+
"lstrip": false,
|
| 654 |
+
"normalized": false,
|
| 655 |
+
"rstrip": false,
|
| 656 |
+
"single_word": false,
|
| 657 |
+
"special": true
|
| 658 |
+
},
|
| 659 |
+
"82": {
|
| 660 |
+
"content": "<[PLHD82_never_used]>",
|
| 661 |
+
"lstrip": false,
|
| 662 |
+
"normalized": false,
|
| 663 |
+
"rstrip": false,
|
| 664 |
+
"single_word": false,
|
| 665 |
+
"special": true
|
| 666 |
+
},
|
| 667 |
+
"83": {
|
| 668 |
+
"content": "<[PLHD83_never_used]>",
|
| 669 |
+
"lstrip": false,
|
| 670 |
+
"normalized": false,
|
| 671 |
+
"rstrip": false,
|
| 672 |
+
"single_word": false,
|
| 673 |
+
"special": true
|
| 674 |
+
},
|
| 675 |
+
"84": {
|
| 676 |
+
"content": "<[PLHD84_never_used]>",
|
| 677 |
+
"lstrip": false,
|
| 678 |
+
"normalized": false,
|
| 679 |
+
"rstrip": false,
|
| 680 |
+
"single_word": false,
|
| 681 |
+
"special": true
|
| 682 |
+
},
|
| 683 |
+
"85": {
|
| 684 |
+
"content": "<[PLHD85_never_used]>",
|
| 685 |
+
"lstrip": false,
|
| 686 |
+
"normalized": false,
|
| 687 |
+
"rstrip": false,
|
| 688 |
+
"single_word": false,
|
| 689 |
+
"special": true
|
| 690 |
+
},
|
| 691 |
+
"86": {
|
| 692 |
+
"content": "<[PLHD86_never_used]>",
|
| 693 |
+
"lstrip": false,
|
| 694 |
+
"normalized": false,
|
| 695 |
+
"rstrip": false,
|
| 696 |
+
"single_word": false,
|
| 697 |
+
"special": true
|
| 698 |
+
},
|
| 699 |
+
"87": {
|
| 700 |
+
"content": "<[PLHD87_never_used]>",
|
| 701 |
+
"lstrip": false,
|
| 702 |
+
"normalized": false,
|
| 703 |
+
"rstrip": false,
|
| 704 |
+
"single_word": false,
|
| 705 |
+
"special": true
|
| 706 |
+
},
|
| 707 |
+
"88": {
|
| 708 |
+
"content": "<[PLHD88_never_used]>",
|
| 709 |
+
"lstrip": false,
|
| 710 |
+
"normalized": false,
|
| 711 |
+
"rstrip": false,
|
| 712 |
+
"single_word": false,
|
| 713 |
+
"special": true
|
| 714 |
+
},
|
| 715 |
+
"89": {
|
| 716 |
+
"content": "<[PLHD89_never_used]>",
|
| 717 |
+
"lstrip": false,
|
| 718 |
+
"normalized": false,
|
| 719 |
+
"rstrip": false,
|
| 720 |
+
"single_word": false,
|
| 721 |
+
"special": true
|
| 722 |
+
},
|
| 723 |
+
"90": {
|
| 724 |
+
"content": "<[PLHD90_never_used]>",
|
| 725 |
+
"lstrip": false,
|
| 726 |
+
"normalized": false,
|
| 727 |
+
"rstrip": false,
|
| 728 |
+
"single_word": false,
|
| 729 |
+
"special": true
|
| 730 |
+
},
|
| 731 |
+
"91": {
|
| 732 |
+
"content": "<[PLHD91_never_used]>",
|
| 733 |
+
"lstrip": false,
|
| 734 |
+
"normalized": false,
|
| 735 |
+
"rstrip": false,
|
| 736 |
+
"single_word": false,
|
| 737 |
+
"special": true
|
| 738 |
+
},
|
| 739 |
+
"92": {
|
| 740 |
+
"content": "<[PLHD92_never_used]>",
|
| 741 |
+
"lstrip": false,
|
| 742 |
+
"normalized": false,
|
| 743 |
+
"rstrip": false,
|
| 744 |
+
"single_word": false,
|
| 745 |
+
"special": true
|
| 746 |
+
},
|
| 747 |
+
"93": {
|
| 748 |
+
"content": "<[PLHD93_never_used]>",
|
| 749 |
+
"lstrip": false,
|
| 750 |
+
"normalized": false,
|
| 751 |
+
"rstrip": false,
|
| 752 |
+
"single_word": false,
|
| 753 |
+
"special": true
|
| 754 |
+
},
|
| 755 |
+
"94": {
|
| 756 |
+
"content": "<[PLHD94_never_used]>",
|
| 757 |
+
"lstrip": false,
|
| 758 |
+
"normalized": false,
|
| 759 |
+
"rstrip": false,
|
| 760 |
+
"single_word": false,
|
| 761 |
+
"special": true
|
| 762 |
+
},
|
| 763 |
+
"95": {
|
| 764 |
+
"content": "<[PLHD95_never_used]>",
|
| 765 |
+
"lstrip": false,
|
| 766 |
+
"normalized": false,
|
| 767 |
+
"rstrip": false,
|
| 768 |
+
"single_word": false,
|
| 769 |
+
"special": true
|
| 770 |
+
},
|
| 771 |
+
"96": {
|
| 772 |
+
"content": "<[PLHD96_never_used]>",
|
| 773 |
+
"lstrip": false,
|
| 774 |
+
"normalized": false,
|
| 775 |
+
"rstrip": false,
|
| 776 |
+
"single_word": false,
|
| 777 |
+
"special": true
|
| 778 |
+
},
|
| 779 |
+
"97": {
|
| 780 |
+
"content": "<[PLHD97_never_used]>",
|
| 781 |
+
"lstrip": false,
|
| 782 |
+
"normalized": false,
|
| 783 |
+
"rstrip": false,
|
| 784 |
+
"single_word": false,
|
| 785 |
+
"special": true
|
| 786 |
+
},
|
| 787 |
+
"98": {
|
| 788 |
+
"content": "<[PLHD98_never_used]>",
|
| 789 |
+
"lstrip": false,
|
| 790 |
+
"normalized": false,
|
| 791 |
+
"rstrip": false,
|
| 792 |
+
"single_word": false,
|
| 793 |
+
"special": true
|
| 794 |
+
},
|
| 795 |
+
"99": {
|
| 796 |
+
"content": "<[PLHD99_never_used]>",
|
| 797 |
+
"lstrip": false,
|
| 798 |
+
"normalized": false,
|
| 799 |
+
"rstrip": false,
|
| 800 |
+
"single_word": false,
|
| 801 |
+
"special": true
|
| 802 |
+
},
|
| 803 |
+
"100": {
|
| 804 |
+
"content": "<[PLHD100_never_used]>",
|
| 805 |
+
"lstrip": false,
|
| 806 |
+
"normalized": false,
|
| 807 |
+
"rstrip": false,
|
| 808 |
+
"single_word": false,
|
| 809 |
+
"special": true
|
| 810 |
+
},
|
| 811 |
+
"101": {
|
| 812 |
+
"content": "<[PLHD101_never_used]>",
|
| 813 |
+
"lstrip": false,
|
| 814 |
+
"normalized": false,
|
| 815 |
+
"rstrip": false,
|
| 816 |
+
"single_word": false,
|
| 817 |
+
"special": true
|
| 818 |
+
},
|
| 819 |
+
"102": {
|
| 820 |
+
"content": "<[PLHD102_never_used]>",
|
| 821 |
+
"lstrip": false,
|
| 822 |
+
"normalized": false,
|
| 823 |
+
"rstrip": false,
|
| 824 |
+
"single_word": false,
|
| 825 |
+
"special": true
|
| 826 |
+
},
|
| 827 |
+
"103": {
|
| 828 |
+
"content": "<[PLHD103_never_used]>",
|
| 829 |
+
"lstrip": false,
|
| 830 |
+
"normalized": false,
|
| 831 |
+
"rstrip": false,
|
| 832 |
+
"single_word": false,
|
| 833 |
+
"special": true
|
| 834 |
+
},
|
| 835 |
+
"104": {
|
| 836 |
+
"content": "<[PLHD104_never_used]>",
|
| 837 |
+
"lstrip": false,
|
| 838 |
+
"normalized": false,
|
| 839 |
+
"rstrip": false,
|
| 840 |
+
"single_word": false,
|
| 841 |
+
"special": true
|
| 842 |
+
},
|
| 843 |
+
"105": {
|
| 844 |
+
"content": "<[PLHD105_never_used]>",
|
| 845 |
+
"lstrip": false,
|
| 846 |
+
"normalized": false,
|
| 847 |
+
"rstrip": false,
|
| 848 |
+
"single_word": false,
|
| 849 |
+
"special": true
|
| 850 |
+
},
|
| 851 |
+
"106": {
|
| 852 |
+
"content": "<[PLHD106_never_used]>",
|
| 853 |
+
"lstrip": false,
|
| 854 |
+
"normalized": false,
|
| 855 |
+
"rstrip": false,
|
| 856 |
+
"single_word": false,
|
| 857 |
+
"special": true
|
| 858 |
+
},
|
| 859 |
+
"107": {
|
| 860 |
+
"content": "<[PLHD107_never_used]>",
|
| 861 |
+
"lstrip": false,
|
| 862 |
+
"normalized": false,
|
| 863 |
+
"rstrip": false,
|
| 864 |
+
"single_word": false,
|
| 865 |
+
"special": true
|
| 866 |
+
},
|
| 867 |
+
"108": {
|
| 868 |
+
"content": "<[PLHD108_never_used]>",
|
| 869 |
+
"lstrip": false,
|
| 870 |
+
"normalized": false,
|
| 871 |
+
"rstrip": false,
|
| 872 |
+
"single_word": false,
|
| 873 |
+
"special": true
|
| 874 |
+
},
|
| 875 |
+
"109": {
|
| 876 |
+
"content": "<[PLHD109_never_used]>",
|
| 877 |
+
"lstrip": false,
|
| 878 |
+
"normalized": false,
|
| 879 |
+
"rstrip": false,
|
| 880 |
+
"single_word": false,
|
| 881 |
+
"special": true
|
| 882 |
+
},
|
| 883 |
+
"110": {
|
| 884 |
+
"content": "<[PLHD110_never_used]>",
|
| 885 |
+
"lstrip": false,
|
| 886 |
+
"normalized": false,
|
| 887 |
+
"rstrip": false,
|
| 888 |
+
"single_word": false,
|
| 889 |
+
"special": true
|
| 890 |
+
},
|
| 891 |
+
"111": {
|
| 892 |
+
"content": "<[PLHD111_never_used]>",
|
| 893 |
+
"lstrip": false,
|
| 894 |
+
"normalized": false,
|
| 895 |
+
"rstrip": false,
|
| 896 |
+
"single_word": false,
|
| 897 |
+
"special": true
|
| 898 |
+
},
|
| 899 |
+
"112": {
|
| 900 |
+
"content": "<[PLHD112_never_used]>",
|
| 901 |
+
"lstrip": false,
|
| 902 |
+
"normalized": false,
|
| 903 |
+
"rstrip": false,
|
| 904 |
+
"single_word": false,
|
| 905 |
+
"special": true
|
| 906 |
+
},
|
| 907 |
+
"113": {
|
| 908 |
+
"content": "<[PLHD113_never_used]>",
|
| 909 |
+
"lstrip": false,
|
| 910 |
+
"normalized": false,
|
| 911 |
+
"rstrip": false,
|
| 912 |
+
"single_word": false,
|
| 913 |
+
"special": true
|
| 914 |
+
},
|
| 915 |
+
"114": {
|
| 916 |
+
"content": "<[PLHD114_never_used]>",
|
| 917 |
+
"lstrip": false,
|
| 918 |
+
"normalized": false,
|
| 919 |
+
"rstrip": false,
|
| 920 |
+
"single_word": false,
|
| 921 |
+
"special": true
|
| 922 |
+
},
|
| 923 |
+
"115": {
|
| 924 |
+
"content": "<[PLHD115_never_used]>",
|
| 925 |
+
"lstrip": false,
|
| 926 |
+
"normalized": false,
|
| 927 |
+
"rstrip": false,
|
| 928 |
+
"single_word": false,
|
| 929 |
+
"special": true
|
| 930 |
+
},
|
| 931 |
+
"116": {
|
| 932 |
+
"content": "<[PLHD116_never_used]>",
|
| 933 |
+
"lstrip": false,
|
| 934 |
+
"normalized": false,
|
| 935 |
+
"rstrip": false,
|
| 936 |
+
"single_word": false,
|
| 937 |
+
"special": true
|
| 938 |
+
},
|
| 939 |
+
"117": {
|
| 940 |
+
"content": "<[PLHD117_never_used]>",
|
| 941 |
+
"lstrip": false,
|
| 942 |
+
"normalized": false,
|
| 943 |
+
"rstrip": false,
|
| 944 |
+
"single_word": false,
|
| 945 |
+
"special": true
|
| 946 |
+
},
|
| 947 |
+
"118": {
|
| 948 |
+
"content": "<[PLHD118_never_used]>",
|
| 949 |
+
"lstrip": false,
|
| 950 |
+
"normalized": false,
|
| 951 |
+
"rstrip": false,
|
| 952 |
+
"single_word": false,
|
| 953 |
+
"special": true
|
| 954 |
+
},
|
| 955 |
+
"119": {
|
| 956 |
+
"content": "<[PLHD119_never_used]>",
|
| 957 |
+
"lstrip": false,
|
| 958 |
+
"normalized": false,
|
| 959 |
+
"rstrip": false,
|
| 960 |
+
"single_word": false,
|
| 961 |
+
"special": true
|
| 962 |
+
},
|
| 963 |
+
"120": {
|
| 964 |
+
"content": "<[PLHD120_never_used]>",
|
| 965 |
+
"lstrip": false,
|
| 966 |
+
"normalized": false,
|
| 967 |
+
"rstrip": false,
|
| 968 |
+
"single_word": false,
|
| 969 |
+
"special": true
|
| 970 |
+
},
|
| 971 |
+
"121": {
|
| 972 |
+
"content": "<[PLHD121_never_used]>",
|
| 973 |
+
"lstrip": false,
|
| 974 |
+
"normalized": false,
|
| 975 |
+
"rstrip": false,
|
| 976 |
+
"single_word": false,
|
| 977 |
+
"special": true
|
| 978 |
+
},
|
| 979 |
+
"122": {
|
| 980 |
+
"content": "<[PLHD122_never_used]>",
|
| 981 |
+
"lstrip": false,
|
| 982 |
+
"normalized": false,
|
| 983 |
+
"rstrip": false,
|
| 984 |
+
"single_word": false,
|
| 985 |
+
"special": true
|
| 986 |
+
},
|
| 987 |
+
"123": {
|
| 988 |
+
"content": "<[PLHD123_never_used]>",
|
| 989 |
+
"lstrip": false,
|
| 990 |
+
"normalized": false,
|
| 991 |
+
"rstrip": false,
|
| 992 |
+
"single_word": false,
|
| 993 |
+
"special": true
|
| 994 |
+
},
|
| 995 |
+
"124": {
|
| 996 |
+
"content": "<[PLHD124_never_used]>",
|
| 997 |
+
"lstrip": false,
|
| 998 |
+
"normalized": false,
|
| 999 |
+
"rstrip": false,
|
| 1000 |
+
"single_word": false,
|
| 1001 |
+
"special": true
|
| 1002 |
+
},
|
| 1003 |
+
"125": {
|
| 1004 |
+
"content": "<[PLHD125_never_used]>",
|
| 1005 |
+
"lstrip": false,
|
| 1006 |
+
"normalized": false,
|
| 1007 |
+
"rstrip": false,
|
| 1008 |
+
"single_word": false,
|
| 1009 |
+
"special": true
|
| 1010 |
+
},
|
| 1011 |
+
"126": {
|
| 1012 |
+
"content": "<[PLHD126_never_used]>",
|
| 1013 |
+
"lstrip": false,
|
| 1014 |
+
"normalized": false,
|
| 1015 |
+
"rstrip": false,
|
| 1016 |
+
"single_word": false,
|
| 1017 |
+
"special": true
|
| 1018 |
+
},
|
| 1019 |
+
"127": {
|
| 1020 |
+
"content": "<[PLHD127_never_used]>",
|
| 1021 |
+
"lstrip": false,
|
| 1022 |
+
"normalized": false,
|
| 1023 |
+
"rstrip": false,
|
| 1024 |
+
"single_word": false,
|
| 1025 |
+
"special": true
|
| 1026 |
+
}
|
| 1027 |
+
},
|
| 1028 |
+
"bos_token": "<seed:bos>",
|
| 1029 |
+
"clean_up_tokenization_spaces": false,
|
| 1030 |
+
"eos_token": "<seed:eos>",
|
| 1031 |
+
"extra_special_tokens": {},
|
| 1032 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 1033 |
+
"pad_token": "<seed:pad>",
|
| 1034 |
+
"tokenizer_class": "PreTrainedTokenizerFast"
|
| 1035 |
+
}
|
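For reference, a minimal sketch (not part of the uploaded files) of how the fields in this tokenizer_config.json surface through transformers once the repo is checked out locally. The local path "./" is an assumption, and the expected outputs in the comments are inferred from the config above rather than verified against the checkpoint:

from transformers import AutoTokenizer

# "tokenizer_class": "PreTrainedTokenizerFast" resolves to the fast tokenizer,
# backed by the tokenizer.json uploaded alongside this config.
tok = AutoTokenizer.from_pretrained("./")

print(tok.bos_token, tok.eos_token, tok.pad_token)
# expected: <seed:bos> <seed:eos> <seed:pad>

# Entries with "special": true are treated as atomic, never-split tokens;
# the key in added_tokens_decoder should be the token's id.
ids = tok.encode("<[PLHD59_never_used]>", add_special_tokens=False)
print(ids)                                        # expected: [59]
print(tok.decode(ids, skip_special_tokens=True))  # expected: '' (specials dropped)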