OpenMOSE committed
Commit d0551b0 · verified · 1 Parent(s): 80bc950

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja ADDED
@@ -0,0 +1,171 @@
1
+ {# ---------- special token variables ---------- #}
2
+ {%- set bos_token = '<seed:bos>' -%}
3
+ {%- set eos_token = '<seed:eos>' -%}
4
+ {%- set pad_token = '<seed:pad>' -%}
5
+ {%- set toolcall_begin_token = '<seed:tool_call>' -%}
6
+ {%- set toolcall_end_token = '</seed:tool_call>' -%}
7
+ {%- set think_begin_token = '<seed:think>' -%}
8
+ {%- set think_end_token = '</seed:think>' -%}
9
+ {%- set budget_begin_token = '<seed:cot_budget_reflect>'-%}
10
+ {%- set budget_end_token = '</seed:cot_budget_reflect>'-%}
11
+ {# -------------- reflection-interval lookup -------------- #}
12
+ {%- if not thinking_budget is defined %}
13
+ {%- set thinking_budget = -1 -%}
14
+ {%- endif -%}
15
+ {%- set budget_reflections_v05 = {
16
+ 0: 0,
17
+ 512: 128,
18
+ 1024: 256,
19
+ 2048: 512,
20
+ 4096: 512,
21
+ 8192: 1024,
22
+ 16384: 1024
23
+ } -%}
24
+ {# Find the first budget tier that is greater than or equal to the thinking_budget. #}
25
+ {%- set ns = namespace(interval = None) -%}
26
+ {%- for k, v in budget_reflections_v05 | dictsort -%}
27
+ {%- if ns.interval is none and thinking_budget <= k -%}
28
+ {%- set ns.interval = v -%}
29
+ {%- endif -%}
30
+ {%- endfor -%}
31
+ {# If it exceeds the largest tier, use the value of the last tier. #}
32
+ {%- if ns.interval is none -%}
33
+ {%- set ns.interval = budget_reflections_v05[16384] -%}
34
+ {%- endif -%}
35
+ {# ---------- Preprocess the system message ---------- #}
36
+ {%- if messages[0]["role"] == "system" %}
37
+ {%- set system_message = messages[0]["content"] %}
38
+ {%- set loop_messages = messages[1:] %}
39
+ {%- else %}
40
+ {%- set loop_messages = messages %}
41
+ {%- endif %}
42
+ {# ---------- Ensure tools exist ---------- #}
43
+ {%- if not tools is defined or tools is none %}
44
+ {%- set tools = [] %}
45
+ {%- endif %}
46
+ {# tools2doc.jinja #}
47
+ {%- macro py_type(t) -%}
48
+ {%- if t == "string" -%}str
49
+ {%- elif t in ("number", "integer") -%}int
50
+ {%- elif t == "boolean" -%}bool
51
+ {%- elif t == "array" -%}list
52
+ {%- else -%}Any{%- endif -%}
53
+ {%- endmacro -%}
54
+ {# ---------- Output the system block ---------- #}
55
+ {%- if system_message is defined %}
56
+ {{ bos_token + "system\n" + system_message }}
57
+ {%- else %}
58
+ {%- if tools is iterable and tools | length > 0 %}
59
+ {{ bos_token + "system\nYou are Doubao, a helpful AI assistant. You may call one or more functions to assist with the user query." }}
60
+ {%- endif %}
61
+ {%- endif %}
62
+ {%- if use_json_tooldef is defined and use_json_tooldef %}
63
+
64
+ {{"Tool List:\nYou are authorized to use the following tools (described in JSON Schema format). Before performing any task, you must decide how to call them based on the descriptions and parameters of these tools."}}
65
+ {{ tools | tojson(ensure_ascii=False) }}
66
+ {%- else %}
67
+ {%- for item in tools if item.type == "function" %}
68
+
69
+
70
+ Function:
71
+ def {{ item.function.name }}(
72
+ {%- for name, spec in item.function.parameters.properties.items() %}
73
+ {{- name }}: {{ py_type(spec.type) }}{% if not loop.last %},{% endif %}
74
+ {%- endfor %}):
75
+ """
76
+ {{ item.function.description | trim }}
77
+
78
+ {# ---------- Args ---------- #}
79
+ {%- if item.function.parameters.properties %}
80
+ Args:
81
+ {%- for name, spec in item.function.parameters.properties.items() %}
82
+
83
+ - {{ name }} ({{ py_type(spec.type) }})
84
+ {%- if name in item.function.parameters.required %} [必填]{% else %} [选填]{% endif %}:
85
+ {{- " " ~ (spec.description or "") }}
86
+ {%- endfor %}
87
+ {%- endif %}
88
+
89
+ {# ---------- Returns ---------- #}
90
+ {%- if item.function.returns is defined
91
+ and item.function.returns.properties is defined
92
+ and item.function.returns.properties %}
93
+ Returns:
94
+ {%- for name, spec in item.function.returns.properties.items() %}
95
+
96
+ - {{ name }} ({{ py_type(spec.type) }}):
97
+ {{- " " ~ (spec.description or "") }}
98
+ {%- endfor %}
99
+ {%- endif %}
100
+
101
+ """
102
+ {%- endfor %}
103
+ {%- endif %}
104
+ {%- if tools is iterable and tools | length > 0 %}
105
+
106
+ {{"工具调用请遵循如下格式:\n<seed:tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>value_1</parameter>\n<parameter=example_parameter_2>This is the value for the second parameter\nthat can span\nmultiple lines</parameter>\n</function>\n</seed:tool_call>\n"}}
107
+ {%- endif %}
108
+ {# End the system block line #}
109
+ {%- if system_message is defined or tools is iterable and tools | length > 0 %}
110
+ {{ eos_token }}
111
+ {%- endif %}
112
+ {# ---------- Thinking Budget ---------- #}
113
+ {%- if thinking_budget is defined %}
114
+ {%- if thinking_budget == 0 %}
115
+ {{ bos_token+"system" }}
116
+ {{ "You are an intelligent assistant that can answer questions in one step without the need for reasoning and thinking, that is, your thinking budget is 0. Next, please skip the thinking process and directly start answering the user's questions." }}
117
+ {{ eos_token }}
118
+ {%- elif not thinking_budget == -1 %}
119
+ {{ bos_token+"system" }}
120
+ {{ "You are an intelligent assistant with reflective ability. In the process of thinking and reasoning, you need to strictly follow the thinking budget, which is "}}{{thinking_budget}}{{". That is, you need to complete your thinking within "}}{{thinking_budget}}{{" tokens and start answering the user's questions. You will reflect on your thinking process every "}}{{ns.interval}}{{" tokens, stating how many tokens have been used and how many are left."}}
121
+ {{ eos_token }}
122
+ {%- endif %}
123
+ {%- endif %}
124
+ {# ---------- List the historical messages one by one ---------- #}
125
+ {%- for message in loop_messages %}
126
+ {%- if message.role == "assistant"
127
+ and message.tool_calls is defined
128
+ and message.tool_calls is iterable
129
+ and message.tool_calls | length > 0 %}
130
+ {{ bos_token + message.role }}
131
+ {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
132
+ {{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
133
+ {%- endif %}
134
+ {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
135
+ {{ "\n" + message.content | trim + "\n" }}
136
+ {%- endif %}
137
+ {%- for tool_call in message.tool_calls %}
138
+ {%- if tool_call.function is defined %}{% set tool_call = tool_call.function %}{% endif %}
139
+ {{ "\n" + toolcall_begin_token + "\n<function=" + tool_call.name + ">\n" }}
140
+ {%- if tool_call.arguments is defined %}
141
+ {%- for arg_name, arg_value in tool_call.arguments | items %}
142
+ {{ "<parameter=" + arg_name + ">" }}
143
+ {%- set arg_value = arg_value if arg_value is string else arg_value | string %}
144
+ {{ arg_value+"</parameter>\n" }}
145
+ {%- endfor %}
146
+ {%- endif %}
147
+ {{ "</function>\n" + toolcall_end_token }}
148
+ {%- endfor %}
149
+ {{ eos_token }}
150
+ {%- elif message.role in ["user", "system"] %}
151
+ {{ bos_token + message.role + "\n" + message.content + eos_token }}
152
+ {%- elif message.role == "assistant" %}
153
+ {{ bos_token + message.role }}
154
+ {%- if message.reasoning_content is defined and message.reasoning_content is string and message.reasoning_content | trim | length > 0 %}
155
+ {{ "\n" + think_begin_token + message.reasoning_content | trim + think_end_token }}
156
+ {%- endif %}
157
+ {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
158
+ {{ "\n" + message.content | trim + eos_token }}
159
+ {%- endif %}
160
+ {# Include the tool role #}
161
+ {%- else %}
162
+ {{ bos_token + message.role + "\n" + message.content + eos_token }}
163
+ {%- endif %}
164
+ {%- endfor %}
165
+ {# ---------- Control the model to start continuation ---------- #}
166
+ {%- if add_generation_prompt %}
167
+ {{ bos_token+"assistant\n" }}
168
+ {%- if thinking_budget == 0 %}
169
+ {{ think_begin_token + "\n" + budget_begin_token + "The current thinking budget is 0, so I will directly start answering the question." + budget_end_token + "\n" + think_end_token }}
170
+ {%- endif %}
171
+ {%- endif %}
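
A minimal sketch of driving this template from Python, assuming the tokenizer uploaded in this commit ships the template above and that extra keyword arguments such as `thinking_budget` are forwarded into the Jinja context by `apply_chat_template`; the repository id is a placeholder:

```python
# Minimal illustrative sketch: render the chat template with a thinking budget.
from transformers import AutoTokenizer

repo_id = "OpenMOSE/<this-repo>"  # placeholder, substitute the actual repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize RWKV in two sentences."},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    thinking_budget=512,  # maps to a 128-token reflection interval in the lookup above
)
print(prompt)
```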
config.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "architectures": [
3
+ "RWKV079Qwen3ForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_rwkv079qwen3.RWKV079Qwen3Config",
7
+ "AutoModelForCausalLM": "modeling_rwkv079qwen3.RWKV079Qwen3ForCausalLM"
8
+ },
9
+ "description": "Hybrid-RWKV Strategically Interleaved RWKV-Attention",
10
+ "base_model": "ByteDance-Seed/Seed-OSS-36B-Instruct",
11
+ "model_revision": "alpha",
12
+ "transformer_layers":[5,11,17,23,28,33,38,43,48,53,58,63],
13
+ "rwkv_layers": [0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,18,19,20,21,22,24,25,26,27,29,30,31,32,34,35,36,37,39,40,41,42,44,45,46,47,49,50,51,52,54,55,56,57,59,60,61,62],
14
+ "rwkv_architecture": "hxa079",
15
+ "enable_qk_norm": false,
16
+ "nope_in_transformer": true,
17
+ "nope_in_rwkv": false,
18
+ "lora_rank_decay": 448,
19
+ "lora_rank_iclr":192,
20
+ "lora_rank_value_residual_mix":128,
21
+ "lora_rank_key_residual_mix":128,
22
+ "lora_rank_gate":576,
23
+ "use_rope":true,
24
+
25
+
26
+
27
+ "attention_bias": true,
28
+ "attention_dropout": 0.1,
29
+ "attention_out_bias": false,
30
+ "bos_token_id": 0,
31
+ "pad_token_id": 1,
32
+ "eos_token_id": 2,
33
+ "head_dim": 128,
34
+ "hidden_act": "silu",
35
+ "hidden_size": 5120,
36
+ "initializer_range": 0.02,
37
+ "intermediate_size": 27648,
38
+ "max_position_embeddings": 524288,
39
+ "mlp_bias": false,
40
+ "model_type": "rwkv079qwen3",
41
+ "num_attention_heads": 80,
42
+ "num_hidden_layers": 64,
43
+ "num_key_value_heads": 8,
44
+ "residual_dropout": 0.1,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_scaling": {
47
+ "rope_type": "default"
48
+ },
49
+ "rope_theta": 10000000.0,
50
+ "tie_word_embeddings": false,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.55.0",
53
+ "use_cache": true,
54
+ "vocab_size": 155136
55
+ }
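
The `auto_map` entries above route `AutoConfig`/`AutoModelForCausalLM` to the custom files in this folder, so loading requires `trust_remote_code=True`. A minimal loading and generation sketch, assuming the full folder from this commit is available, `flash-linear-attention` is installed (the modeling file imports it), and the placeholder repo id is replaced:

```python
# Minimal illustrative sketch: load the hybrid checkpoint through auto_map.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "OpenMOSE/<this-repo>"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" above
    trust_remote_code=True,      # resolves RWKV079Qwen3ForCausalLM via auto_map
    device_map="auto",
)

inputs = tokenizer("RWKV is", return_tensors="pt").to(model.device)
# Sampling defaults (temperature, top_p) are picked up from generation_config.json.
out = model.generate(**inputs, max_new_tokens=64, do_sample=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```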
configuration_rwkv079qwen3.py ADDED
@@ -0,0 +1,238 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """RWKV079Qwen3 model configuration"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class RWKV079Qwen3Config(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`RWKV079Qwen3Model`]. It is used to instantiate a
28
+ RWKV079Qwen3 model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of
30
+ Qwen3-7B-beta [Qwen/Qwen3-7B-beta](https://huggingface.co/Qwen/Qwen3-7B-beta).
31
+
32
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
33
+ documentation from [`PretrainedConfig`] for more information.
34
+
35
+
36
+ Args:
37
+ vocab_size (`int`, *optional*, defaults to 151936):
38
+ Vocabulary size of the RWKV079Qwen3 model. Defines the number of different tokens that can be represented by the
39
+ `inputs_ids` passed when calling [`RWKV079Qwen3Model`]
40
+ hidden_size (`int`, *optional*, defaults to 4096):
41
+ Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 22016):
43
+ Dimension of the MLP representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
+ Number of hidden layers in the Transformer encoder.
46
+ num_attention_heads (`int`, *optional*, defaults to 32):
47
+ Number of attention heads for each attention layer in the Transformer encoder.
48
+ num_key_value_heads (`int`, *optional*, defaults to 32):
49
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
50
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
51
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
52
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
53
+ by meanpooling all the original heads within that group. For more details checkout [this
54
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
55
+ lora_rank_decay (`int`, *optional*):
56
+ The rank of the lora used to generate decay.
57
+ lora_rank_iclr (`int`, *optional*):
58
+ The rank of the lora used to generate the in-context learning rate.
59
+ lora_rank_value_residual_mix (`int`, *optional*):
60
+ The rank of the lora used to generate the value residual mix amount.
61
+ lora_rank_gate (`int`, *optional*):
62
+ The rank of the lora used to generate the gate.
63
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
64
+ The non-linear activation function (function or string) in the decoder.
65
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
66
+ The maximum sequence length that this model might ever be used with.
67
+ initializer_range (`float`, *optional*, defaults to 0.02):
68
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
69
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
70
+ The epsilon used by the rms normalization layers.
71
+ use_cache (`bool`, *optional*, defaults to `True`):
72
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
73
+ relevant if `config.is_decoder=True`.
74
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
75
+ Whether the model's input and output word embeddings should be tied.
76
+ rope_theta (`float`, *optional*, defaults to 10000.0):
77
+ The base period of the RoPE embeddings.
78
+ rope_scaling (`Dict`, *optional*):
79
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
80
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
81
+ accordingly.
82
+ Expected contents:
83
+ `rope_type` (`str`):
84
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
85
+ 'llama3'], with 'default' being the original RoPE implementation.
86
+ `factor` (`float`, *optional*):
87
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
88
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
89
+ original maximum pre-trained length.
90
+ `original_max_position_embeddings` (`int`, *optional*):
91
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
92
+ pretraining.
93
+ `attention_factor` (`float`, *optional*):
94
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
95
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
96
+ `factor` field to infer the suggested value.
97
+ `beta_fast` (`float`, *optional*):
98
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
99
+ ramp function. If unspecified, it defaults to 32.
100
+ `beta_slow` (`float`, *optional*):
101
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
102
+ ramp function. If unspecified, it defaults to 1.
103
+ `short_factor` (`List[float]`, *optional*):
104
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
105
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
106
+ size divided by the number of attention heads divided by 2
107
+ `long_factor` (`List[float]`, *optional*):
108
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
109
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
110
+ size divided by the number of attention heads divided by 2
111
+ `low_freq_factor` (`float`, *optional*):
112
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
113
+ `high_freq_factor` (`float`, *optional*):
114
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
115
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
116
+ Whether to use sliding window attention.
117
+ sliding_window (`int`, *optional*, defaults to 4096):
118
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
119
+ max_window_layers (`int`, *optional*, defaults to 28):
120
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
121
+ attention_dropout (`float`, *optional*, defaults to 0.0):
122
+ The dropout ratio for the attention probabilities.
123
+
124
+ ```python
125
+ >>> from transformers import RWKV079Qwen3Model, RWKV079Qwen3Config
126
+
127
+ >>> # Initializing a RWKV079Qwen3 style configuration
128
+ >>> configuration = RWKV079Qwen3Config()
129
+
130
+ >>> # Initializing a model from the RWKV079Qwen3-7B style configuration
131
+ >>> model = RWKV079Qwen3Model(configuration)
132
+
133
+ >>> # Accessing the model configuration
134
+ >>> configuration = model.config
135
+ ```"""
136
+
137
+ model_type = "rwkv079qwen3"
138
+ keys_to_ignore_at_inference = ["past_key_values"]
139
+
140
+ def __init__(
141
+ self,
142
+ vocab_size=151936,
143
+ hidden_size=4096,
144
+ intermediate_size=22016,
145
+ num_hidden_layers=32,
146
+ num_attention_heads=32,
147
+ num_key_value_heads=32,
148
+ lora_rank_tokenshift=None,
149
+ lora_rank_decay=None,
150
+ lora_rank_iclr=None,
151
+ lora_rank_value_residual_mix=None,
152
+ lora_rank_key_residual_mix=None,
153
+ lora_rank_gate=None,
154
+ hidden_act="silu",
155
+ max_position_embeddings=32768,
156
+ initializer_range=0.02,
157
+ rms_norm_eps=1e-6,
158
+ use_cache=True,
159
+ tie_word_embeddings=False,
160
+ use_rope=True,
161
+ rope_theta=10000.0,
162
+ rope_scaling=None,
163
+ use_sliding_window=False,
164
+ sliding_window=4096,
165
+ max_window_layers=28,
166
+ first_attention_layer=9999,
167
+ first_post_attention_layer=9999,
168
+ attention_striping=1,
169
+ last_striping_layer=99999,
170
+ layer_types=None,
171
+ attention_dropout=0.0,
172
+ attention_bias=True,
173
+ attention_output_bias=False,
174
+ gate_rank_type=2,
175
+ balance_state=True,
176
+ groupnorm_att=False,
177
+ use_tokenshift=False,
178
+ **kwargs,
179
+ ):
180
+ self.vocab_size = vocab_size
181
+ self.max_position_embeddings = max_position_embeddings
182
+ self.hidden_size = hidden_size
183
+ self.intermediate_size = intermediate_size
184
+ self.num_hidden_layers = num_hidden_layers
185
+ self.num_attention_heads = num_attention_heads
186
+ self.use_sliding_window = use_sliding_window
187
+ self.sliding_window = sliding_window if use_sliding_window else None
188
+ self.max_window_layers = max_window_layers
189
+ self.first_attention_layer = first_attention_layer
190
+ self.first_post_attention_layer = first_post_attention_layer
191
+ self.attention_striping = attention_striping
192
+ self.last_striping_layer = last_striping_layer
193
+
194
+ # for backward compatibility
195
+ if num_key_value_heads is None:
196
+ num_key_value_heads = num_attention_heads
197
+
198
+ self.num_key_value_heads = num_key_value_heads
199
+ self.lora_rank_tokenshift = lora_rank_tokenshift
200
+ self.lora_rank_decay = lora_rank_decay
201
+ self.lora_rank_iclr = lora_rank_iclr
202
+ self.lora_rank_value_residual_mix = lora_rank_value_residual_mix
+ self.lora_rank_key_residual_mix = lora_rank_key_residual_mix
203
+ self.lora_rank_gate = lora_rank_gate
204
+ self.hidden_act = hidden_act
205
+ self.initializer_range = initializer_range
206
+ self.rms_norm_eps = rms_norm_eps
207
+ self.use_cache = use_cache
208
+ self.use_rope = use_rope
209
+ self.rope_theta = rope_theta
210
+ self.rope_scaling = rope_scaling
211
+ self.attention_dropout = attention_dropout
212
+ # Validate the correctness of rotary position embeddings parameters
213
+ # BC: if there is a 'type' field, move it to 'rope_type'.
214
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
215
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
216
+ rope_config_validation(self)
217
+
218
+ self.layer_types = layer_types
219
+ if self.layer_types is None:
220
+ self.layer_types = [
221
+ "sliding_attention"
222
+ if self.sliding_window is not None and i >= self.max_window_layers
223
+ else "full_attention"
224
+ for i in range(self.num_hidden_layers)
225
+ ]
226
+ layer_type_validation(self.layer_types)
227
+
228
+ self.attention_bias = attention_bias
229
+ self.attention_output_bias = attention_output_bias
230
+ self.gate_rank_type = gate_rank_type
231
+ self.balance_state = balance_state
232
+ self.groupnorm_att = groupnorm_att
233
+ self.use_tokenshift = use_tokenshift
234
+
235
+ super().__init__(
236
+ tie_word_embeddings=tie_word_embeddings,
237
+ **kwargs,
238
+ )
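
A minimal sketch of constructing this configuration with the interleave pattern from config.json. Note that `transformer_layers` is not a named `__init__` argument; it reaches the config through `**kwargs`, and the modeling code's `is_layer_attention()` checks membership in it. The import assumes the file is available locally:

```python
# Minimal illustrative sketch: build the config with the 12-attention / 52-RWKV
# split used by config.json. transformer_layers travels through **kwargs and is
# read by is_layer_attention() in modeling_rwkv079qwen3.py.
from configuration_rwkv079qwen3 import RWKV079Qwen3Config

transformer_layers = [5, 11, 17, 23, 28, 33, 38, 43, 48, 53, 58, 63]
config = RWKV079Qwen3Config(
    vocab_size=155136,
    hidden_size=5120,
    intermediate_size=27648,
    num_hidden_layers=64,
    num_attention_heads=80,
    num_key_value_heads=8,
    head_dim=128,                      # also passed through **kwargs
    lora_rank_decay=448,
    lora_rank_iclr=192,
    lora_rank_value_residual_mix=128,
    lora_rank_key_residual_mix=128,
    lora_rank_gate=576,
    transformer_layers=transformer_layers,
)
rwkv_layers = [i for i in range(config.num_hidden_layers) if i not in transformer_layers]
print(f"{len(transformer_layers)} attention layers, {len(rwkv_layers)} RWKV layers")
```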
generation_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "pad_token_id": 1,
5
+ "eos_token_id": 2,
6
+ "transformers_version": "4.55.0",
7
+ "temperature": 1.1,
8
+ "top_p": 0.95
9
+ }
10
+
modeling_rwkv079qwen3.py ADDED
@@ -0,0 +1,1063 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """
21
+ PyTorch RWKV079Qwen3 model.
22
+ base code from SmerkyG @ recursal.ai, featherless.ai
23
+ hxa079 implementation RWKV079 + NoPE Hybrid Attention
24
+
25
+ """
26
+
27
+ import math
28
+ import inspect
29
+ from typing import List, Optional, Tuple, Union, Dict, Any
30
+
31
+ import torch
32
+ import torch.utils.checkpoint
33
+ from torch import nn
34
+ import torch.nn.functional as F
35
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
36
+
37
+ from transformers.activations import ACT2FN
38
+ from transformers.cache_utils import Cache, DynamicCache, CacheLayerMixin
39
+ from transformers.generation import GenerationMixin
40
+ from transformers.integrations import use_kernel_forward_from_hub
41
+ from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
42
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
43
+ from transformers.modeling_layers import (
44
+ GenericForQuestionAnswering,
45
+ GenericForSequenceClassification,
46
+ GenericForTokenClassification,
47
+ GradientCheckpointingLayer,
48
+ )
49
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
50
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
51
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
52
+ from transformers.processing_utils import Unpack
53
+ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple
54
+ from transformers.utils.generic import check_model_inputs
55
+
56
+ from .configuration_rwkv079qwen3 import RWKV079Qwen3Config
57
+
58
+ from transformers.models.qwen3.modeling_qwen3 import Qwen3DecoderLayer, Qwen3MLP, Qwen3RMSNorm, Qwen3Attention
59
+
60
+ class RWKV079State():
61
+ def __init__(self) -> None:
62
+ #super().__init__()
63
+ self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
64
+ self.layer_kv_states: List[torch.Tensor] = []
65
+ self.layer_shift_states: List[torch.Tensor] = []
66
+ self.cumulative_scores: List[torch.Tensor] = []
67
+ self.sin: List[torch.Tensor] = []
68
+ self.cos: List[torch.Tensor] = []
69
+
70
+ def __getitem__(self, layer_idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
71
+ """
72
+ Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
73
+ sequence length.
74
+ """
75
+ if layer_idx < len(self):
76
+ return (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
77
+ else:
78
+ raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
79
+
80
+ def __iter__(self):
81
+ """
82
+ Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
83
+ keys and values
84
+ """
85
+ for layer_idx in range(len(self)):
86
+ yield (self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx])
87
+
88
+ def __len__(self):
89
+ """
90
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
91
+ to the number of layers in the model.
92
+ """
93
+ return len(self.layer_kv_states)
94
+
95
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
96
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
97
+ # Linear Attention variants do not have a maximum length
98
+ return new_seq_length
99
+
100
+ def reorder_cache(self, beam_idx: torch.LongTensor):
101
+ """Reorders the cache for beam search, given the selected beam indices."""
102
+ raise NotImplementedError('Cannot reorder Linear Attention state')
103
+
104
+ def get_seq_length(self, layer_idx: int = 0) -> int:
105
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
106
+ return self._seen_tokens
107
+
108
+ def get_max_cache_shape(self) -> Optional[int]:
109
+ """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
110
+ return None
111
+
112
+ def get_max_length(self) -> Optional[int]:
113
+ """
114
+ Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length.
115
+ """
116
+ return None
117
+
118
+ def crop(self, max_length: int):
119
+ # can't implement this for linear attention variants
120
+ return
121
+
122
+ def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
123
+ """Return the length and offset of the cache, used to generate the mask"""
124
+ kv_offset = 0
125
+ query_length = cache_position.shape[0]
126
+ past_seen_tokens = self.get_seq_length()
127
+ kv_length = query_length + past_seen_tokens
128
+ return kv_length, kv_offset
129
+
130
+ @property
131
+ def is_compileable(self) -> bool:
132
+ """Return whether the cache is compileable"""
133
+ return True #all(layer.is_compileable for layer in self.layers)
134
+
135
+ @torch.no_grad
136
+ def update(
137
+ self,
138
+ kv_state: torch.Tensor,
139
+ shift_state: torch.Tensor,
140
+ layer_idx: int,
141
+ token_count: int = 0,
142
+ is_attention_layer: bool = True,
143
+ cache_kwargs: Optional[Dict[str, Any]] = None,
144
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
145
+ # Update the number of seen tokens
146
+ if layer_idx == 0:
147
+ if is_attention_layer:
148
+ token_count = kv_state.size(-2)
149
+ self._seen_tokens += token_count
150
+
151
+ #print(f'self._seen_tokens = {self._seen_tokens} layer_idx = {layer_idx} is_attention_layer = {is_attention_layer} kv_state.size(-2) = {kv_state.size(-2)}')
152
+
153
+ # Update the cache
154
+ if kv_state is not None:
155
+ # There may be skipped layers, fill them with empty lists
156
+ if layer_idx >= len(self.layer_kv_states):
157
+ for _ in range(len(self.layer_kv_states), layer_idx):
158
+ if is_attention_layer:
159
+ self.layer_kv_states.append(torch.tensor([], dtype=kv_state.dtype, device=kv_state.device)) # acts as key_cache
160
+ self.layer_shift_states.append(torch.tensor([], dtype=shift_state.dtype, device=shift_state.device)) # acts as value_cache
161
+ else:
162
+ self.layer_kv_states.append(torch.zeros_like(kv_state).requires_grad_(False))
163
+ self.layer_shift_states.append(torch.zeros_like(shift_state).requires_grad_(False))
164
+ self.layer_kv_states.append(kv_state) # acts as key_cache
165
+ self.layer_shift_states.append(shift_state) # acts as value_cache
166
+ else:
167
+ if is_attention_layer:
168
+ self.layer_kv_states[layer_idx] = torch.cat([self.layer_kv_states[layer_idx], kv_state], dim=-2) # acts as key_cache
169
+ self.layer_shift_states[layer_idx] = torch.cat([self.layer_shift_states[layer_idx], shift_state], dim=-2) # acts as value_cache
170
+ else:
171
+ self.layer_kv_states[layer_idx].copy_(kv_state)
172
+ self.layer_shift_states[layer_idx].copy_(shift_state)
173
+
174
+ return self.layer_kv_states[layer_idx], self.layer_shift_states[layer_idx]
175
+
176
+ try:
177
+ from fla.ops.rwkv7.chunk import chunk_rwkv7
178
+ from fla.ops.rwkv7.fused_recurrent import fused_recurrent_rwkv7
179
+ except ImportError:
180
+ print("Required module is not installed. Please install it using the following commands:")
181
+ print("pip install --no-use-pep517 flash-linear-attention")
182
+ print("Additionally, ensure you have at least version 2.2.0 of Triton installed:")
183
+ print("pip install triton>=2.2.0")
184
+
185
+ # def is_layer_attention(config, layer_id):
186
+ # return layer_id >= config.first_attention_layer and layer_id < config.first_post_attention_layer and (layer_id > min(config.num_hidden_layers, config.last_striping_layer) or (min(config.num_hidden_layers-1, config.last_striping_layer) - layer_id) % config.attention_striping == 0)
187
+
188
+ def is_layer_attention(config, layer_id):
189
+ return layer_id in config.transformer_layers
190
+
191
+ def repeat_kv_rwkv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
192
+ """
193
+ Repeat KV heads along the head dimension (GQA).
194
+ Input: (B, T, H_kv, D)
195
+ Output: (B, T, H_kv * n_rep, D)
196
+ """
197
+ B, T, H_kv, D = hidden_states.shape
198
+ if n_rep == 1:
199
+ return hidden_states
200
+ # Expand head dim
201
+ hidden_states = hidden_states[:, :, :, None, :] # (B, T, H_kv, 1, D)
202
+ hidden_states = hidden_states.expand(B, T, H_kv, n_rep, D) # (B, T, H_kv, n_rep, D)
203
+ return hidden_states.reshape(B, T, H_kv * n_rep, D).contiguous()
204
+
205
+ def T5RMSNorm(hidden_states,weight,variance_epsilon:float=1e-6):
206
+ input_dtype = hidden_states.dtype
207
+ hidden_states = hidden_states.to(torch.float32)
208
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
209
+ hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
210
+ return (weight * hidden_states).to(input_dtype)
211
+
212
+ def compute_qwen3_rope_cache(seq_len, rotary_dim, device, dtype, rope_theta):
213
+ half_dim = rotary_dim // 2
214
+ freq_seq = torch.arange(half_dim, dtype=dtype, device=device)
215
+ inv_freq = 1.0 / (rope_theta ** (freq_seq / half_dim))
216
+ positions = torch.arange(seq_len, dtype=dtype, device=device)
217
+ freqs = torch.einsum("i,j->ij", positions, inv_freq)
218
+ emb = torch.cat([freqs, freqs], dim=-1)
219
+ cos = emb.cos()
220
+ sin = emb.sin()
221
+ return cos.unsqueeze(0), sin.unsqueeze(0), inv_freq
222
+
223
+ # def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
224
+ # """Applies Rotary Position Embedding to the query and key tensors.
225
+
226
+ # Args:
227
+ # q (`torch.Tensor`): The query tensor.
228
+ # k (`torch.Tensor`): The key tensor.
229
+ # cos (`torch.Tensor`): The cosine part of the rotary embedding.
230
+ # sin (`torch.Tensor`): The sine part of the rotary embedding.
231
+ # position_ids (`torch.Tensor`, *optional*):
232
+ # Deprecated and unused.
233
+ # unsqueeze_dim (`int`, *optional*, defaults to 1):
234
+ # The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
235
+ # sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
236
+ # that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
237
+ # k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
238
+ # cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
239
+ # the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
240
+ # Returns:
241
+ # `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
242
+ # """
243
+ # cos = cos.unsqueeze(unsqueeze_dim)
244
+ # sin = sin.unsqueeze(unsqueeze_dim)
245
+ # q_embed = (q * cos) + (rotate_half(q) * sin)
246
+ # k_embed = (k * cos) + (rotate_half(k) * sin)
247
+ # return q_embed, k_embed
248
+
249
+ class Qwen3RotaryEmbedding(nn.Module):
250
+ def __init__(self, config: RWKV079Qwen3Config, device=None):
251
+ super().__init__()
252
+ # BC: "rope_type" was originally "type"
253
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
254
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
255
+ else:
256
+ self.rope_type = "default"
257
+ self.max_seq_len_cached = config.max_position_embeddings
258
+ self.original_max_seq_len = config.max_position_embeddings
259
+
260
+ self.config = config
261
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
262
+
263
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
264
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
265
+ self.original_inv_freq = self.inv_freq
266
+
267
+ def _dynamic_frequency_update(self, position_ids, device):
268
+ """
269
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
270
+ 1 - growing beyond the cached sequence length (allow scaling)
271
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
272
+ """
273
+ seq_len = torch.max(position_ids) + 1
274
+ if seq_len > self.max_seq_len_cached: # growth
275
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len)
276
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
277
+ self.max_seq_len_cached = seq_len
278
+
279
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
280
+ # This .to() is needed if the model has been moved to a device after being initialized (because
281
+ # the buffer is automatically moved, but not the original copy)
282
+ self.original_inv_freq = self.original_inv_freq.to(device)
283
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
284
+ self.max_seq_len_cached = self.original_max_seq_len
285
+
286
+ @torch.no_grad()
287
+ def forward(self, x, position_ids):
288
+ if "dynamic" in self.rope_type:
289
+ self._dynamic_frequency_update(position_ids, device=x.device)
290
+
291
+ # Core RoPE block
292
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
293
+ position_ids_expanded = position_ids[:, None, :].float()
294
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
295
+ device_type = x.device.type
296
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
297
+ with torch.autocast(device_type=device_type, enabled=False):
298
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
299
+ emb = torch.cat((freqs, freqs), dim=-1)
300
+ cos = emb.cos()
301
+ sin = emb.sin()
302
+
303
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
304
+ cos = cos * self.attention_scaling
305
+ sin = sin * self.attention_scaling
306
+
307
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
308
+
309
+ def rms_norm(hidden_states, eps = 1e-6):
310
+ #print('ugyuugyu')
311
+ input_dtype = hidden_states.dtype
312
+ hidden_states = hidden_states.to(torch.float32)
313
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
314
+ hidden_states = hidden_states * torch.rsqrt(variance + eps)
315
+ return hidden_states.to(input_dtype)
316
+
317
+ def generate_rotary_embedding(max_seqlen:int, dim:int, theta:float = 10000.0, scale:float = 1):
318
+ #inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float).to(device) / dim))
319
+
320
+ angular_velocity = theta ** -(torch.arange(0, dim, 2, dtype=torch.float) / dim) / scale # frequencies from 1.0 ... 1/theta
321
+ angles = torch.outer(torch.arange(max_seqlen), angular_velocity)
322
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
323
+ emb = torch.cat((angles, angles), dim=-1)
324
+ return torch.stack([emb.cos(), emb.sin()], dim=0)
325
+ #return torch.polar(torch.ones_like(angles), angles)
326
+
327
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
328
+ def rotate_half(x):
329
+ """Rotates half the hidden dims of the input."""
330
+ x1 = x[..., : x.shape[-1] // 2]
331
+ x2 = x[..., x.shape[-1] // 2 :]
332
+ return torch.cat((-x2, x1), dim=-1)
333
+
334
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
335
+ """Applies Rotary Position Embedding to the query and key tensors.
336
+
337
+ Args:
338
+ q (`torch.Tensor`): The query tensor.
339
+ k (`torch.Tensor`): The key tensor.
340
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
341
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
342
+ position_ids (`torch.Tensor`, *optional*):
343
+ Deprecated and unused.
344
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
345
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
346
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
347
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
348
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
349
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
350
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
351
+ Returns:
352
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
353
+ """
354
+ cos = cos.unsqueeze(unsqueeze_dim)
355
+ sin = sin.unsqueeze(unsqueeze_dim)
356
+ q_embed = (q * cos) + (rotate_half(q) * sin)
357
+ k_embed = (k * cos) + (rotate_half(k) * sin)
358
+ return q_embed, k_embed
359
+
360
+ def apply_rotary_pos_emb_single(x, cos, sin, unsqueeze_dim=1):
361
+ return (x * cos.unsqueeze(unsqueeze_dim)) + (rotate_half(x) * sin.unsqueeze(unsqueeze_dim))
362
+
363
+ from typing import Callable, Optional, Tuple, Union
364
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
365
+ from transformers.processing_utils import Unpack
366
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
367
+
368
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
369
+ """
370
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
371
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
372
+ """
373
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
374
+ if n_rep == 1:
375
+ return hidden_states
376
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
377
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
378
+
379
+ def eager_attention_forward(
380
+ module: nn.Module,
381
+ query: torch.Tensor,
382
+ key: torch.Tensor,
383
+ value: torch.Tensor,
384
+ attention_mask: Optional[torch.Tensor],
385
+ scaling: float,
386
+ dropout: float = 0.0,
387
+ **kwargs,
388
+ ):
389
+ key_states = repeat_kv(key, module.num_key_value_groups)
390
+ value_states = repeat_kv(value, module.num_key_value_groups)
391
+
392
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
393
+ if attention_mask is not None:
394
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
395
+ attn_weights = attn_weights + causal_mask
396
+
397
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
398
+ attn_weights = attn_weights.masked_fill(attn_weights.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
399
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
400
+ attn_output = torch.matmul(attn_weights, value_states)
401
+ attn_output = attn_output.transpose(1, 2).contiguous()
402
+
403
+ return attn_output, attn_weights
404
+
405
+ from torch.nn.attention.flex_attention import create_block_mask, flex_attention, create_mask
406
+ from functools import lru_cache
407
+
408
+ block_mask = None
409
+
410
+
411
+
412
+ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
413
+ is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
414
+ L, S = query.size(-2), key.size(-2)
415
+ scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
416
+ attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
417
+ if is_causal:
418
+ assert attn_mask is None
419
+ temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
420
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
421
+ attn_bias.to(query.dtype)
422
+
423
+ if attn_mask is not None:
424
+ if attn_mask.dtype == torch.bool:
425
+ attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
426
+ else:
427
+ attn_bias = attn_mask + attn_bias
428
+
429
+ if enable_gqa:
430
+ key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
431
+ value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
432
+
433
+ attn_weight = query.float() @ key.float().transpose(-2, -1) * scale_factor
434
+ attn_weight += attn_bias.float()
435
+ #attn_weight = stable_softmax(attn_weight, dim=-1)
436
+ attn_weight = torch.softmax(attn_weight, dim=-1)
437
+ attn_weight = attn_weight.masked_fill(attn_weight.isnan(), 0) # IMPORTANT FOR BATCHED INFERENCE IN LM EVAL!
438
+ #attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
439
+ return attn_weight @ value.float()
440
+
441
+
442
+
443
+ class Qwen3AttentionNoPE_Causal(Qwen3Attention):
444
+ def forward(
445
+ self,
446
+ hidden_states: torch.Tensor,
447
+ frozen_residual: torch.Tensor,
448
+ v_first: Optional[torch.Tensor] = None,
449
+ k_first: Optional[torch.Tensor] = None,
450
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
451
+ attention_mask: Optional[torch.Tensor] = None,
452
+ past_key_values: Optional[Cache] = None,
453
+ cache_position: Optional[torch.LongTensor] = None,
454
+ **kwargs: Unpack[FlashAttentionKwargs],
455
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
456
+ x = hidden_states
457
+
458
+ B, L, D = x.size()
459
+
460
+ input_shape = x.shape[:-1]
461
+ hidden_shape = (*input_shape, -1, self.head_dim)
462
+
463
+ q = self.q_proj(x).view(hidden_shape).transpose(1, 2)
464
+ k = self.k_proj(x).view(hidden_shape).transpose(1, 2)
465
+ v = self.v_proj(x).view(hidden_shape).transpose(1, 2)
466
+
467
+ if past_key_values is not None:
468
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
469
+ cache_kwargs = {"cache_position": cache_position}
470
+ k, v = past_key_values.update(k, v, self.layer_idx, cache_kwargs)
471
+
472
+ # repeat k/v heads if n_kv_heads < n_heads
473
+ k = repeat_kv(k, self.num_key_value_groups)
474
+ v = repeat_kv(v, self.num_key_value_groups)
475
+
476
+ S = k.size(-2)
477
+
478
+ y = nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=0.0, attn_mask=attention_mask, is_causal=attention_mask is None and L==S)
479
+ y = y.transpose(1,2)
480
+ y = y.reshape(*input_shape, -1)#.contiguous()
481
+ y = self.o_proj(y)
482
+
483
+ attn_weights = None
484
+
485
+ return y, v_first, k_first
486
+
487
+
488
+ class RWKV079Attention(nn.Module):
489
+ def __init__(self, config, layer_idx: Optional[int] = None):
490
+ super().__init__()
491
+ self.config = config
492
+ self.layer_idx = layer_idx
493
+ C = self.hidden_size = config.hidden_size
494
+ H = self.num_heads = config.num_attention_heads
495
+ H_kv = config.num_key_value_heads
496
+ N = self.head_dim = getattr(config, 'head_dim', self.hidden_size // self.num_heads)
497
+ self.num_key_value_heads = config.num_key_value_heads
498
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
499
+ self.attention_dropout = config.attention_dropout
500
+
501
+ if self.hidden_size % self.num_heads != 0:
502
+ raise ValueError(
503
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
504
+ f" and `num_heads`: {self.num_heads})."
505
+ )
506
+ self.receptance = nn.Linear(
507
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
508
+ )
509
+ self.key = nn.Linear(
510
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
511
+ )
512
+ self.value = nn.Linear(
513
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
514
+ )
515
+ self.output = nn.Linear(
516
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
517
+ )
518
+ #self.r_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
519
+ #self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape
520
+
521
+
522
+ lora_rank_decay = config.lora_rank_decay
523
+ lora_rank_iclr = config.lora_rank_iclr
524
+ lora_rank_value_residual_mix = config.lora_rank_value_residual_mix
525
+ lora_rank_key_residual_mix = config.lora_rank_key_residual_mix
526
+ lora_rank_gate = config.lora_rank_gate
527
+
528
+ print(f"lora_rank_value_residual_mix = {lora_rank_value_residual_mix} lora_rank_key_residual_mix={lora_rank_key_residual_mix}")
529
+
530
+
531
+ self.w0 = nn.Parameter(torch.empty(1,1,H*N))
532
+ self.w1 = nn.Parameter(torch.empty(C, lora_rank_decay))
533
+ self.w2 = nn.Parameter(torch.empty(lora_rank_decay, H*N))
534
+
535
+ self.a0 = nn.Parameter(torch.empty(1,1,H*N))
536
+ self.a1 = nn.Parameter(torch.empty(C, lora_rank_iclr))
537
+ self.a2 = nn.Parameter(torch.empty(lora_rank_iclr, H*N))
538
+
539
+ #if layer_idx > 0:
540
+ self.v0 = nn.Parameter(torch.empty(1,1,H_kv*N))
541
+ self.v1 = nn.Parameter(torch.empty(C, lora_rank_value_residual_mix))
542
+ self.v2 = nn.Parameter(torch.empty(lora_rank_value_residual_mix, H_kv*N))
543
+
544
+ self.k0 = nn.Parameter(torch.empty(1,1,H_kv*N))
545
+ self.k1 = nn.Parameter(torch.empty(C, lora_rank_key_residual_mix))
546
+ self.k2 = nn.Parameter(torch.empty(lora_rank_key_residual_mix, H_kv*N))
547
+
548
+
549
+ self.g1 = nn.Parameter(torch.empty(C, lora_rank_gate))
550
+ self.g2 = nn.Parameter(torch.empty(lora_rank_gate, H*N))
551
+
552
+ self.r_k = nn.Parameter(torch.empty(H,N))
553
+
554
+
555
+ def forward(
556
+ self,
557
+ hidden_states: torch.Tensor,
558
+ frozen_residual: torch.Tensor,
559
+ v_first: Optional[torch.Tensor] = None,
560
+ k_first: Optional[torch.Tensor] = None,
561
+ attention_mask: Optional[torch.Tensor] = None,
562
+ position_ids: Optional[torch.LongTensor] = None,
563
+ past_key_values: Optional[RWKV079State] = None,
564
+ output_attentions: bool = False,
565
+ use_cache: bool = False,
566
+ cache_position: Optional[torch.LongTensor] = None,
567
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
568
+ **kwargs,
569
+ ):
570
+ if attention_mask is not None:
571
+ assert len(attention_mask.shape) in (2, 4)
572
+
573
+ output_shift_state = hidden_states[:, -1:].detach().clone()
574
+
575
+ x = hidden_states
576
+
577
+ B, T, C = hidden_states.shape
578
+ H = self.num_heads
579
+ N = self.head_dim
580
+
581
+ q_len = T
582
+
583
+ if use_cache and past_key_values is not None and len(past_key_values) > self.layer_idx:
584
+ #print(f'use past state layer {self.layer_idx}')
585
+ input_vk_state, input_shift_state = past_key_values[self.layer_idx]
586
+ else:
587
+ input_vk_state, input_shift_state = torch.zeros(B,H,N,N, dtype=torch.bfloat16,device=x.device), torch.zeros_like(x[:, -1:])
588
+
589
+ xr = xw = xk = xv = xa = xg = x
590
+
591
+ r = self.receptance(xr).view(B,T,-1,N)
592
+ w = -F.softplus(-(self.w0 + torch.tanh(xw @ self.w1) @ self.w2)) -0.5
593
+ k = self.key(xk).view(B,T,-1,N)
594
+ v = self.value(xv).view(B,T,-1,N)
595
+ a = torch.sigmoid(self.a0 + (xa @ self.a1) @ self.a2)
596
+ g = torch.sigmoid(xg @ self.g1) @ self.g2
597
+
598
+ if position_embeddings is not None:
599
+ cos, sin = position_embeddings
600
+ r, k = apply_rotary_pos_emb(r, k, cos, sin, unsqueeze_dim=2)
601
+
602
+
603
+
604
+ if self.layer_idx == 0:
605
+ v_first = v # store the v of the first layer
606
+ k_first = k # store the k of the first layer
607
+ else:
608
+ v = v + (v_first - v) * torch.sigmoid(self.v0 + (x @ self.v1) @ self.v2).view(B,T,self.num_key_value_heads,-1) # add value residual
609
+ k = k + (k_first - k) * torch.sigmoid(self.k0 + (x @ self.k1) @ self.k2).view(B,T,self.num_key_value_heads,-1) # add key residual
610
+
611
+ # dealing with left-padding
612
+ # if attention_mask is not None:
613
+ # if len(attention_mask.shape) == 2:
614
+ # v = v * attention_mask[:, -v.shape[-2]:, None]
615
+ # elif len(attention_mask.shape) == 4:
616
+ # v = v * attention_mask[:, -1, -1, -v.shape[-2]:].view(B, T, 1)
617
+ # #v = v * attention_mask[:, :, -1, -v.shape[-2]:, None]
618
+ if attention_mask is not None:
619
+ if attention_mask is not None:
620
+ if attention_mask.ndim == 2:
621
+ # [B, S]
622
+ mask = attention_mask[:, -T:] # [B, T]
623
+ v = v * mask[:, :, None, None] # expand mask to [B, T, 1, 1] and multiply
624
+ elif attention_mask.ndim == 4:
625
+ # [B, 1, L, S]
626
+ mask = attention_mask[:, 0, -1, -T:] # [B, T]
627
+ v = v * mask[:, :, None, None] # same as above
628
+
629
+
630
+ # repeat k/v heads if n_kv_heads < n_heads
631
+ # k = k.view(B, T, -1, 1, self.head_dim).expand(-1, -1, -1, self.num_key_value_groups, -1).reshape(B, T, -1)
632
+ # v = v.view(B, T, -1, 1, self.head_dim).expand(-1, -1, -1, self.num_key_value_groups, -1).reshape(B, T, -1)
633
+ k = repeat_kv_rwkv(k, self.num_key_value_groups).view(B, T, -1)
634
+ v = repeat_kv_rwkv(v, self.num_key_value_groups).view(B, T, -1)
635
+ dropout_rate = 0.0 if not self.training else self.attention_dropout
636
+
637
+ kk = F.normalize(k.view(B,T,H,-1), dim=-1, p=2.0).view(B,T,-1)
638
+ k = k * (1.0 - w + a)
639
+
640
+ aa = -kk
641
+ bb = kk * a
642
+ w = -w.exp()
643
+
644
+
645
+
646
+ r_,w_,k_,v_,aa_,bb_ = [i.view(B,T,H,N) for i in [r,w,k,v,aa,bb]]
647
+
648
+ #print(f'r shape = {r_.shape}')
649
+
650
+ # if self.layer_idx == 0:
651
+ # print(f'input_vk_state sum = {torch.sum(input_vk_state)}')
652
+
653
+ #x, output_vk_state = fused_recurrent_rwkv7(r_, w_, k_, v_, aa_, bb_, initial_state=input_vk_state, output_final_state=use_cache)
654
+ x, output_vk_state = fused_recurrent_rwkv7(r_, w_, k_, v_, aa_, bb_, scale=1.0, initial_state=input_vk_state, output_final_state=True, head_first=False)
655
+
656
+ # if self.layer_idx == 0:
657
+ # print(f'output_vk_state sum = {torch.sum(output_vk_state)}')
658
+
659
+ x = x.view(B,T,-1) * (float(N) ** -0.5)
660
+
661
+ x = x + ((r.view(B,T,H,-1)*k.view(B,T,H,-1)*self.r_k).sum(dim=-1, keepdim=True) * v.view(B,T,H,-1)).view(B,T,-1)
662
+
663
+
664
+
665
+
666
+ x = x * g
667
+ x = self.output(x)
668
+
669
+ if past_key_values is not None:
670
+ past_key_values.update(output_vk_state, output_shift_state, self.layer_idx, q_len, is_layer_attention(self.config, self.layer_idx))
671
+
672
+ return x, v_first, k_first
673
+
674
+ class RWKV079Qwen3DecoderLayer(nn.Module):
675
+ def __init__(self, config: RWKV079Qwen3Config, layer_idx: int):
676
+ nn.Module.__init__(self)
677
+ self.hidden_size = config.hidden_size
678
+ self.layer_idx = layer_idx
679
+
680
+ if is_layer_attention(config, layer_idx):
681
+ print(f'layer {layer_idx} : attention')
682
+ att_fn = Qwen3AttentionNoPE_Causal #Qwen3KeyQuant #Qwen3SWAPrefill #Qwen3DropoutSWASink #Qwen3AttentionNoPE #Qwen3MOBA #Qwen3AttentionVerticalSparse # Qwen3DoubleAttention # Qwen3SymPow #Qwen3Chunk #Qwen3Power #Qwen3MOBA #Qwen3Attention # Qwen3NewAttention # Qwen3AttentionAdapted
683
+ else:
684
+ print(f'layer {layer_idx} : rwkv')
685
+ att_fn = RWKV079Attention
686
+
687
+ self.self_attn = att_fn(config, layer_idx)
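+ # hybrid layout: layers flagged by is_layer_attention() keep softmax attention, all other layers use the RWKV079 time mixer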
688
+
689
+ self.mlp = Qwen3MLP(config)
690
+ self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
691
+ self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
692
+ self.attention_type = config.layer_types[layer_idx]
693
+
694
+ def forward(
695
+ self,
696
+ hidden_states: torch.Tensor,
697
+ frozen_residual: torch.Tensor,
698
+ v_first: Optional[torch.Tensor],
699
+ k_first: Optional[torch.Tensor],
700
+ attention_mask: Optional[torch.Tensor] = None,
701
+ position_ids: Optional[torch.LongTensor] = None,
702
+ past_key_values: Optional[Cache] = None,
703
+ output_attentions: Optional[bool] = False,
704
+ use_cache: Optional[bool] = False,
705
+ cache_position: Optional[torch.LongTensor] = None,
706
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
707
+ **kwargs,
708
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
709
+ residual = hidden_states
710
+
711
+ hidden_states = self.input_layernorm(hidden_states)
712
+
713
+ # Self Attention
714
+ hidden_states, v_first, k_first = self.self_attn(
715
+ hidden_states=hidden_states,
716
+ frozen_residual=frozen_residual,
717
+ v_first=v_first,
718
+ k_first=k_first,
719
+ attention_mask=attention_mask,
720
+ position_ids=position_ids,
721
+ past_key_values=past_key_values,
722
+ output_attentions=output_attentions,
723
+ use_cache=use_cache,
724
+ cache_position=cache_position,
725
+ position_embeddings=position_embeddings,
726
+ #is_causal=True,
727
+ )
728
+
729
+ hidden_states = residual + hidden_states
730
+
731
+ # Fully Connected
732
+ residual = hidden_states
733
+ hidden_states = self.post_attention_layernorm(hidden_states)
734
+ hidden_states = self.mlp(hidden_states)
735
+ hidden_states = residual + hidden_states
736
+
737
+ outputs = (hidden_states, v_first,k_first,)
738
+
739
+ if output_attentions:
740
+ outputs += (None,) # attention weights are not returned by self_attn here
741
+
742
+ return outputs
743
+
744
+
745
+ @auto_docstring
746
+ class RWKV079Qwen3PreTrainedModel(PreTrainedModel):
747
+ config: RWKV079Qwen3Config
748
+ config_class = RWKV079Qwen3Config
749
+ base_model_prefix = "model"
750
+ supports_gradient_checkpointing = True
751
+ _no_split_modules = ["RWKV079Qwen3DecoderLayer"]
752
+ _skip_keys_device_placement = "past_key_values"
753
+ _supports_flash_attn_2 = True
754
+ _supports_sdpa = True
755
+ _supports_flex_attn = True
756
+
757
+ _supports_cache_class = True
758
+ _supports_quantized_cache = True
759
+ _supports_static_cache = True
760
+
761
+ # def _init_weights(self, module):
762
+ # std = self.config.initializer_range
763
+ # if isinstance(module, nn.Linear):
764
+ # module.weight.data.normal_(mean=0.0, std=std)
765
+ # if module.bias is not None:
766
+ # module.bias.data.zero_()
767
+ # elif isinstance(module, nn.Embedding):
768
+ # module.weight.data.normal_(mean=0.0, std=std)
769
+ # if module.padding_idx is not None:
770
+ # module.weight.data[module.padding_idx].zero_()
771
+
772
+ @auto_docstring
773
+ class RWKV079Qwen3Model(RWKV079Qwen3PreTrainedModel):
774
+ """
775
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`RWKV079Qwen3DecoderLayer`]
776
+
777
+ Args:
778
+ config: RWKV079Qwen3Config
779
+ """
780
+
781
+ def __init__(self, config: RWKV079Qwen3Config):
782
+ super().__init__(config)
783
+ self.padding_idx = config.pad_token_id
784
+ self.vocab_size = config.vocab_size
785
+
786
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
787
+ self.layers = nn.ModuleList(
788
+ [RWKV079Qwen3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
789
+ )
790
+ self.norm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
791
+ self.rotary_emb = Qwen3RotaryEmbedding(config=config)
792
+ self.gradient_checkpointing = False
793
+ self.has_sliding_layers = "sliding_attention" in self.config.layer_types
794
+
795
+ # Initialize weights and apply final processing
796
+ self.post_init()
797
+
798
+ #@check_model_inputs
799
+ @auto_docstring
800
+ def forward(
801
+ self,
802
+ input_ids: Optional[torch.LongTensor] = None,
803
+ attention_mask: Optional[torch.Tensor] = None,
804
+ position_ids: Optional[torch.LongTensor] = None,
805
+ past_key_values: Optional[Cache] = None,
806
+ inputs_embeds: Optional[torch.FloatTensor] = None,
807
+ use_cache: Optional[bool] = None,
808
+ output_attentions: Optional[bool] = None,
809
+ output_hidden_states: Optional[bool] = None,
810
+ cache_position: Optional[torch.LongTensor] = None,
811
+ **kwargs: Unpack[TransformersKwargs],
812
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
813
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
814
+ output_hidden_states = (
815
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
816
+ )
817
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
818
+
819
+ if (input_ids is None) ^ (inputs_embeds is not None):
820
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
821
+
822
+ if self.gradient_checkpointing and self.training and use_cache:
823
+ logger.warning_once(
824
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
825
+ )
826
+ use_cache = False
827
+
828
+ if inputs_embeds is None:
829
+ inputs_embeds = self.embed_tokens(input_ids)
830
+
831
+ if use_cache and not isinstance(past_key_values, RWKV079State):
832
+ past_key_values = RWKV079State()
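+ # RWKV079State (presumably defined earlier in this file) replaces the standard KV cache and carries the per-layer recurrent vk/shift states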
833
+
834
+ if cache_position is None:
835
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
836
+ cache_position = torch.arange(
837
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
838
+ )
839
+
840
+ if position_ids is None:
841
+ position_ids = cache_position.unsqueeze(0)
842
+
843
+ # It may already have been prepared by e.g. `generate`
844
+ if not isinstance(causal_mask_mapping := attention_mask, dict):
845
+ # Prepare mask arguments
846
+ mask_kwargs = {
847
+ "config": self.config,
848
+ "input_embeds": inputs_embeds,
849
+ "attention_mask": attention_mask,
850
+ "cache_position": cache_position,
851
+ "past_key_values": past_key_values,
852
+ "position_ids": position_ids,
853
+ }
854
+ # Create the masks
855
+ causal_mask_mapping = {
856
+ "full_attention": create_causal_mask(**mask_kwargs),
857
+ }
858
+ # The sliding window alternating layers are not always activated depending on the config
859
+ if self.has_sliding_layers:
860
+ causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)
861
+
862
+ hidden_states = inputs_embeds
863
+
864
+ # create position embeddings to be shared across the decoder layers
865
+ if self.config.use_rope:
866
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
867
+ else:
868
+ position_embeddings = None
869
+
870
+ # decoder layers
871
+ all_hidden_states = () if output_hidden_states else None
872
+ all_self_attns = () if output_attentions else None
873
+ next_decoder_cache = None
874
+ v_first = None
875
+ k_first = None
876
+ frozen_residual = None
877
+
878
+ for decoder_layer in self.layers:
879
+ if not is_layer_attention(self.config, decoder_layer.layer_idx):
880
+ frozen_residual = hidden_states#rms_norm(hidden_states)
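+ # snapshot of the hidden states entering the most recent RWKV layer; passed to every layer as frozen_residual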
881
+ if output_hidden_states:
882
+ all_hidden_states += (hidden_states,)
883
+
884
+ attention_mask = causal_mask_mapping[decoder_layer.attention_type]
885
+ if attention_mask is not None and attention_mask.ndim == 1:
886
+ attention_mask = None
887
+ #attention_mask = None
888
+
889
+ layer_outputs = decoder_layer(
890
+ hidden_states,
891
+ frozen_residual=frozen_residual,
892
+ attention_mask=attention_mask,
893
+ position_ids=position_ids,
894
+ past_key_values=past_key_values,
895
+ output_attentions=output_attentions,
896
+ use_cache=use_cache,
897
+ cache_position=cache_position,
898
+ position_embeddings=position_embeddings,
899
+ v_first=v_first,
900
+ k_first=k_first
901
+ )
902
+
903
+ hidden_states = layer_outputs[0]
904
+ v_first = layer_outputs[1]
905
+ k_first = layer_outputs[2]
906
+
907
+ if output_attentions:
908
+ all_self_attns += (layer_outputs[-1],) # placeholder; per-layer attention weights are not returned
909
+
910
+ hidden_states = self.norm(hidden_states)
911
+
912
+ # add hidden states from the last decoder layer
913
+ if output_hidden_states:
914
+ all_hidden_states += (hidden_states,)
915
+
916
+ #if return_legacy_cache:
917
+ # next_cache = next_cache.to_legacy_cache()
918
+
919
+ return BaseModelOutputWithPast(
920
+ last_hidden_state=hidden_states,
921
+ past_key_values=past_key_values if use_cache else None,
922
+ hidden_states=all_hidden_states,
923
+ attentions=all_self_attns,
924
+ )
925
+
926
+ class RWKV079Qwen3ForCausalLM(RWKV079Qwen3PreTrainedModel, GenerationMixin):
927
+ _tied_weights_keys = ["lm_head.weight"]
928
+
929
+ def __init__(self, config):
930
+ super().__init__(config)
931
+ self.model = RWKV079Qwen3Model(config)
932
+ self.vocab_size = config.vocab_size
933
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
934
+
935
+ # Initialize weights and apply final processing
936
+ self.post_init()
937
+
938
+ @can_return_tuple
939
+ @auto_docstring
940
+ def forward(
941
+ self,
942
+ input_ids: torch.LongTensor = None,
943
+ attention_mask: Optional[torch.Tensor] = None,
944
+ position_ids: Optional[torch.LongTensor] = None,
945
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
946
+ inputs_embeds: Optional[torch.FloatTensor] = None,
947
+ labels: Optional[torch.LongTensor] = None,
948
+ use_cache: Optional[bool] = None,
949
+ output_attentions: Optional[bool] = None,
950
+ output_hidden_states: Optional[bool] = None,
951
+ cache_position: Optional[torch.LongTensor] = None,
952
+ logits_to_keep: Union[int, torch.Tensor] = 0,
953
+ **loss_kwargs,
954
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
955
+ r"""
956
+ Args:
957
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
958
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
959
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
960
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
961
+
962
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
964
+ Calculate logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
964
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
965
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
966
+
967
+ Returns:
968
+
969
+ Example:
970
+
971
+ ```python
972
+ >>> from transformers import AutoTokenizer, RWKV079Qwen3ForCausalLM
973
+
974
+ >>> model = RWKV079Qwen3ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
975
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
976
+
977
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
978
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
979
+
980
+ >>> # Generate
981
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
982
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
983
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
984
+ ```"""
985
+
986
+ # # run the prefill only up to the last token, then run one more for the actual result
987
+ # # we do this so that called code doesn't have to handle the dichotomy specially and can just check for L==1
988
+ # for i in range(2):
989
+ # all_but_one = max(1, input_ids.size(-1)-1)
990
+ # iid = input_ids[..., i*all_but_one:(i+1)*all_but_one]
991
+ # if iid.size(-1) == 0:
992
+ # continue
993
+ # pids = position_ids
994
+ # if pids is not None:
995
+ # pids = position_ids[..., i*all_but_one:(i+1)*all_but_one]
996
+ # cp = cache_position
997
+ # if cp is not None:
998
+ # cp = cache_position[..., i*all_but_one:(i+1)*all_but_one]
999
+ # rv = self.forward_inner(iid, attention_mask=attention_mask, position_ids=pids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, cache_position=cp, num_logits_to_keep=num_logits_to_keep, **loss_kwargs)
1000
+ # past_key_values = rv.past_key_values
1001
+ # return rv
1002
+
1003
+ # def forward_inner(
1004
+ # self,
1005
+ # input_ids: torch.LongTensor = None,
1006
+ # attention_mask: Optional[torch.Tensor] = None,
1007
+ # position_ids: Optional[torch.LongTensor] = None,
1008
+ # past_key_values: Optional[List[torch.FloatTensor]] = None,
1009
+ # inputs_embeds: Optional[torch.FloatTensor] = None,
1010
+ # labels: Optional[torch.LongTensor] = None,
1011
+ # use_cache: Optional[bool] = None,
1012
+ # output_attentions: Optional[bool] = None,
1013
+ # output_hidden_states: Optional[bool] = None,
1014
+ # cache_position: Optional[torch.LongTensor] = None,
1015
+ # num_logits_to_keep: int = 0,
1016
+ # **loss_kwargs,
1017
+ # ) -> Union[Tuple, CausalLMOutputWithPast]:
1018
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1019
+ output_hidden_states = (
1020
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1021
+ )
1022
+
1023
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1024
+ outputs = self.model(
1025
+ input_ids=input_ids,
1026
+ attention_mask=attention_mask,
1027
+ position_ids=position_ids,
1028
+ past_key_values=past_key_values,
1029
+ inputs_embeds=inputs_embeds,
1030
+ use_cache=use_cache,
1031
+ output_attentions=output_attentions,
1032
+ output_hidden_states=output_hidden_states,
1033
+ cache_position=cache_position,
1034
+ )
1035
+
1036
+ hidden_states = outputs.last_hidden_state
1037
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1038
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1039
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
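+ # logits_to_keep == 0 gives slice(-0, None), i.e. logits for every position; generation typically requests only the final position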
1040
+
1041
+ loss = None
1042
+ if labels is not None:
1043
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.vocab_size, **loss_kwargs)
1044
+
1045
+ return CausalLMOutputWithPast(
1046
+ loss=loss,
1047
+ logits=logits,
1048
+ past_key_values=outputs.past_key_values,
1049
+ hidden_states=outputs.hidden_states,
1050
+ attentions=outputs.attentions,
1051
+ )
1052
+
1053
+ @auto_docstring
1054
+ class RWKV079Qwen3ForSequenceClassification(RWKV079Qwen3PreTrainedModel):
1055
+ pass
1056
+
1057
+ @auto_docstring
1058
+ class RWKV079Qwen3ForTokenClassification(RWKV079Qwen3PreTrainedModel):
1059
+ pass
1060
+
1061
+ @auto_docstring
1062
+ class RWKV079Qwen3ForQuestionAnswering(RWKV079Qwen3PreTrainedModel):
1063
+ base_model_prefix = "transformer" # For BC, where `transformer` was used instead of `model`
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<seed:bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<seed:eos>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<seed:pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenization_rwkv079qwen3.py ADDED
@@ -0,0 +1,4 @@
1
+ from transformers.models.qwen3.tokenization_qwen3 import Qwen3Tokenizer
2
+
3
+ class RWKV6Qwen3Tokenizer(Qwen3Tokenizer):
4
+ pass
tokenization_rwkv079qwen3_fast.py ADDED
@@ -0,0 +1,4 @@
1
+ from transformers.models.qwen3.tokenization_qwen3_fast import Qwen3TokenizerFast
2
+
3
+ class RWKV6Qwen3TokenizerFast(Qwen3TokenizerFast):
4
+ pass
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6bd848f52451824a3033a9f1e67eea5b399a13c90f845a332d3a29537e05827
3
+ size 11883696
tokenizer_config.json ADDED
@@ -0,0 +1,1035 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<seed:bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<seed:pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<seed:eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<seed:think>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "4": {
36
+ "content": "</seed:think>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "5": {
44
+ "content": "<seed:cot_budget_reflect>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "6": {
52
+ "content": "</seed:cot_budget_reflect>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "7": {
60
+ "content": "<seed:tool_call>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "8": {
68
+ "content": "</seed:tool_call>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "9": {
76
+ "content": "<[PLHD9_never_used]>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<[PLHD10_never_used]>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<[PLHD11_never_used]>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<[PLHD12_never_used]>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<[PLHD13_never_used]>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "<[PLHD14_never_used]>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "<[PLHD15_never_used]>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "16": {
132
+ "content": "<[PLHD16_never_used]>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "17": {
140
+ "content": "<[PLHD17_never_used]>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "18": {
148
+ "content": "<[PLHD18_never_used]>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "19": {
156
+ "content": "<[PLHD19_never_used]>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20": {
164
+ "content": "<[PLHD20_never_used]>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "21": {
172
+ "content": "<[PLHD21_never_used]>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "22": {
180
+ "content": "<[PLHD22_never_used]>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "23": {
188
+ "content": "<[PLHD23_never_used]>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "24": {
196
+ "content": "<[PLHD24_never_used]>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "25": {
204
+ "content": "<[PLHD25_never_used]>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "26": {
212
+ "content": "<[PLHD26_never_used]>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "27": {
220
+ "content": "<[PLHD27_never_used]>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "28": {
228
+ "content": "<[PLHD28_never_used]>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "29": {
236
+ "content": "<[PLHD29_never_used]>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "30": {
244
+ "content": "<[PLHD30_never_used]>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "31": {
252
+ "content": "<[PLHD31_never_used]>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "32": {
260
+ "content": "<[PLHD32_never_used]>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "33": {
268
+ "content": "<[PLHD33_never_used]>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "34": {
276
+ "content": "<[PLHD34_never_used]>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "35": {
284
+ "content": "<[PLHD35_never_used]>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "36": {
292
+ "content": "<[PLHD36_never_used]>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "37": {
300
+ "content": "<[PLHD37_never_used]>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "38": {
308
+ "content": "<[PLHD38_never_used]>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "39": {
316
+ "content": "<[PLHD39_never_used]>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "40": {
324
+ "content": "<[PLHD40_never_used]>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "41": {
332
+ "content": "<[PLHD41_never_used]>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "42": {
340
+ "content": "<[PLHD42_never_used]>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "43": {
348
+ "content": "<[PLHD43_never_used]>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "44": {
356
+ "content": "<[PLHD44_never_used]>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "45": {
364
+ "content": "<[PLHD45_never_used]>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "46": {
372
+ "content": "<[PLHD46_never_used]>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "47": {
380
+ "content": "<[PLHD47_never_used]>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "48": {
388
+ "content": "<[PLHD48_never_used]>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "49": {
396
+ "content": "<[PLHD49_never_used]>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "50": {
404
+ "content": "<[PLHD50_never_used]>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "51": {
412
+ "content": "<[PLHD51_never_used]>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "52": {
420
+ "content": "<[PLHD52_never_used]>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "53": {
428
+ "content": "<[PLHD53_never_used]>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "54": {
436
+ "content": "<[PLHD54_never_used]>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "55": {
444
+ "content": "<[PLHD55_never_used]>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "56": {
452
+ "content": "<[PLHD56_never_used]>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "57": {
460
+ "content": "<[PLHD57_never_used]>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "58": {
468
+ "content": "<[PLHD58_never_used]>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "59": {
476
+ "content": "<[PLHD59_never_used]>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "60": {
484
+ "content": "<[PLHD60_never_used]>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "61": {
492
+ "content": "<[PLHD61_never_used]>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "62": {
500
+ "content": "<[PLHD62_never_used]>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "63": {
508
+ "content": "<[PLHD63_never_used]>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "64": {
516
+ "content": "<[PLHD64_never_used]>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "65": {
524
+ "content": "<[PLHD65_never_used]>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "66": {
532
+ "content": "<[PLHD66_never_used]>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "67": {
540
+ "content": "<[PLHD67_never_used]>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "68": {
548
+ "content": "<[PLHD68_never_used]>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "69": {
556
+ "content": "<[PLHD69_never_used]>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "70": {
564
+ "content": "<[PLHD70_never_used]>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "71": {
572
+ "content": "<[PLHD71_never_used]>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": "<[PLHD72_never_used]>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": "<[PLHD73_never_used]>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "74": {
596
+ "content": "<[PLHD74_never_used]>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "75": {
604
+ "content": "<[PLHD75_never_used]>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "76": {
612
+ "content": "<[PLHD76_never_used]>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "77": {
620
+ "content": "<[PLHD77_never_used]>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "78": {
628
+ "content": "<[PLHD78_never_used]>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "79": {
636
+ "content": "<[PLHD79_never_used]>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "80": {
644
+ "content": "<[PLHD80_never_used]>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "81": {
652
+ "content": "<[PLHD81_never_used]>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "82": {
660
+ "content": "<[PLHD82_never_used]>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "83": {
668
+ "content": "<[PLHD83_never_used]>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "84": {
676
+ "content": "<[PLHD84_never_used]>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "85": {
684
+ "content": "<[PLHD85_never_used]>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "86": {
692
+ "content": "<[PLHD86_never_used]>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "87": {
700
+ "content": "<[PLHD87_never_used]>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "88": {
708
+ "content": "<[PLHD88_never_used]>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "89": {
716
+ "content": "<[PLHD89_never_used]>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "90": {
724
+ "content": "<[PLHD90_never_used]>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "91": {
732
+ "content": "<[PLHD91_never_used]>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "92": {
740
+ "content": "<[PLHD92_never_used]>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "93": {
748
+ "content": "<[PLHD93_never_used]>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "94": {
756
+ "content": "<[PLHD94_never_used]>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "95": {
764
+ "content": "<[PLHD95_never_used]>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "96": {
772
+ "content": "<[PLHD96_never_used]>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "97": {
780
+ "content": "<[PLHD97_never_used]>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "98": {
788
+ "content": "<[PLHD98_never_used]>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "99": {
796
+ "content": "<[PLHD99_never_used]>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "100": {
804
+ "content": "<[PLHD100_never_used]>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "101": {
812
+ "content": "<[PLHD101_never_used]>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "102": {
820
+ "content": "<[PLHD102_never_used]>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "103": {
828
+ "content": "<[PLHD103_never_used]>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "104": {
836
+ "content": "<[PLHD104_never_used]>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "105": {
844
+ "content": "<[PLHD105_never_used]>",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "106": {
852
+ "content": "<[PLHD106_never_used]>",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ },
859
+ "107": {
860
+ "content": "<[PLHD107_never_used]>",
861
+ "lstrip": false,
862
+ "normalized": false,
863
+ "rstrip": false,
864
+ "single_word": false,
865
+ "special": true
866
+ },
867
+ "108": {
868
+ "content": "<[PLHD108_never_used]>",
869
+ "lstrip": false,
870
+ "normalized": false,
871
+ "rstrip": false,
872
+ "single_word": false,
873
+ "special": true
874
+ },
875
+ "109": {
876
+ "content": "<[PLHD109_never_used]>",
877
+ "lstrip": false,
878
+ "normalized": false,
879
+ "rstrip": false,
880
+ "single_word": false,
881
+ "special": true
882
+ },
883
+ "110": {
884
+ "content": "<[PLHD110_never_used]>",
885
+ "lstrip": false,
886
+ "normalized": false,
887
+ "rstrip": false,
888
+ "single_word": false,
889
+ "special": true
890
+ },
891
+ "111": {
892
+ "content": "<[PLHD111_never_used]>",
893
+ "lstrip": false,
894
+ "normalized": false,
895
+ "rstrip": false,
896
+ "single_word": false,
897
+ "special": true
898
+ },
899
+ "112": {
900
+ "content": "<[PLHD112_never_used]>",
901
+ "lstrip": false,
902
+ "normalized": false,
903
+ "rstrip": false,
904
+ "single_word": false,
905
+ "special": true
906
+ },
907
+ "113": {
908
+ "content": "<[PLHD113_never_used]>",
909
+ "lstrip": false,
910
+ "normalized": false,
911
+ "rstrip": false,
912
+ "single_word": false,
913
+ "special": true
914
+ },
915
+ "114": {
916
+ "content": "<[PLHD114_never_used]>",
917
+ "lstrip": false,
918
+ "normalized": false,
919
+ "rstrip": false,
920
+ "single_word": false,
921
+ "special": true
922
+ },
923
+ "115": {
924
+ "content": "<[PLHD115_never_used]>",
925
+ "lstrip": false,
926
+ "normalized": false,
927
+ "rstrip": false,
928
+ "single_word": false,
929
+ "special": true
930
+ },
931
+ "116": {
932
+ "content": "<[PLHD116_never_used]>",
933
+ "lstrip": false,
934
+ "normalized": false,
935
+ "rstrip": false,
936
+ "single_word": false,
937
+ "special": true
938
+ },
939
+ "117": {
940
+ "content": "<[PLHD117_never_used]>",
941
+ "lstrip": false,
942
+ "normalized": false,
943
+ "rstrip": false,
944
+ "single_word": false,
945
+ "special": true
946
+ },
947
+ "118": {
948
+ "content": "<[PLHD118_never_used]>",
949
+ "lstrip": false,
950
+ "normalized": false,
951
+ "rstrip": false,
952
+ "single_word": false,
953
+ "special": true
954
+ },
955
+ "119": {
956
+ "content": "<[PLHD119_never_used]>",
957
+ "lstrip": false,
958
+ "normalized": false,
959
+ "rstrip": false,
960
+ "single_word": false,
961
+ "special": true
962
+ },
963
+ "120": {
964
+ "content": "<[PLHD120_never_used]>",
965
+ "lstrip": false,
966
+ "normalized": false,
967
+ "rstrip": false,
968
+ "single_word": false,
969
+ "special": true
970
+ },
971
+ "121": {
972
+ "content": "<[PLHD121_never_used]>",
973
+ "lstrip": false,
974
+ "normalized": false,
975
+ "rstrip": false,
976
+ "single_word": false,
977
+ "special": true
978
+ },
979
+ "122": {
980
+ "content": "<[PLHD122_never_used]>",
981
+ "lstrip": false,
982
+ "normalized": false,
983
+ "rstrip": false,
984
+ "single_word": false,
985
+ "special": true
986
+ },
987
+ "123": {
988
+ "content": "<[PLHD123_never_used]>",
989
+ "lstrip": false,
990
+ "normalized": false,
991
+ "rstrip": false,
992
+ "single_word": false,
993
+ "special": true
994
+ },
995
+ "124": {
996
+ "content": "<[PLHD124_never_used]>",
997
+ "lstrip": false,
998
+ "normalized": false,
999
+ "rstrip": false,
1000
+ "single_word": false,
1001
+ "special": true
1002
+ },
1003
+ "125": {
1004
+ "content": "<[PLHD125_never_used]>",
1005
+ "lstrip": false,
1006
+ "normalized": false,
1007
+ "rstrip": false,
1008
+ "single_word": false,
1009
+ "special": true
1010
+ },
1011
+ "126": {
1012
+ "content": "<[PLHD126_never_used]>",
1013
+ "lstrip": false,
1014
+ "normalized": false,
1015
+ "rstrip": false,
1016
+ "single_word": false,
1017
+ "special": true
1018
+ },
1019
+ "127": {
1020
+ "content": "<[PLHD127_never_used]>",
1021
+ "lstrip": false,
1022
+ "normalized": false,
1023
+ "rstrip": false,
1024
+ "single_word": false,
1025
+ "special": true
1026
+ }
1027
+ },
1028
+ "bos_token": "<seed:bos>",
1029
+ "clean_up_tokenization_spaces": false,
1030
+ "eos_token": "<seed:eos>",
1031
+ "extra_special_tokens": {},
1032
+ "model_max_length": 1000000000000000019884624838656,
1033
+ "pad_token": "<seed:pad>",
1034
+ "tokenizer_class": "PreTrainedTokenizerFast"
1035
+ }