Training in progress, step 1000

Browse files

Files changed (6) hide show

README.md +1 -1
adapter_config.json +4 -11
adapter_model.safetensors +2 -2
chat_template.jinja +14 -18
modelopt_state_train.pth +2 -2
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -27,7 +27,7 @@ print(output["generated_text"])
 ## Training procedure
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/g-puca1-deloitte/llmv3/runs/pnlds90a)
 This model was trained with SFT.

 ## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/g-puca1-deloitte/llmv3/runs/16peb9vx)
 This model was trained with SFT.

adapter_config.json CHANGED Viewed

@@ -21,23 +21,16 @@
   "modules_to_save": null,
   "peft_type": "LORA",
   "qalora_group_size": 16,
-  "r": 8,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
-    "k_proj",
     "o_proj",
-    "v_proj"
-  ],
-  "target_parameters": [
-    "0.mlp.experts.gate_up_proj",
-    "1.mlp.experts.gate_up_proj",
-    "2.mlp.experts.gate_up_proj",
-    "0.mlp.experts.down_proj",
-    "1.mlp.experts.down_proj",
-    "2.mlp.experts.down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
   "use_dora": false,

   "modules_to_save": null,
   "peft_type": "LORA",
   "qalora_group_size": 16,
+  "r": 16,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
+    "v_proj",
     "o_proj",
+    "k_proj"
   ],
+  "target_parameters": null,
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
   "use_dora": false,

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db8e80ad25838200548d87734c6554a995dda35e6d85fb155943e80f810d86b0
-size 200875760

 version https://git-lfs.github.com/spec/v1
+oid sha256:1239e51a13cafbcbab37833817367f0a56d17ebdad8ef4d9d66a95452b0fa2ed
+size 47814376

chat_template.jinja CHANGED Viewed

@@ -3,9 +3,8 @@
   following kwargs:
   - "builtin_tools": A list, can contain "browser" and/or "python".
   - "model_identity": A string that optionally describes the model identity.
-  - "reasoning_effort": A string that describes the reasoning effort, defaults to "low".
  #}
 {#- Tool Definition Rendering ============================================== #}
 {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
     {%- if param_spec.type == "array" -%}
@@ -81,7 +80,6 @@
         {{- "number" }}
     {%- elif param_spec.type == "boolean" -%}
         {{- "boolean" }}
     {%- elif param_spec.type == "object" -%}
         {%- if param_spec.properties -%}
             {{- "{\n" }}
@@ -104,7 +102,6 @@
         {{- "any" }}
     {%- endif -%}
 {%- endmacro -%}
 {%- macro render_tool_namespace(namespace_name, tools) -%}
     {{- "## " + namespace_name + "\n\n" }}
     {{- "namespace " + namespace_name + " {\n\n" }}
@@ -146,7 +143,6 @@
     {%- endfor %}
     {{- "} // namespace " + namespace_name }}
 {%- endmacro -%}
 {%- macro render_builtin_tools(browser_tool, python_tool) -%}
     {%- if browser_tool %}
         {{- "## browser\n\n" }}
@@ -184,14 +180,12 @@
         {{- "}) => any;\n\n" }}
         {{- "} // namespace browser\n\n" }}
     {%- endif -%}
     {%- if python_tool %}
         {{- "## python\n\n" }}
         {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
         {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
     {%- endif -%}
 {%- endmacro -%}
 {#- System Message Construction ============================================ #}
 {%- macro build_system_message() -%}
     {%- if model_identity is not defined %}
@@ -201,7 +195,7 @@
     {{- "Knowledge cutoff: 2024-06\n" }}
     {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
     {%- if reasoning_effort is not defined %}
-        {%- set reasoning_effort = "low" %}
     {%- endif %}
     {{- "Reasoning: " + reasoning_effort + "\n\n" }}
     {%- if builtin_tools %}
@@ -221,15 +215,12 @@
         {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
     {%- endif -%}
 {%- endmacro -%}
 {#- Main Template Logic ================================================= #}
 {#- Set defaults #}
 {#- Render system message #}
 {{- "<|start|>system<|message|>" }}
 {{- build_system_message() }}
 {{- "<|end|>" }}
 {#- Extract developer message #}
 {%- if messages[0].role == "developer" or messages[0].role == "system" %}
     {%- set developer_message = messages[0].content %}
@@ -238,7 +229,6 @@
     {%- set developer_message = "" %}
     {%- set loop_messages = messages %}
 {%- endif %}
 {#- Render developer message #}
 {%- if developer_message or tools %}
     {{- "<|start|>developer<|message|>" }}
@@ -253,7 +243,6 @@
     {%- endif -%}
     {{- "<|end|>" }}
 {%- endif %}
 {#- Render messages #}
 {%- set last_tool_call = namespace(name=none) %}
 {%- for message in loop_messages -%}
@@ -286,12 +275,14 @@
             {%- if tool_call.function %}
                 {%- set tool_call = tool_call.function %}
             {%- endif %}
-            {%- if message.content and message.thinking %}
-                {{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }}
-            {%- elif message.content and not future_final_message.found %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
             {%- elif message.thinking and not future_final_message.found %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
             {%- endif %}
             {{- "<|start|>assistant to=" }}
             {{- "functions." + tool_call.name + "<|channel|>commentary " }}
@@ -303,15 +294,21 @@
             {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
             {#- This is a situation that should only occur in training, never in inference. #}
             {%- if "thinking" in message %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
             {%- endif %}
             {#- <|return|> indicates the end of generation, but <|end|> does not #}
             {#- <|return|> should never be an input to the model, but we include it as the final token #}
             {#- when training, so the model learns to emit it. #}
             {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
         {%- else %}
             {#- CoT is dropped during all previous turns, so we never render it for inference #}
             {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
             {%- set last_tool_call.name = none %}
         {%- endif %}
     {%- elif message.role == 'tool' -%}
@@ -324,8 +321,7 @@
         {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
     {%- endif -%}
 {%- endfor -%}
 {#- Generation prompt #}
 {%- if add_generation_prompt -%}
 <|start|>assistant
-{%- endif -%}

   following kwargs:
   - "builtin_tools": A list, can contain "browser" and/or "python".
   - "model_identity": A string that optionally describes the model identity.
+  - "reasoning_effort": A string that describes the reasoning effort, defaults to "none".
  #}
 {#- Tool Definition Rendering ============================================== #}
 {%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
     {%- if param_spec.type == "array" -%}
         {{- "number" }}
     {%- elif param_spec.type == "boolean" -%}
         {{- "boolean" }}
     {%- elif param_spec.type == "object" -%}
         {%- if param_spec.properties -%}
             {{- "{\n" }}
         {{- "any" }}
     {%- endif -%}
 {%- endmacro -%}
 {%- macro render_tool_namespace(namespace_name, tools) -%}
     {{- "## " + namespace_name + "\n\n" }}
     {{- "namespace " + namespace_name + " {\n\n" }}
     {%- endfor %}
     {{- "} // namespace " + namespace_name }}
 {%- endmacro -%}
 {%- macro render_builtin_tools(browser_tool, python_tool) -%}
     {%- if browser_tool %}
         {{- "## browser\n\n" }}
         {{- "}) => any;\n\n" }}
         {{- "} // namespace browser\n\n" }}
     {%- endif -%}
     {%- if python_tool %}
         {{- "## python\n\n" }}
         {{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
         {{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
     {%- endif -%}
 {%- endmacro -%}
 {#- System Message Construction ============================================ #}
 {%- macro build_system_message() -%}
     {%- if model_identity is not defined %}
     {{- "Knowledge cutoff: 2024-06\n" }}
     {{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
     {%- if reasoning_effort is not defined %}
+        {%- set reasoning_effort = "none" %}
     {%- endif %}
     {{- "Reasoning: " + reasoning_effort + "\n\n" }}
     {%- if builtin_tools %}
         {{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
     {%- endif -%}
 {%- endmacro -%}
 {#- Main Template Logic ================================================= #}
 {#- Set defaults #}
 {#- Render system message #}
 {{- "<|start|>system<|message|>" }}
 {{- build_system_message() }}
 {{- "<|end|>" }}
 {#- Extract developer message #}
 {%- if messages[0].role == "developer" or messages[0].role == "system" %}
     {%- set developer_message = messages[0].content %}
     {%- set developer_message = "" %}
     {%- set loop_messages = messages %}
 {%- endif %}
 {#- Render developer message #}
 {%- if developer_message or tools %}
     {{- "<|start|>developer<|message|>" }}
     {%- endif -%}
     {{- "<|end|>" }}
 {%- endif %}
 {#- Render messages #}
 {%- set last_tool_call = namespace(name=none) %}
 {%- for message in loop_messages -%}
             {%- if tool_call.function %}
                 {%- set tool_call = tool_call.function %}
             {%- endif %}
+            {#- Analysis text emitted before a tool call: message.content if set, otherwise message.thinking. #}
+            {#- The generation tags carry "-" like every other tag here, so the newline/indent around them is not rendered. #}
+            {%- if message.content and not future_final_message.found %}
+                {%- generation %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
+                {%- endgeneration %}
             {%- elif message.thinking and not future_final_message.found %}
+                {%- generation %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+                {%- endgeneration %}
             {%- endif %}
             {{- "<|start|>assistant to=" }}
             {{- "functions." + tool_call.name + "<|channel|>commentary " }}
             {#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
             {#- This is a situation that should only occur in training, never in inference. #}
             {%- if "thinking" in message %}
+                {% generation %}
                 {{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
+                {% endgeneration %}
             {%- endif %}
             {#- <|return|> indicates the end of generation, but <|end|> does not #}
             {#- <|return|> should never be an input to the model, but we include it as the final token #}
             {#- when training, so the model learns to emit it. #}
+            {% generation %}
             {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
+            {% endgeneration %}
         {%- else %}
             {#- CoT is dropped during all previous turns, so we never render it for inference #}
+            {#- "-" on the generation tags keeps their surrounding newline/indent out of the rendered text. #}
+            {%- generation %}
             {{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
+            {%- endgeneration %}
             {%- set last_tool_call.name = none %}
         {%- endif %}
     {%- elif message.role == 'tool' -%}
         {{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
     {%- endif -%}
 {%- endfor -%}
 {#- Generation prompt #}
 {%- if add_generation_prompt -%}
 <|start|>assistant
+{%- endif -%}

modelopt_state_train.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7707130df73908b8251b8c5fca69b6bd6172d478c6a28a381d392cfba049ac35
-size 994683

 version https://git-lfs.github.com/spec/v1
+oid sha256:543a2762eb7388bc5b2aad89dbe66a41728b3efab046c4145d3788188ed452e1
+size 975611

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f82c343f61be8fad671fe7a01a7ddab16d3700c0b2aaa050705732b34953997b
 size 6353

 version https://git-lfs.github.com/spec/v1
+oid sha256:96e67e32bc5b130bd9dc87c8487c8030dd801ddfefd04c3cf3dfdc3195b19e55
 size 6353