Robotics · LeRobot · Safetensors · smolvla

Commit dcefd19 (verified) · committed by godnpeter · 1 parent: 7d639c5

Upload policy weights, train config and readme

Files changed (4):
  1. README.md +63 -0
  2. config.json +94 -0
  3. model.safetensors +3 -0
  4. train_config.json +361 -0
README.md ADDED
@@ -0,0 +1,63 @@
+ ---
+ base_model: lerobot/smolvla_base
+ datasets: godnpeter/aopoli-lv-libero_combined_no_noops_lerobot_v21
+ library_name: lerobot
+ license: apache-2.0
+ model_name: smolvla
+ pipeline_tag: robotics
+ tags:
+ - robotics
+ - lerobot
+ - smolvla
+ ---
+
+ # Model Card for smolvla
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+ [SmolVLA](https://huggingface.co/papers/2506.01844) is a compact, efficient vision-language-action model that achieves competitive performance at reduced computational costs and can be deployed on consumer-grade hardware.
+
+
+ This policy has been trained and pushed to the Hub using [LeRobot](https://github.com/huggingface/lerobot).
+ See the full documentation at [LeRobot Docs](https://huggingface.co/docs/lerobot/index).
+
+ ---
+
+ ## How to Get Started with the Model
+
+ For a complete walkthrough, see the [training guide](https://huggingface.co/docs/lerobot/il_robots#train-a-policy).
+ Below is a short version of how to train the policy and run inference/evaluation:
+
+ ### Train from scratch
+
+ ```bash
+ lerobot-train \
+ --dataset.repo_id=${HF_USER}/<dataset> \
+ --policy.type=smolvla \
+ --output_dir=outputs/train/<desired_policy_repo_id> \
+ --job_name=lerobot_training \
+ --policy.device=cuda \
+ --policy.repo_id=${HF_USER}/<desired_policy_repo_id> \
+ --wandb.enable=true
+ ```
+
+ _Writes checkpoints to `outputs/train/<desired_policy_repo_id>/checkpoints/`._
+
+ ### Evaluate the policy / run inference
+
+ ```bash
+ lerobot-record \
+ --robot.type=so100_follower \
+ --dataset.repo_id=<hf_user>/eval_<dataset> \
+ --policy.path=<hf_user>/<desired_policy_repo_id> \
+ --episodes=10
+ ```
+
+ Prefix the dataset repo id with **eval\_** and point `--policy.path` to a local or Hub checkpoint.
+
+ ---
+
+ ## Model Details
+
+ - **License:** apache-2.0
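
In addition to the CLI commands above, the checkpoint can be loaded directly in Python. The snippet below is a minimal sketch, assuming a recent LeRobot install where `SmolVLAPolicy` is importable from `lerobot.policies.smolvla.modeling_smolvla` (the import path has changed between releases), that images are passed as channel-first float tensors in [0, 1], and using the placeholder repo id from the examples above.

```python
# Minimal inference sketch (assumptions: recent LeRobot, CUDA available,
# channel-first float images in [0, 1]; "<hf_user>/<desired_policy_repo_id>"
# is the placeholder from the README, not a real repo id).
import torch

from lerobot.policies.smolvla.modeling_smolvla import SmolVLAPolicy

policy = SmolVLAPolicy.from_pretrained("<hf_user>/<desired_policy_repo_id>")
policy.to("cuda").eval()

# Dummy batch matching config.json: two 256x256 RGB cameras, an 8-dim state,
# and a language instruction (one of the LIBERO task prompts).
batch = {
    "observation.images.image": torch.rand(1, 3, 256, 256, device="cuda"),
    "observation.images.wrist_image": torch.rand(1, 3, 256, 256, device="cuda"),
    "observation.state": torch.rand(1, 8, device="cuda"),
    "task": ["pick up the black bowl and place it on the plate"],
}

with torch.inference_mode():
    action = policy.select_action(batch)  # expected shape: (1, 7)
print(action.shape)
```

Each `select_action` call should return one step drawn from the 50-step action chunk configured in `config.json` (`chunk_size` / `n_action_steps` = 50).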
config.json ADDED
@@ -0,0 +1,94 @@
+ {
+ "type": "smolvla",
+ "n_obs_steps": 1,
+ "normalization_mapping": {
+ "VISUAL": "IDENTITY",
+ "STATE": "MEAN_STD",
+ "ACTION": "MEAN_STD"
+ },
+ "input_features": {
+ "observation.images.wrist_image": {
+ "type": "VISUAL",
+ "shape": [
+ 256,
+ 256,
+ 3
+ ]
+ },
+ "observation.images.image": {
+ "type": "VISUAL",
+ "shape": [
+ 256,
+ 256,
+ 3
+ ]
+ },
+ "observation.state": {
+ "type": "STATE",
+ "shape": [
+ 8
+ ]
+ }
+ },
+ "output_features": {
+ "action": {
+ "type": "ACTION",
+ "shape": [
+ 7
+ ]
+ }
+ },
+ "device": "cuda",
+ "use_amp": false,
+ "use_peft": false,
+ "push_to_hub": true,
+ "repo_id": "combined_frozen_chunk50_noproprio_unified_text_prompt_fullvlm_1010",
+ "private": null,
+ "tags": null,
+ "license": null,
+ "use_proprio": false,
+ "chunk_size": 50,
+ "n_action_steps": 50,
+ "normalize_visual": "identity",
+ "normalize_state": "mean_std",
+ "normalize_action": "mean_std",
+ "max_state_dim": 32,
+ "max_action_dim": 32,
+ "resize_imgs_with_padding": [
+ 512,
+ 512
+ ],
+ "empty_cameras": 0,
+ "adapt_to_pi_aloha": false,
+ "use_delta_joint_actions_aloha": false,
+ "tokenizer_max_length": 48,
+ "num_steps": 10,
+ "use_cache": true,
+ "freeze_vision_encoder": true,
+ "train_expert_only": true,
+ "train_state_proj": true,
+ "optimizer_lr": 0.0001,
+ "optimizer_betas": [
+ 0.9,
+ 0.95
+ ],
+ "optimizer_eps": 1e-08,
+ "optimizer_weight_decay": 1e-10,
+ "optimizer_grad_clip_norm": 10,
+ "scheduler_warmup_steps": 1000,
+ "scheduler_decay_steps": 30000,
+ "scheduler_decay_lr": 2.5e-06,
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+ "load_vlm_weights": true,
+ "vlm_model_dtype": "bfloat16",
+ "add_image_special_tokens": false,
+ "attention_mode": "cross_attn",
+ "prefix_length": -1,
+ "pad_language_to": "longest",
+ "num_expert_layers": -1,
+ "num_vlm_layers": -1,
+ "self_attn_every_n_layers": 2,
+ "expert_width_multiplier": 0.75,
+ "min_period": 0.004,
+ "max_period": 4.0
+ }
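
The configuration above fully specifies the SmolVLA policy: two 256×256 RGB cameras, an 8-dim state, 7-dim actions, and a frozen SmolVLM2-500M backbone with a 0.75× action expert. A minimal sketch for fetching and inspecting it programmatically, assuming the `huggingface_hub` package and the placeholder repo id from the README:

```python
# Sketch: download config.json from the Hub and print a few key fields
# ("<hf_user>/<desired_policy_repo_id>" is a placeholder, not a real repo id).
import json

from huggingface_hub import hf_hub_download

path = hf_hub_download("<hf_user>/<desired_policy_repo_id>", "config.json")
with open(path) as f:
    cfg = json.load(f)

# e.g. policy type, action chunking, and the VLM backbone
print(cfg["type"], cfg["chunk_size"], cfg["n_action_steps"], cfg["vlm_model_name"])
```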
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32c202a408aba1283f05a752c6811aa83a792e30aa72067f4dce4d95b10ff5fd
+ size 1421156816
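
`model.safetensors` is stored as a Git LFS pointer; the actual weights file is about 1.4 GB. A minimal sketch for downloading it and listing tensor names without loading the full weights, assuming the `safetensors` and `huggingface_hub` packages and the same placeholder repo id:

```python
# Sketch: download model.safetensors and inspect its tensor names
# (placeholder repo id; reads only metadata, not the full tensors).
from huggingface_hub import hf_hub_download
from safetensors import safe_open

path = hf_hub_download("<hf_user>/<desired_policy_repo_id>", "model.safetensors")
with safe_open(path, framework="pt") as f:
    names = list(f.keys())

print(f"{len(names)} tensors, e.g. {names[:3]}")
```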
train_config.json ADDED
@@ -0,0 +1,361 @@
+ {
+ "dataset": {
+ "repo_id": "godnpeter/aopoli-lv-libero_combined_no_noops_lerobot_v21",
+ "use_all_local_repos": false,
+ "root": null,
+ "episodes": null,
+ "image_transforms": {
+ "enable": false,
+ "max_num_transforms": 3,
+ "random_order": false,
+ "tfs": {
+ "brightness": {
+ "weight": 1.0,
+ "type": "ColorJitter",
+ "kwargs": {
+ "brightness": [
+ 0.8,
+ 1.2
+ ]
+ }
+ },
+ "contrast": {
+ "weight": 1.0,
+ "type": "ColorJitter",
+ "kwargs": {
+ "contrast": [
+ 0.8,
+ 1.2
+ ]
+ }
+ },
+ "saturation": {
+ "weight": 1.0,
+ "type": "ColorJitter",
+ "kwargs": {
+ "saturation": [
+ 0.5,
+ 1.5
+ ]
+ }
+ },
+ "hue": {
+ "weight": 1.0,
+ "type": "ColorJitter",
+ "kwargs": {
+ "hue": [
+ -0.05,
+ 0.05
+ ]
+ }
+ },
+ "sharpness": {
+ "weight": 1.0,
+ "type": "SharpnessJitter",
+ "kwargs": {
+ "sharpness": [
+ 0.5,
+ 1.5
+ ]
+ }
+ },
+ "shift": {
+ "weight": 0.0,
+ "type": "RandomShift",
+ "kwargs": {
+ "max_shift": 8,
+ "padding_mode": "edge"
+ }
+ }
+ }
+ },
+ "text_transform": {
+ "enable": true,
+ "rewrite_map_path": "/fsx/dongyoonhwang/lerobot_raw/modified_libero_prompts/unify_text_prompt.yaml",
+ "case_insensitive": true,
+ "random_choice": true,
+ "rewrite_map": {
+ "pick up the black bowl between the plate and the ramekin and place it on the plate": [
+ "pick up the black bowl between the plate and the ramekin and place it on the plate"
+ ],
+ "pick up the black bowl next to the ramekin and place it on the plate": [
+ "pick up the black bowl next to the ramekin and place it on the plate"
+ ],
+ "pick up the black bowl from table center and place it on the plate": [
+ "pick up the black bowl from the table center and place it on the plate"
+ ],
+ "pick up the black bowl on the cookie box and place it on the plate": [
+ "pick up the black bowl on the cookie box and place it on the plate"
+ ],
+ "pick up the black bowl in the top drawer of the wooden cabinet and place it on the plate": [
+ "pick up the black bowl in the top drawer of the wooden cabinet and place it on the plate"
+ ],
+ "pick up the black bowl on the ramekin and place it on the plate": [
+ "pick up the black bowl on the ramekin and place it on the plate"
+ ],
+ "pick up the black bowl next to the cookie box and place it on the plate": [
+ "pick up the black bowl next to the cookie box and place it on the plate"
+ ],
+ "pick up the black bowl on the stove and place it on the plate": [
+ "pick up the black bowl on the stove and place it on the plate"
+ ],
+ "pick up the black bowl next to the plate and place it on the plate": [
+ "pick up the black bowl next to the plate and place it on the plate"
+ ],
+ "pick up the black bowl on the wooden cabinet and place it on the plate": [
+ "pick up the black bowl on the wooden cabinet and place it on the plate"
+ ],
+ "pick up the alphabet soup and place it in the basket": [
+ "pick up the alphabet soup can and place it in the basket"
+ ],
+ "pick up the cream cheese and place it in the basket": [
+ "pick up the cream cheese box and place it in the basket"
+ ],
+ "pick up the salad dressing and place it in the basket": [
+ "pick up the salad dressing bottle and place it in the basket"
+ ],
+ "pick up the bbq sauce and place it in the basket": [
+ "pick up the bbq sauce bottle and place it in the basket"
+ ],
+ "pick up the ketchup and place it in the basket": [
+ "pick up the ketchup bottle and place it in the basket"
+ ],
+ "pick up the tomato sauce and place it in the basket": [
+ "pick up the tomato sauce can and place it in the basket"
+ ],
+ "pick up the butter and place it in the basket": [
+ "pick up the butter stick and place it in the basket"
+ ],
+ "pick up the milk and place it in the basket": [
+ "pick up the milk carton and place it in the basket"
+ ],
+ "pick up the chocolate pudding and place it in the basket": [
+ "pick up the chocolate pudding cup and place it in the basket"
+ ],
+ "pick up the orange juice and place it in the basket": [
+ "pick up the orange juice carton and place it in the basket"
+ ],
+ "open the middle drawer of the cabinet": [
+ "open the middle drawer of the cabinet"
+ ],
+ "put the bowl on the stove": [
+ "pick up the black bowl and place it on the stove"
+ ],
+ "put the wine bottle on top of the cabinet": [
+ "pick up the wine bottle and place it on top of the cabinet"
+ ],
+ "open the top drawer and put the bowl inside": [
+ "open the top drawer then pick up the black bowl and place it inside"
+ ],
+ "put the bowl on top of the cabinet": [
+ "pick up the black bowl and place it on top of the cabinet"
+ ],
+ "push the plate to the front of the stove": [
+ "push the plate to the front of the stove"
+ ],
+ "put the cream cheese in the bowl": [
+ "pick up the cream cheese box and place it in the black bowl"
+ ],
+ "turn on the stove": [
+ "turn on the stove"
+ ],
+ "put the bowl on the plate": [
+ "pick up the black bowl and place it on the plate"
+ ],
+ "put the wine bottle on the rack": [
+ "pick up the wine bottle and place it on the rack"
+ ],
+ "put both the alphabet soup and the tomato sauce in the basket": [
+ "pick up the alphabet soup can and place it in the basket then pick up the tomato sauce can and place it in the basket"
+ ],
+ "put both the cream cheese box and the butter in the basket": [
+ "pick up the cream cheese box and place it in the basket then pick up the butter stick and place it in the basket"
+ ],
+ "turn on the stove and put the moka pot on it": [
+ "turn on the stove then pick up the moka pot and place it on the stove"
+ ],
+ "put the black bowl in the bottom drawer of the cabinet and close it": [
+ "pick up the black bowl and place it in the bottom drawer of the cabinet then close the drawer"
+ ],
+ "put the white mug on the left plate and put the yellow and white mug on the right plate": [
+ "pick up the white mug and place it on the left plate then pick up the yellow-and-white mug and place it on the right plate"
+ ],
+ "pick up the book and place it in the back compartment of the caddy": [
+ "pick up the book and place it in the back compartment of the caddy"
+ ],
+ "put the white mug on the plate and put the chocolate pudding to the right of the plate": [
+ "pick up the white mug and place it on the plate then pick up the chocolate pudding cup and place it to the right of the plate"
+ ],
+ "put both the alphabet soup and the cream cheese box in the basket": [
+ "pick up the alphabet soup can and place it in the basket then pick up the cream cheese box and place it in the basket"
+ ],
+ "put both moka pots on the stove": [
+ "pick up one moka pot and place it on the stove then pick up the other moka pot and place it on the stove"
+ ],
+ "put the yellow and white mug in the microwave and close it": [
+ "pick up the yellow-and-white mug and place it in the microwave then close the microwave"
+ ]
+ }
+ },
+ "revision": null,
+ "use_imagenet_stats": true,
+ "video_backend": "torchcodec",
+ "only_robot_type": "so100",
+ "exclude_tasks": null,
+ "report_task_stats": true
+ },
+ "env": null,
+ "policy": {
+ "type": "smolvla",
+ "n_obs_steps": 1,
+ "normalization_mapping": {
+ "VISUAL": "IDENTITY",
+ "STATE": "MEAN_STD",
+ "ACTION": "MEAN_STD"
+ },
+ "input_features": {
+ "observation.images.wrist_image": {
+ "type": "VISUAL",
+ "shape": [
+ 256,
+ 256,
+ 3
+ ]
+ },
+ "observation.images.image": {
+ "type": "VISUAL",
+ "shape": [
+ 256,
+ 256,
+ 3
+ ]
+ },
+ "observation.state": {
+ "type": "STATE",
+ "shape": [
+ 8
+ ]
+ }
+ },
+ "output_features": {
+ "action": {
+ "type": "ACTION",
+ "shape": [
+ 7
+ ]
+ }
+ },
+ "device": "cuda",
+ "use_amp": false,
+ "use_peft": false,
+ "push_to_hub": true,
+ "repo_id": "combined_frozen_chunk50_noproprio_unified_text_prompt_fullvlm_1010",
+ "private": null,
+ "tags": null,
+ "license": null,
+ "use_proprio": false,
+ "chunk_size": 50,
+ "n_action_steps": 50,
+ "normalize_visual": "identity",
+ "normalize_state": "mean_std",
+ "normalize_action": "mean_std",
+ "max_state_dim": 32,
+ "max_action_dim": 32,
+ "resize_imgs_with_padding": [
+ 512,
+ 512
+ ],
+ "empty_cameras": 0,
+ "adapt_to_pi_aloha": false,
+ "use_delta_joint_actions_aloha": false,
+ "tokenizer_max_length": 48,
+ "num_steps": 10,
+ "use_cache": true,
+ "freeze_vision_encoder": true,
+ "train_expert_only": true,
+ "train_state_proj": true,
+ "optimizer_lr": 0.0001,
+ "optimizer_betas": [
+ 0.9,
+ 0.95
+ ],
+ "optimizer_eps": 1e-08,
+ "optimizer_weight_decay": 1e-10,
+ "optimizer_grad_clip_norm": 10,
+ "scheduler_warmup_steps": 1000,
+ "scheduler_decay_steps": 30000,
+ "scheduler_decay_lr": 2.5e-06,
+ "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
+ "load_vlm_weights": true,
+ "vlm_model_dtype": "bfloat16",
+ "add_image_special_tokens": false,
+ "attention_mode": "cross_attn",
+ "prefix_length": -1,
+ "pad_language_to": "longest",
+ "num_expert_layers": -1,
+ "num_vlm_layers": -1,
+ "self_attn_every_n_layers": 2,
+ "expert_width_multiplier": 0.75,
+ "min_period": 0.004,
+ "max_period": 4.0
+ },
+ "output_dir": "outputs/combined_frozen_unified_text_prompt_fullvlm_1010/combined_frozen_chunk50_noproprio_unified_text_prompt_fullvlm_1010/2025-10-11/12-19-59",
+ "exp_name": "combined_frozen_chunk50_noproprio_unified_text_prompt_fullvlm_1010/2025-10-11/12-19-59",
+ "group_name": "combined_frozen_unified_text_prompt_fullvlm_1010",
+ "resume": false,
+ "seed": 1000,
+ "num_workers": 8,
+ "batch_size": 64,
+ "update_steps": 50000,
+ "eval_freq": 20000,
+ "log_freq": 200,
+ "save_checkpoint": true,
+ "save_freq": 10000,
+ "use_policy_training_preset": true,
+ "optimizer": {
+ "type": "adamw",
+ "lr": 0.0001,
+ "weight_decay": 1e-10,
+ "grad_clip_norm": 10,
+ "betas": [
+ 0.9,
+ 0.95
+ ],
+ "eps": 1e-08
+ },
+ "scheduler": {
+ "type": "cosine_decay_with_warmup",
+ "num_warmup_steps": 1000,
+ "num_decay_steps": 30000,
+ "peak_lr": 0.0001,
+ "decay_lr": 2.5e-06
+ },
+ "eval": {
+ "n_episodes": 50,
+ "batch_size": 50,
+ "use_async_envs": false
+ },
+ "log_with": "wandb",
+ "wandb": {
+ "enable": false,
+ "disable_artifact": false,
+ "project": "lerobot",
+ "entity": null,
+ "notes": null,
+ "run_id": "qlamyc4i",
+ "mode": null
+ },
+ "gradient_accumulation_steps": 1,
+ "use_peft": false,
+ "autocast_adapter_dtype": true,
+ "peft": {
+ "target_modules": null,
+ "modules_to_save": null,
+ "method_type": "LORA",
+ "init_type": null,
+ "r": 64,
+ "lora_alpha": 128,
+ "fullfinetune_vlm_patch_embeddings": false,
+ "fullfinetune_vlm_vision_model": false
+ }
+ }
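
The training configuration above records every hyperparameter of the run (LIBERO dataset with unified text prompts, batch size 64, 50,000 update steps, AdamW with cosine decay after a 1,000-step warmup, seed 1000). Below is a minimal sketch for fetching it and printing the key settings, under the same assumptions as the earlier snippets; on recent LeRobot versions a file like this can also be passed back to `lerobot-train` via `--config_path` to reproduce or resume the run.

```python
# Sketch: fetch train_config.json and print the main training hyperparameters
# (placeholder repo id; keys follow the file shown above).
import json

from huggingface_hub import hf_hub_download

path = hf_hub_download("<hf_user>/<desired_policy_repo_id>", "train_config.json")
with open(path) as f:
    train_cfg = json.load(f)

print("dataset:", train_cfg["dataset"]["repo_id"])
print("batch_size:", train_cfg["batch_size"], "update_steps:", train_cfg["update_steps"])
print("optimizer:", train_cfg["optimizer"]["type"], "lr:", train_cfg["optimizer"]["lr"])
print("scheduler:", train_cfg["scheduler"]["type"])
```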