ayushadarsh7 committed · verified
Commit 6b0d8b1 · Parent(s): cec0d92

Update Hyper_Params.md

Files changed (1): Hyper_Params.md (+155 -1)
Hyper_Params.md
 
# Finetuning middle layers (vision module layers = 22 to 26 + projector + language layers = 0 to 4)
# gemma-3-4b-it
# No. of images used in training = 500

Trainable parameters (name | shape | requires_grad):
```
model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight | shape=(4304, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias | shape=(4304,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight | shape=(1152, 4304) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight | shape=(4304, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias | shape=(4304,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight | shape=(1152, 4304) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight | shape=(4304, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias | shape=(4304,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight | shape=(1152, 4304) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight | shape=(4304, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias | shape=(4304,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight | shape=(1152, 4304) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight | shape=(1152, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight | shape=(4304, 1152) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias | shape=(4304,) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight | shape=(1152, 4304) | requires_grad=True
model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.post_layernorm.weight | shape=(1152,) | requires_grad=True
model.vision_tower.vision_model.post_layernorm.bias | shape=(1152,) | requires_grad=True
model.multi_modal_projector.mm_input_projection_weight | shape=(1152, 2560) | requires_grad=True
model.multi_modal_projector.mm_soft_emb_norm.weight | shape=(1152,) | requires_grad=True
model.language_model.embed_tokens.weight | shape=(262208, 2560) | requires_grad=True
model.language_model.layers.0.self_attn.q_proj.weight | shape=(2048, 2560) | requires_grad=True
model.language_model.layers.0.self_attn.k_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.0.self_attn.v_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.0.self_attn.o_proj.weight | shape=(2560, 2048) | requires_grad=True
model.language_model.layers.0.self_attn.q_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.0.self_attn.k_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.0.mlp.gate_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.0.mlp.up_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.0.mlp.down_proj.weight | shape=(2560, 10240) | requires_grad=True
model.language_model.layers.0.input_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.0.post_attention_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.0.pre_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.0.post_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.1.self_attn.q_proj.weight | shape=(2048, 2560) | requires_grad=True
model.language_model.layers.1.self_attn.k_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.1.self_attn.v_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.1.self_attn.o_proj.weight | shape=(2560, 2048) | requires_grad=True
model.language_model.layers.1.self_attn.q_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.1.self_attn.k_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.1.mlp.gate_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.1.mlp.up_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.1.mlp.down_proj.weight | shape=(2560, 10240) | requires_grad=True
model.language_model.layers.1.input_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.1.post_attention_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.1.pre_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.1.post_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.2.self_attn.q_proj.weight | shape=(2048, 2560) | requires_grad=True
model.language_model.layers.2.self_attn.k_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.2.self_attn.v_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.2.self_attn.o_proj.weight | shape=(2560, 2048) | requires_grad=True
model.language_model.layers.2.self_attn.q_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.2.self_attn.k_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.2.mlp.gate_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.2.mlp.up_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.2.mlp.down_proj.weight | shape=(2560, 10240) | requires_grad=True
model.language_model.layers.2.input_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.2.post_attention_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.2.pre_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.2.post_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.3.self_attn.q_proj.weight | shape=(2048, 2560) | requires_grad=True
model.language_model.layers.3.self_attn.k_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.3.self_attn.v_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.3.self_attn.o_proj.weight | shape=(2560, 2048) | requires_grad=True
model.language_model.layers.3.self_attn.q_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.3.self_attn.k_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.3.mlp.gate_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.3.mlp.up_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.3.mlp.down_proj.weight | shape=(2560, 10240) | requires_grad=True
model.language_model.layers.3.input_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.3.post_attention_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.3.pre_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.3.post_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.4.self_attn.q_proj.weight | shape=(2048, 2560) | requires_grad=True
model.language_model.layers.4.self_attn.k_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.4.self_attn.v_proj.weight | shape=(1024, 2560) | requires_grad=True
model.language_model.layers.4.self_attn.o_proj.weight | shape=(2560, 2048) | requires_grad=True
model.language_model.layers.4.self_attn.q_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.4.self_attn.k_norm.weight | shape=(256,) | requires_grad=True
model.language_model.layers.4.mlp.gate_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.4.mlp.up_proj.weight | shape=(10240, 2560) | requires_grad=True
model.language_model.layers.4.mlp.down_proj.weight | shape=(2560, 10240) | requires_grad=True
model.language_model.layers.4.input_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.4.post_attention_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.4.pre_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
model.language_model.layers.4.post_feedforward_layernorm.weight | shape=(2560,) | requires_grad=True
```
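This selection could be reproduced roughly as follows; a minimal sketch, assuming a recent transformers release with Gemma 3 support. The module names come from the listing itself, while the loading class, dtype, and regex-based selection are assumptions, not taken from this repo:

```python
# Hedged sketch: unfreeze vision encoder layers 22-26 (+ post_layernorm),
# the multimodal projector, embed_tokens, and language-model layers 0-4,
# then print the trainable parameters in the same format as the listing.
import re
import torch
from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained(
    "google/gemma-3-4b-it",
    torch_dtype=torch.bfloat16,  # dtype choice is an assumption
)

# Regex over parameter names matching the modules trained here.
TRAINABLE = re.compile(
    r"vision_tower\.vision_model\.encoder\.layers\.2[2-6]\."
    r"|vision_tower\.vision_model\.post_layernorm\."
    r"|multi_modal_projector\."
    r"|language_model\.embed_tokens\."
    r"|language_model\.layers\.[0-4]\."
)

for name, param in model.named_parameters():
    param.requires_grad = TRAINABLE.search(name) is not None
    if param.requires_grad:
        # Same "name | shape | requires_grad" format as the listing above.
        print(f"{name} | shape={tuple(param.shape)} | requires_grad={param.requires_grad}")
```

Note that the listing above shows no layer_norm1 entries for vision layer 22, so a purely pattern-based selection like this is a close approximation rather than an exact reproduction.
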
Hyperparameters of this model

lr: 2e-5

epochs: 10

batch size: 1

grad accumulation steps: 4
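
These values map directly onto a standard Hugging Face TrainingArguments; a minimal sketch, where output_dir and the rest of the Trainer wiring are placeholders, not taken from this repo:

```python
from transformers import TrainingArguments

# Hyperparameters as listed above; output_dir is hypothetical.
training_args = TrainingArguments(
    output_dir="gemma3-4b-it-middle-layers",  # placeholder path
    learning_rate=2e-5,                       # lr: 2e-5
    num_train_epochs=10,                      # epochs: 10
    per_device_train_batch_size=1,            # batch size: 1
    gradient_accumulation_steps=4,            # grad accumulation steps: 4
)
```

With batch size 1 and 4 accumulation steps, the effective batch size is 4, so the 500 training images correspond to roughly 125 optimizer steps per epoch.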