baidu
/

ERNIE-4.5-VL-28B-A3B-PT

Image-Text-to-Text

ernie4_5_moe_vl

feature-extraction

Model card Files and versions

SFLY5 committed on Jul 9

Commit

ce3b626

·

1 Parent(s): ba47981

performance optimization

Files changed (1) hide show

modeling_ernie_45t_vl.py +18 -24

modeling_ernie_45t_vl.py CHANGED Viewed

@@ -3457,33 +3457,27 @@ class VisionAttention(nn.Module):
         k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
             dim=0
         )
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
-        attention_mask = torch.full(
-            [1, seq_length, seq_length],
-            torch.finfo(q.dtype).min,
-            device=q.device,
-            dtype=q.dtype,
-        )
-        for i in range(1, len(cu_seqlens)):
-            attention_mask[
-                ...,
-                cu_seqlens[i - 1] : cu_seqlens[i],
-                cu_seqlens[i - 1] : cu_seqlens[i],
-            ] = 0
         q = q.transpose(0, 1)
         k = k.transpose(0, 1)
         v = v.transpose(0, 1)
-        attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
-        attn_weights = attn_weights + attention_mask
-        attn_weights = nn.functional.softmax(
-            attn_weights, dim=-1, dtype=torch.float32
-        ).to(q.dtype)
-        attn_output = torch.matmul(attn_weights, v)
-        attn_output = attn_output.transpose(0, 1)
-        attn_output = attn_output.reshape(seq_length, -1)
         attn_output = self.proj(attn_output)
         return attn_output

         k = apply_rotary_pos_emb_vision(k.unsqueeze(dim=0), rotary_pos_emb).squeeze(
             dim=0
         )
         q = q.transpose(0, 1)
         k = k.transpose(0, 1)
         v = v.transpose(0, 1)
+        lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+        splits = [
+            torch.split(tensor, lengths.tolist(), dim=1) for tensor in (q, k, v)
+        ]
+        attn_output = []
+        for q, k, v in zip(*splits):
+            attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+            attn_weights = nn.functional.softmax(
+                attn_weights, dim=-1, dtype=torch.float32
+            ).to(q.dtype)
+            attn_output_splited = torch.matmul(attn_weights, v)
+            attn_output_splited = attn_output_splited.transpose(0, 1)
+            attn_output.append(attn_output_splited)
+        attn_output = torch.cat(attn_output, dim=0)
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
         attn_output = self.proj(attn_output)
         return attn_output