dung-vpt-uney committed
Commit 9233720 · 1 Parent(s): 8f661e8

Update Visual-CoT demo - 2025-10-12 22:18:00

Fixes:
- Fix LLaVA config registration error (compatibility with newer transformers; see the sketch after this list)
- Update Gradio to latest version (security fixes)
- Auto-deployed via update script
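
The first fix listed above is not part of the hunks shown below, so here is only a hypothetical sketch of the kind of guard it describes: newer transformers releases ship a built-in "llava" model type, and re-registering a custom config under the same name raises a ValueError unless the registration is guarded. The import path and class names are the ones LLaVA-style repos usually expose; they are assumptions, not code from this commit.

# Hypothetical registration guard; the actual fix in this commit may differ.
from transformers import AutoConfig, AutoModelForCausalLM
from llava.model import LlavaConfig, LlavaLlamaForCausalLM  # assumed import path

try:
    AutoConfig.register("llava", LlavaConfig)
    AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
except ValueError:
    # Newer transformers already register a "llava" config; keep the built-in
    # mapping and use the custom classes directly instead of re-registering.
    pass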

llava/model/language_model/modeling_llamantk.py CHANGED
@@ -43,29 +43,115 @@ from transformers.utils import (
 from .configuration_llamantk import LlamaNTKConfig
 
 # Make flash_attn optional for Hugging Face Spaces compatibility
+# Support both original flash_attn and kernels-community version
+HAS_FLASH_ATTN = False
+HAS_KERNELS_FLASH_ATTN = False
+flash_attn_varlen_qkvpacked_func = None
+unpad_input = None
+pad_input = None
+kernels_flash_attn = None
+
+# Try kernels-community flash-attn first (pre-built, Spaces-compatible)
 try:
-    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
-    from flash_attn.bert_padding import unpad_input, pad_input
-    HAS_FLASH_ATTN = True
-except ModuleNotFoundError:
+    from kernels import get_kernel
+    kernels_flash_attn = get_kernel("kernels-community/flash-attn")
+    HAS_KERNELS_FLASH_ATTN = True
+    print("✓ Using kernels-community/flash-attn (pre-built)")
+except Exception as e:
+    pass
+
+# Fallback to original flash_attn if kernels not available
+if not HAS_KERNELS_FLASH_ATTN:
     try:
-        from flash_attn.flash_attn_interface import (
-            flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
-        )
+        from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
         from flash_attn.bert_padding import unpad_input, pad_input
         HAS_FLASH_ATTN = True
+        print("✓ Using original flash-attn")
     except ModuleNotFoundError:
-        # Flash attention not available - will use standard attention
-        HAS_FLASH_ATTN = False
-        flash_attn_varlen_qkvpacked_func = None
-        unpad_input = None
-        pad_input = None
+        try:
+            from flash_attn.flash_attn_interface import (
+                flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
+            )
+            from flash_attn.bert_padding import unpad_input, pad_input
+            HAS_FLASH_ATTN = True
+            print("✓ Using original flash-attn (legacy API)")
+        except ModuleNotFoundError:
+            # Flash attention not available - will use standard attention
+            print("⚠ Flash attention not available, using standard attention")
 
 from einops import rearrange
 
 
 logger = logging.get_logger(__name__)
 
+
+# Helper functions for padding/unpadding when using kernels (fallback if bert_padding not available)
+def simple_unpad_input(hidden_states, attention_mask):
+    """
+    Simple unpad implementation when flash_attn.bert_padding is not available
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = torch.nn.functional.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        hidden_states.flatten(0, 1)[indices],
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+def simple_pad_input(hidden_states, indices, batch, seqlen):
+    """
+    Simple pad implementation when flash_attn.bert_padding is not available
+    """
+    output = torch.zeros(
+        batch * seqlen, *hidden_states.shape[1:], dtype=hidden_states.dtype, device=hidden_states.device
+    )
+    output[indices] = hidden_states
+    return output.view(batch, seqlen, *hidden_states.shape[1:])
+
+
+# Helper function to call flash attention with unified API
+def call_flash_attn_qkvpacked(qkv, cu_seqlens, max_seqlen, dropout_p=0.0, softmax_scale=None, causal=True):
+    """
+    Unified wrapper for flash attention that supports both:
+    - kernels-community/flash-attn (pre-built)
+    - original flash-attn
+
+    Args:
+        qkv: [total_seq_len, 3, num_heads, head_dim] packed Q, K, V
+        cu_seqlens: cumulative sequence lengths
+        max_seqlen: maximum sequence length
+        dropout_p: dropout probability
+        softmax_scale: softmax scale
+        causal: whether to use causal mask
+
+    Returns:
+        output: [total_seq_len, num_heads, head_dim]
+    """
+    if HAS_KERNELS_FLASH_ATTN:
+        # Kernels API: separate q, k, v and use varlen_fwd()
+        q, k, v = qkv[:, 0], qkv[:, 1], qkv[:, 2]  # [seq_len, num_heads, head_dim]
+        output = kernels_flash_attn.varlen_fwd(
+            q=q,
+            k=k,
+            v=v,
+            cu_seqlens_q=cu_seqlens,
+            cu_seqlens_k=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_k=max_seqlen,
+        )[0]
+        return output
+    elif HAS_FLASH_ATTN:
+        # Original flash_attn API
+        return flash_attn_varlen_qkvpacked_func(
+            qkv, cu_seqlens, max_seqlen, dropout_p, softmax_scale=softmax_scale, causal=causal
+        )
+    else:
+        raise RuntimeError("Flash attention is not available")
+
 _CONFIG_FOR_DOC = "LlamaNTKConfig"
 
 
@@ -561,30 +647,40 @@ class LlamaAttention(nn.Module):
                 dtype=torch.int32,
                 device=qkv.device,
             )
-            output = flash_attn_varlen_qkvpacked_func(
+            output = call_flash_attn_qkvpacked(
                 qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
             )
             output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
         else:
             nheads = qkv.shape[-2]
             x = rearrange(qkv, "b s three h d -> b s (three h d)")
-            x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+            # Use appropriate unpad function based on available backend
+            if unpad_input is not None:
+                x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+            else:
+                x_unpad, indices, cu_q_lens, max_s = simple_unpad_input(x, key_padding_mask)
             x_unpad = rearrange(
                 x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
             )
-            output_unpad = flash_attn_varlen_qkvpacked_func(
+            output_unpad = call_flash_attn_qkvpacked(
                 x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
             )
-            output = rearrange(
-                pad_input(
+            # Use appropriate pad function based on available backend
+            if pad_input is not None:
+                padded = pad_input(
                     rearrange(output_unpad, "nnz h d -> nnz (h d)"),
                     indices,
                     bsz,
                     q_len,
-                ),
-                "b s (h d) -> b s h d",
-                h=nheads,
-            )
+                )
+            else:
+                padded = simple_pad_input(
+                    rearrange(output_unpad, "nnz h d -> nnz (h d)"),
+                    indices,
+                    bsz,
+                    q_len,
+                )
+            output = rearrange(padded, "b s (h d) -> b s h d", h=nheads)
         attn_output = self.o_proj(rearrange(output, "b s h d -> b s (h d)"))
     else:
         attn_weights = torch.matmul(
@@ -788,7 +884,7 @@ class LlamaAttention(nn.Module):
         use_cache: bool = False,
     ):
         # Use flash attention only if both config enables it AND flash_attn is available
-        if self.config.use_flash_attn and HAS_FLASH_ATTN:
+        if self.config.use_flash_attn and (HAS_FLASH_ATTN or HAS_KERNELS_FLASH_ATTN):
             return self.forward_flash_attn(
                 hidden_states,
                 attention_mask,
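
To make the new helpers concrete, here is a minimal CPU-only sketch exercising the simple_unpad_input / simple_pad_input pair added above (it assumes the two functions have been imported or copied from modeling_llamantk.py; the tensor shapes are illustrative only):

# Minimal round-trip check of the padding helpers introduced in this commit.
import torch

batch, seqlen, hidden = 2, 5, 8
hidden_states = torch.randn(batch, seqlen, hidden)
# First sequence has two padded positions, second is full length.
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

x_unpad, indices, cu_seqlens, max_s = simple_unpad_input(hidden_states, attention_mask)
print(x_unpad.shape)        # torch.Size([8, 8]) -- only unmasked positions remain
print(cu_seqlens.tolist())  # [0, 3, 8]          -- cumulative sequence lengths
print(max_s)                # 5

# Re-padding restores the (batch, seqlen, hidden) layout with zeros at pad slots.
repadded = simple_pad_input(x_unpad, indices, batch, seqlen)
assert repadded.shape == hidden_states.shape
assert torch.equal(repadded[attention_mask.bool()], hidden_states[attention_mask.bool()])

The varlen flash-attention call consumes exactly this unpadded layout: the cu_seqlens and max_s produced by the unpad step are the cu_q_lens and max_s arguments passed to call_flash_attn_qkvpacked in the hunk above.
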
requirements.txt CHANGED
@@ -18,6 +18,9 @@ einops==0.6.1
 einops-exts==0.0.4
 timm==0.6.13
 
+# Flash Attention via Kernels (pre-built, Spaces-compatible)
+kernels>=0.0.1 # For fast attention without compilation
+
 # Utilities
 Pillow>=10.0.0
 numpy>=1.24.0
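
After installing the updated requirements, a quick smoke test (hypothetical; the import path is inferred from the file location llava/model/language_model/modeling_llamantk.py and may differ) shows which attention backend the fallback chain selected:

# Prints the backend flags set at import time by the patched module.
from llava.model.language_model import modeling_llamantk as m

print("kernels-community backend:", m.HAS_KERNELS_FLASH_ATTN)
print("original flash-attn backend:", m.HAS_FLASH_ATTN)
# If both are False, the forward pass falls back to standard attention,
# matching the use_flash_attn dispatch in the diff above.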