Add build files

Browse files

Files changed (17) hide show

.gitattributes +2 -0
README.md +4 -4
benchmark_flash_sdpa.py +4 -4
build.toml +1 -1
{torch-ext/sdpa_flash → build/torch27-metal-aarch64-darwin/metal_flash_sdpa}/__init__.py +0 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/__init__.cpython-312.pyc +0 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/_custom_ops.cpython-312.pyc +0 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/_ops.cpython-312.pyc +0 -0
{torch-ext/sdpa_flash → build/torch27-metal-aarch64-darwin/metal_flash_sdpa}/_custom_ops.py +0 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_metal_flash_sdpa_032c946.abi3.so +3 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_metal_flash_sdpa_032c946.metallib +3 -0
build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_ops.py +9 -0
flake.lock +168 -0
flake.nix +1 -1
tests/test_flash_attention.py +24 -24
torch-ext/metal_flash_sdpa/__init__.py +11 -0
torch-ext/metal_flash_sdpa/_custom_ops.py +117 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.metallib filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -4,9 +4,9 @@ tags:
 - kernel
 ---
-# Metal Flash Attention
-A PyTorch extension that provides optimized Metal implementations of Flash Attention kernels for Metal.
 ## Supported Features
@@ -22,7 +22,7 @@ A PyTorch extension that provides optimized Metal implementations of Flash Atten
 ### flash_attention_varlen
 ```python
-sdpa_flash.flash_attention_varlen(
     out: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
@@ -50,7 +50,7 @@ sdpa_flash.flash_attention_varlen(
 Compatibility wrapper matching the original Flash Attention API:
 ```python
-out = sdpa_flash.flash_attn_varlen_func(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,

 - kernel
 ---
+# Metal Flash SDPA
+Optimized scaled dot-product attention (SDPA) kernels for Metal, inspired by Flash Attention.
 ## Supported Features
 ### flash_attention_varlen
 ```python
+metal_flash_sdpa.flash_attention_varlen(
     out: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
 Compatibility wrapper matching the original Flash Attention API:
 ```python
+out = metal_flash_sdpa.flash_attn_varlen_func(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,

benchmark_flash_sdpa.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import torch
 import time
-import sdpa_flash
 from typing import List, Tuple
 import numpy as np
@@ -49,7 +49,7 @@ def benchmark_flash_sdpa(
     # Define the function to benchmark
     def run_flash_sdpa():
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -108,7 +108,7 @@ def benchmark_flash_gqa(
     # Define the function to benchmark
     def run_flash_gqa():
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -164,7 +164,7 @@ def benchmark_variable_length(
     # Define the function to benchmark
     def run_varlen():
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,

 import torch
 import time
+import metal_flash_sdpa
 from typing import List, Tuple
 import numpy as np
     # Define the function to benchmark
     def run_flash_sdpa():
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
     # Define the function to benchmark
     def run_flash_gqa():
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
     # Define the function to benchmark
     def run_varlen():
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,

build.toml CHANGED Viewed

@@ -1,5 +1,5 @@
 [general]
-name = "sdpa_flash"
 universal = false
 [torch]

 [general]
+name = "metal_flash_sdpa"
 universal = false
 [torch]

{torch-ext/sdpa_flash → build/torch27-metal-aarch64-darwin/metal_flash_sdpa}/__init__.py RENAMED Viewed

File without changes

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (400 Bytes). View file

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/_custom_ops.cpython-312.pyc ADDED Viewed

Binary file (3.99 kB). View file

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/__pycache__/_ops.cpython-312.pyc ADDED Viewed

Binary file (595 Bytes). View file

{torch-ext/sdpa_flash → build/torch27-metal-aarch64-darwin/metal_flash_sdpa}/_custom_ops.py RENAMED Viewed

File without changes

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_metal_flash_sdpa_032c946.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b16d8835b2d0e86339095de249268b1f28ce41ba4dedca70326226e6267f8354
+size 104672

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_metal_flash_sdpa_032c946.metallib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8ae6e3b9eb9fb2d3a4d86983308d23009ee03097fa0668d973e561fb235110e
+size 622095

build/torch27-metal-aarch64-darwin/metal_flash_sdpa/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _metal_flash_sdpa_032c946
+ops = torch.ops._metal_flash_sdpa_032c946
+def add_op_namespace_prefix(op_name: str):
+    """
+    Return ``op_name`` qualified with this build's op namespace.
+
+    The namespace ``_metal_flash_sdpa_032c946`` is the same name used for the
+    extension module imported at the top of this file and for the
+    ``torch.ops`` handle bound to ``ops``.
+
+    Args:
+        op_name: Bare operator name, e.g. ``"flash_attention_varlen"``.
+
+    Returns:
+        The string ``"_metal_flash_sdpa_032c946::<op_name>"``.
+    """
+    return f"_metal_flash_sdpa_032c946::{op_name}"

flake.lock ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "nodes": {
+    "flake-compat": {
+      "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-compat_2": {
+      "locked": {
+        "lastModified": 1733328505,
+        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+        "type": "github"
+      },
+      "original": {
+        "owner": "edolstra",
+        "repo": "flake-compat",
+        "type": "github"
+      }
+    },
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "flake-utils_2": {
+      "inputs": {
+        "systems": "systems_2"
+      },
+      "locked": {
+        "lastModified": 1731533236,
+        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "hf-nix": {
+      "inputs": {
+        "flake-compat": "flake-compat_2",
+        "flake-utils": "flake-utils_2",
+        "nixpkgs": "nixpkgs"
+      },
+      "locked": {
+        "lastModified": 1751968576,
+        "narHash": "sha256-cmKrlWpNTG/hq1bCaHXfbdm9T+Y6V+5//EHAVc1TLBE=",
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "rev": "3fcd1e1b46da91b6691261640ffd6b7123d0cb9e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "hf-nix",
+        "type": "github"
+      }
+    },
+    "kernel-builder": {
+      "inputs": {
+        "flake-compat": "flake-compat",
+        "flake-utils": "flake-utils",
+        "hf-nix": "hf-nix",
+        "nixpkgs": [
+          "kernel-builder",
+          "hf-nix",
+          "nixpkgs"
+        ]
+      },
+      "locked": {
+        "lastModified": 1752505139,
+        "narHash": "sha256-gdIuOhU/adUjNGNgIk1cDTfN7J2tH0UuHSU3FanFfxE=",
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "rev": "a5cebbc02f01a9d359d18ceb9e8bdadead2a289a",
+        "type": "github"
+      },
+      "original": {
+        "owner": "huggingface",
+        "repo": "kernel-builder",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1747820358,
+        "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
+        "owner": "danieldk",
+        "repo": "nixpkgs",
+        "rev": "d3c1681180717528068082103bf323147de6ab0b",
+        "type": "github"
+      },
+      "original": {
+        "owner": "danieldk",
+        "ref": "cudatoolkit-12.9-kernel-builder",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "kernel-builder": "kernel-builder"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    },
+    "systems_2": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}

flake.nix CHANGED Viewed

@@ -2,7 +2,7 @@
   description = "Flake for SDPA kernel";
   inputs = {
-    kernel-builder.url = "path:../..";
   };
   outputs =

   description = "Flake for SDPA kernel";
   inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder";
   };
   outputs =

tests/test_flash_attention.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import torch
 import pytest
-import sdpa_flash
 def create_cu_seqlens(seq_lengths):
@@ -34,7 +34,7 @@ def test_flash_attention_single_sequence(dtype, head_dim):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -102,7 +102,7 @@ def test_flash_attention_variable_lengths(dtype, head_dim):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -173,7 +173,7 @@ def test_flash_attention_causal(dtype, head_dim):
     # Call Flash Attention with causal mask
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -247,7 +247,7 @@ def test_flash_attention_gqa(dtype, head_dim):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -309,7 +309,7 @@ def test_flash_attention_head_dimensions(head_dim):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -353,7 +353,7 @@ def test_flash_attention_large_head_dim(dtype):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -420,7 +420,7 @@ def test_flash_attention_large_head_dim_causal(dtype):
     # Call Flash Attention with causal mask
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -485,7 +485,7 @@ def test_flash_attention_large_head_dim_gqa():
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -528,7 +528,7 @@ def test_flash_attention_edge_cases():
     cu_seqlens = create_cu_seqlens([1])
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -556,7 +556,7 @@ def test_flash_attention_edge_cases():
     out = torch.empty_like(query)
     # This should handle empty sequences gracefully
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -582,7 +582,7 @@ def test_flash_attention_unsupported_cases():
     out = torch.empty_like(query)
     with pytest.raises(RuntimeError, match="Head dimension .* is not supported"):
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -606,7 +606,7 @@ def test_flash_attention_unsupported_cases():
     # The function signature no longer accepts mask parameter
     with pytest.raises(TypeError):
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -627,7 +627,7 @@ def test_flash_attention_unsupported_cases():
     # This will silently fail (output will be unchanged)
     # We can detect this by initializing output to a known value
     out = torch.full_like(query, -999.0)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -668,7 +668,7 @@ def test_flash_attention_small_sequences(dtype, head_dim):
         # Call Flash Attention
         out = torch.empty_like(query)
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -734,7 +734,7 @@ def test_flash_attention_cross_attention(dtype, head_dim):
         # Call Flash Attention
         out = torch.empty_like(query)
-        sdpa_flash.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
@@ -794,7 +794,7 @@ def test_flash_attention_large_sequences(dtype):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -854,7 +854,7 @@ def test_flash_attention_gqa_ratios(gqa_ratio, head_dim):
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -911,7 +911,7 @@ def test_flash_attention_single_query_token():
     # Call Flash Attention
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -959,7 +959,7 @@ def test_flash_attn_varlen_func():
     v = torch.randn(total_tokens, num_heads, head_dim, device="mps")
     # Call the compatibility function
-    out = sdpa_flash.flash_attn_varlen_func(
         q=q,
         k=k,
         v=v,
@@ -977,7 +977,7 @@ def test_flash_attn_varlen_func():
     assert out.abs().max().item() > 0
     # Test with causal
-    out_causal = sdpa_flash.flash_attn_varlen_func(
         q=q,
         k=k,
         v=v,
@@ -1020,7 +1020,7 @@ def test_flash_attention_softcapping(dtype, head_dim):
     # Call Flash Attention with softcapping
     out = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
@@ -1083,7 +1083,7 @@ def test_flash_attention_softcapping_edge_cases(dtype):
     # With softcapping = 1.0 (no effect)
     out_no_cap = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out_no_cap,
         query=query,
         key=key,
@@ -1114,7 +1114,7 @@ def test_flash_attention_softcapping_edge_cases(dtype):
     # Test with very large softcapping value
     out_large_cap = torch.empty_like(query)
-    sdpa_flash.flash_attention_varlen(
         out=out_large_cap,
         query=query,
         key=key,

 import torch
 import pytest
+import metal_flash_sdpa
 def create_cu_seqlens(seq_lengths):
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention with causal mask
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention with causal mask
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     cu_seqlens = create_cu_seqlens([1])
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     out = torch.empty_like(query)
     # This should handle empty sequences gracefully
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     out = torch.empty_like(query)
     with pytest.raises(RuntimeError, match="Head dimension .* is not supported"):
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
     # The function signature no longer accepts mask parameter
     with pytest.raises(TypeError):
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
     # This will silently fail (output will be unchanged)
     # We can detect this by initializing output to a known value
     out = torch.full_like(query, -999.0)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
         # Call Flash Attention
         out = torch.empty_like(query)
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
         # Call Flash Attention
         out = torch.empty_like(query)
+        metal_flash_sdpa.flash_attention_varlen(
             out=out,
             query=query,
             key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # Call Flash Attention
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     v = torch.randn(total_tokens, num_heads, head_dim, device="mps")
     # Call the compatibility function
+    out = metal_flash_sdpa.flash_attn_varlen_func(
         q=q,
         k=k,
         v=v,
     assert out.abs().max().item() > 0
     # Test with causal
+    out_causal = metal_flash_sdpa.flash_attn_varlen_func(
         q=q,
         k=k,
         v=v,
     # Call Flash Attention with softcapping
     out = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out,
         query=query,
         key=key,
     # With softcapping = 1.0 (no effect)
     out_no_cap = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out_no_cap,
         query=query,
         key=key,
     # Test with very large softcapping value
     out_large_cap = torch.empty_like(query)
+    metal_flash_sdpa.flash_attention_varlen(
         out=out_large_cap,
         query=query,
         key=key,

torch-ext/metal_flash_sdpa/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from ._custom_ops import (
+    flash_attention_varlen,
+    flash_attn_varlen_func,
+)
+from ._ops import ops
+__all__ = [
+    "flash_attention_varlen",
+    "flash_attn_varlen_func",
+    "ops",
+]

torch-ext/metal_flash_sdpa/_custom_ops.py ADDED Viewed

	@@ -0,0 +1,117 @@

+from typing import List, Optional
+import torch
+from ._ops import ops
+def flash_attention_varlen(
+    out: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    max_seqlen_q: int,
+    max_seqlen_k: int,
+    do_causal: bool = False,
+    scale: Optional[float] = None,
+    softcapping: float = 1.0,
+) -> None:
+    """
+    Run Flash Attention over variable-length sequences, writing into ``out``.
+
+    This is a thin wrapper: it fills in the default ``scale`` and forwards
+    every argument positionally to ``ops.flash_attention_varlen``. Nothing is
+    returned; the result is delivered through the ``out`` tensor.
+
+    Args:
+        out: Output tensor of shape [total_q_tokens, num_heads, head_dim]
+        query: Query tensor of shape [total_q_tokens, num_heads, head_dim]
+        key: Key tensor of shape [total_k_tokens, num_heads_kv, head_dim]
+        value: Value tensor of shape [total_k_tokens, num_heads_kv, head_dim]
+        cu_seqlens_q: Cumulative sequence lengths for queries, shape [batch_size + 1], dtype must be torch.int32
+        cu_seqlens_k: Cumulative sequence lengths for keys, shape [batch_size + 1], dtype must be torch.int32
+        max_seqlen_q: Maximum sequence length in the query batch
+        max_seqlen_k: Maximum sequence length in the key batch
+        do_causal: Whether to apply causal masking
+        scale: Attention scale factor (default: 1/sqrt(head_dim), computed
+            from the last dimension of ``query``)
+        softcapping: Softcapping value forwarded unchanged to the kernel
+            (default: 1.0, which the test suite treats as "no effect").
+            NOTE(review): an earlier revision of this docstring said the value
+            must be 1.0, but the tests exercise other values; confirm the
+            supported range against the kernel.
+
+    Note:
+        - cu_seqlens_q and cu_seqlens_k must have dtype torch.int32 for Metal compatibility
+        - Supported head dimensions: 32, 64, 72, 80, 96, 128
+        - Masks are not supported
+        - No argument validation happens in this wrapper; per the test suite,
+          an unsupported head dimension surfaces as a RuntimeError from the
+          underlying op.
+    """
+    # Only substitute the default when the caller did not pass a scale, so an
+    # explicit value (including 0.0) is forwarded as given.
+    if scale is None:
+        scale = query.shape[-1] ** -0.5
+    # Arguments are passed positionally, so their order must match the
+    # registered op's signature exactly.
+    ops.flash_attention_varlen(
+        out,
+        query,
+        key,
+        value,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        do_causal,
+        scale,
+        softcapping,
+    )
+def flash_attn_varlen_func(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    max_seqlen_q: int,
+    max_seqlen_k: int,
+    dropout_p: float = 0.0,
+    softmax_scale: Optional[float] = None,
+    causal: bool = False,
+    window_size: tuple = (-1, -1),
+    alibi_slopes: Optional[torch.Tensor] = None,
+    deterministic: bool = False,
+    return_attn_probs: bool = False,
+) -> torch.Tensor:
+    """
+    Flash Attention function with API compatible with the original Flash Attention.
+
+    Rejects the unsupported options, allocates an output tensor shaped like
+    ``q``, and delegates to ``flash_attention_varlen``.
+
+    Args:
+        q: Query tensor; the output is allocated with ``torch.empty_like(q)``.
+        k: Key tensor.
+        v: Value tensor.
+        cu_seqlens_q: Cumulative sequence lengths for queries.
+        cu_seqlens_k: Cumulative sequence lengths for keys.
+        max_seqlen_q: Maximum sequence length in the query batch.
+        max_seqlen_k: Maximum sequence length in the key batch.
+        dropout_p: Must not be greater than 0.
+        softmax_scale: Forwarded as ``scale``; ``None`` selects the default
+            applied by ``flash_attention_varlen``.
+        causal: Forwarded as ``do_causal``.
+        window_size: Must equal the tuple ``(-1, -1)``.
+        alibi_slopes: Must be ``None``.
+        deterministic: Accepted for signature compatibility; this function
+            never reads it.
+        return_attn_probs: Must be falsy.
+
+    Returns:
+        The newly allocated output tensor.
+
+    Raises:
+        NotImplementedError: If dropout, window attention, ALiBi slopes, or
+            returning attention probabilities is requested.
+
+    Note: This implementation does not support:
+    - dropout
+    - window attention
+    - alibi slopes
+    - returning attention probabilities
+    """
+    # Fail fast on unsupported features before allocating anything.
+    if dropout_p > 0:
+        raise NotImplementedError("Dropout is not supported in this implementation")
+    # Compared against a tuple: an equivalent list such as [-1, -1] does not
+    # compare equal and is therefore rejected as well.
+    if window_size != (-1, -1):
+        raise NotImplementedError("Window attention is not supported")
+    if alibi_slopes is not None:
+        raise NotImplementedError("ALiBi is not supported")
+    if return_attn_probs:
+        raise NotImplementedError("Returning attention probabilities is not supported")
+    # Uninitialized output buffer matching q; the kernel call below is
+    # expected to fill it.
+    out = torch.empty_like(q)
+    # Delegate to the in-place wrapper; softcapping is pinned to 1.0 because
+    # this compatibility signature has no softcap parameter.
+    flash_attention_varlen(
+        out=out,
+        query=q,
+        key=k,
+        value=v,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_k=cu_seqlens_k,
+        max_seqlen_q=max_seqlen_q,
+        max_seqlen_k=max_seqlen_k,
+        do_causal=causal,
+        scale=softmax_scale,
+        softcapping=1.0,
+    )
+    return out
+__all__ = [
+    "flash_attention_varlen",
+    "flash_attn_varlen_func",
+]