Aduc-sdr-2_5s

Paused

App Files Files Community

carlex3321 committed on Sep 29

Commit

1c5dcff

verified ·

1 Parent(s): 4c17046

Update info.sh

Browse files

Files changed (1) hide show

info.sh +123 -79

info.sh CHANGED Viewed

@@ -2,112 +2,156 @@
 set -euo pipefail
 echo "================= RUNTIME CAPABILITIES ================="
-nvidia-smi || true
-echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
-echo "NVCC: $(nvcc --version 2>/dev/null | tail -n1 || echo 'N/A')"
 echo
 echo "[PyTorch / CUDA backend]"
-python3 - <<'PY'
-import json
-try:
-    import torch
-    info = {
-      "torch": torch.__version__,
-      "cuda_available": torch.cuda.is_available(),
-      "cuda_device_count": torch.cuda.device_count(),
-      "cuda_runtime_version": getattr(torch.version, "cuda", None),
-      "cudnn_version": (torch.backends.cudnn.version() if torch.cuda.is_available() else None),
-      "tf32": (torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else None),
-      "flash_sdp": (torch.backends.cuda.flash_sdp_enabled() if hasattr(torch.backends.cuda,"flash_sdp_enabled") else None),
-      "mem_efficient_sdp": (torch.backends.cuda.mem_efficient_sdp_enabled() if hasattr(torch.backends.cuda,"mem_efficient_sdp_enabled") else None),
-      "math_sdp": (torch.backends.cuda.math_sdp_enabled() if hasattr(torch.backends.cuda,"math_sdp_enabled") else None),
-    }
-    print(json.dumps(info, indent=2))
-    if torch.cuda.is_available():
-        for i in range(torch.cuda.device_count()):
-            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-except Exception as e:
-    print(f"[ERR torch] {type(e).__name__}: {e}")
 PY
-echo
 echo "[Apex]"
-python3 - <<'PY'
 try:
-    import importlib
-    importlib.import_module("apex.normalization")
-    print("apex.normalization: OK")
 except Exception as e:
-    print(f"Apex: ERR {type(e).__name__}: {e}")
 PY
-echo
 echo "[FlashAttention]"
-python3 - <<'PY'
 try:
     import flash_attn
-    print(f"flash_attn: OK (version={getattr(flash_attn,'__version__', 'unknown')})")
-    try:
-        import flash_attn_2_cuda
-        print("flash_attn_2_cuda: OK")
-    except Exception as e:
-        print(f"flash_attn_2_cuda: ERR {type(e).__name__}: {e}")
-except Exception as e:
-    print(f"flash_attn: ERR {type(e).__name__}: {e}")
 PY
-echo
 echo "[Triton]"
-python3 - <<'PY'
 try:
-    import triton
-    print(f"triton: OK (version={getattr(triton,'__version__','unknown')})")
-    try:
-        import triton.ops
-        print("triton.ops: legacy module present")
-    except ModuleNotFoundError:
-        print("triton.ops: not present (ok on Triton>=3.x)")
-    except Exception as e:
-        print(f"triton.ops: WARN {type(e).__name__}: {e}")
 except Exception as e:
-    print(f"triton: ERR {type(e).__name__}: {e}")
 PY
-echo
 echo "[BitsAndBytes (Q8/Q4)]"
-python3 - <<'PY'
 try:
-    import bitsandbytes as bnb
-    v = getattr(bnb, "__version__", "unknown")
-    print(f"bitsandbytes: OK (version={v})")
-    try:
-        import bitsandbytes.triton.int8_matmul_mixed_dequantize as q8
-        print("bnb.triton.int8_matmul_mixed_dequantize: OK")
-    except ModuleNotFoundError:
-        print("bnb.q8.triton: not present (disabled or no GPU build)")
-    except Exception as e:
-        print(f"bnb.q8.triton: WARN {type(e).__name__}: {e}")
 except Exception as e:
-    print(f"bitsandbytes: ERR {type(e).__name__}: {e}")
 PY
-echo
 echo "[Transformers / Diffusers / XFormers]"
-python3 - <<'PY'
-import importlib
-def ver(name):
-    try:
-        m = importlib.import_module(name)
-        return getattr(m, "__version__", "unknown")
-    except Exception as e:
-        return f"ERR:{type(e).__name__}"
-print("transformers:", ver("transformers"))
-print("diffusers:", ver("diffusers"))
-print("xformers:", ver("xformers"))
 PY
-echo
 echo "[Distribuído / NCCL Env]"
-env | egrep 'MASTER_|NCCL|CUDA_VISIBLE_DEVICES|TORCH_|ENABLE_' | sort
 echo "================= END CAPABILITIES ================="

 set -euo pipefail
 echo "================= RUNTIME CAPABILITIES ================="
+date
+if command -v nvidia-smi >/dev/null 2>&1; then
+  nvidia-smi
+else
+  echo "nvidia-smi: not available"
+fi
 echo
+echo "CUDA_HOME: ${CUDA_HOME:-/usr/local/cuda}"
+if command -v nvcc >/dev/null 2>&1; then
+  nvcc --version || true
+else
+  echo "nvcc: not available"
+fi
+echo
 echo "[PyTorch / CUDA backend]"
+python - <<'PY'
+import json, os, torch
+info = {
+  "torch": getattr(torch, "__version__", None),
+  "cuda_available": torch.cuda.is_available(),
+  "cuda_device_count": torch.cuda.device_count(),
+  "cuda_runtime_version": getattr(torch.version, "cuda", None),
+  "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None,
+  "tf32": torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else None,
+  "flash_sdp": torch.backends.cuda.enable_flash_sdp if torch.cuda.is_available() else None,
+  "mem_efficient_sdp": torch.backends.cuda.enable_mem_efficient_sdp if torch.cuda.is_available() else None,
+  "math_sdp": torch.backends.cuda.enable_math_sdp if torch.cuda.is_available() else None,
+}
+print(json.dumps(info, indent=2))
+for i in range(min(torch.cuda.device_count(), 8)):
+  print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
 PY
+echo
 echo "[Apex]"
+python - <<'PY'
 try:
+  from apex.normalization import FusedLayerNorm, FusedRMSNorm
+  import importlib; importlib.import_module("fused_layer_norm_cuda")
+  print("apex.normalization: OK")
 except Exception as e:
+  print("apex.normalization: FAIL ->", e)
 PY
+echo
 echo "[FlashAttention]"
+python - <<'PY'
+import importlib, sys
+mods = ["flash_attn", "flash_attn_2_cuda"]
+for m in mods:
+  try:
+    importlib.import_module(m); print(f"{m}: OK")
+  except Exception as e:
+    print(f"{m}: FAIL -> {e}")
+PY
+echo
+echo "[FlashAttention LN test]"
+python - <<'PY'
+import os, warnings, importlib
+warnings.filterwarnings("ignore", category=FutureWarning)
+def ok_import(names):
+    for n in names:
+        try:
+            importlib.import_module(n)
+            print(f"  [+] import '{n}' OK")
+            return True
+        except Exception as e:
+            print(f"  [-] import '{n}' fail: {e}")
+    return False
+fa_ver = None
 try:
     import flash_attn
+    fa_ver = getattr(flash_attn, "__version__", None)
+except Exception:
+    pass
+try:
+    import torch
+    tv = torch.__version__
+    cu = getattr(torch.version, "cuda", None)
+except Exception:
+    tv, cu = "unknown", "unknown"
+print(f"  flash_attn version: {fa_ver}")
+print(f"  torch: {tv} | cuda: {cu} | TORCH_CUDA_ARCH_LIST={os.getenv('TORCH_CUDA_ARCH_LIST')}")
+names_to_try = [
+    "flash_attn_2_cuda",
+    "flash_attn.ops.layer_norm",
+    "flash_attn.layers.layer_norm",
+]
+ok = ok_import(names_to_try)
+if not ok:
+    print("  Hint: faltam kernels de layer_norm/RMSNorm do FlashAttention.")
+    print("  Aceleração ficará reduzida; para instalar:")
+    print("   - Rodar builder para compilar e instalar flash_attn e salvar wheel para reuso;")
+    print("   - Ou instalar manualmente a tag compatível: Dao-AILab/flash-attention (csrc/layer_norm).")
+    print("  Doc: https://github.com/Dao-AILab/flash-attention/tree/main/csrc/layer_norm")
 PY
+echo
 echo "[Triton]"
+python - <<'PY'
 try:
+  import triton
+  print("triton:", triton.__version__)
+  try:
+    import triton.ops as _; print("triton.ops: OK")
+  except Exception as e:
+    print("triton.ops: not present (ok on Triton>=3.x)")
 except Exception as e:
+  print("triton: FAIL ->", e)
 PY
+echo
 echo "[BitsAndBytes (Q8/Q4)]"
+python - <<'PY'
 try:
+  import bitsandbytes as bnb
+  print("bitsandbytes:", bnb.__version__)
+  try:
+    from bitsandbytes.triton import _custom_ops as _; print("bnb.triton.int8_matmul_mixed_dequantize: OK")
+  except Exception as e:
+    print("bnb.triton: partial ->", e)
 except Exception as e:
+  print("bitsandbytes: FAIL ->", e)
 PY
+echo
 echo "[Transformers / Diffusers / XFormers]"
+python - <<'PY'
+def _v(m):
+  try:
+    mod = __import__(m)
+    print(f"{m}:", getattr(mod, "__version__", "unknown"))
+  except Exception as e:
+    print(f"{m}: FAIL -> {e}")
+for m in ("transformers","diffusers","xformers"):
+  _v(m)
 PY
+echo
 echo "[Distribuído / NCCL Env]"
+env | grep -E '^(CUDA_VISIBLE_DEVICES|NCCL_|TORCH_|ENABLE_.*SDP|HF_HUB_.*|CUDA_|NV_.*NCCL.*|PYTORCH_CUDA_ALLOC_CONF)=' | sort
+echo
+echo "[Caminhos e permissões de saída]"
+OUT="/app/output"
+echo "OUT dir: $OUT"
+mkdir -p "$OUT"
+ls -la "$OUT" || true
 echo "================= END CAPABILITIES ================="