carlex3321 committed on
Commit
737853b
·
verified ·
1 Parent(s): 56f7fcf

Update builder.sh

Browse files
Files changed (1) hide show
  1. builder.sh +161 -180
builder.sh CHANGED
@@ -1,9 +1,10 @@
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
- echo "🚀 Builder (Apex + Q8 + FlashAttn + layer_norm) — runtime, GPU visível, cache persistente"
5
 
6
  # ===== Persistência e caches =====
 
7
  if [ -d /data ]; then
8
  export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
9
  export TORCH_HOME="${TORCH_HOME:-/data/.cache/torch}"
@@ -13,7 +14,10 @@ else
13
  fi
14
  export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
15
  mkdir -p "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME"
16
- mkdir -p /app/.cache && ln -sf "$HF_HOME" /app/.cache/huggingface
 
 
 
17
 
18
  # ===== Repositório de wheels no Hub =====
19
  export SELF_HF_REPO_ID="${SELF_HF_REPO_ID:-carlex3321/aduc-sdr}"
@@ -27,195 +31,174 @@ mkdir -p /app/wheels /app/cuda_cache /app/wheels/src
27
  chmod -R 777 /app/wheels || true
28
  export CUDA_CACHE_PATH="/app/cuda_cache"
29
 
30
- # ===== Dependências mínimas =====
31
- python - <<'PY'
32
- import sys, subprocess
33
- def ensure(p):
34
- try: __import__(p)
35
- except Exception: subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", p])
36
- for p in ["pip","setuptools","wheel","packaging","ninja","huggingface_hub"]:
37
- ensure(p)
38
- PY
39
 
40
- # ===== Detectar Python, Torch e CUDA =====
41
- export PY_TAG="$(python -c 'import sys;print(f"cp{sys.version_info[0]}{sys.version_info[1]}")')"
42
- export TORCH_VER="$(python - <<'PY'
43
- import torch, re
44
- v = torch.__version__.split('+')[0]
45
- print(re.sub(r'[^0-9a-zA-Z\.-]', '', v))
46
- PY
47
- )"
48
- export CU_TAG="cu$(python - <<'PY'
49
- import torch
50
- print((torch.version.cuda or "").replace(".","") or "unknown")
 
51
  PY
52
  )"
53
- echo "[env] PY_TAG=$PY_TAG TORCH_VER=$TORCH_VER CU_TAG=$CU_TAG"
54
-
55
- log() { echo "[$(date +%H:%M:%S)] $*"; }
56
-
57
- # ===== Downloader silencioso: retorna só o caminho =====
58
- install_from_hf () {
59
- # $1: prefixo do nome do pacote (.whl) a procurar
60
- local NAME="$1"
61
- echo "[hub] buscando wheel '$NAME' em ${SELF_HF_REPO_ID} (py=${PY_TAG} cu=${CU_TAG} torch=${TORCH_VER})"
62
- python - "$NAME" <<'PY' 2>/dev/null || exit 0
63
- from huggingface_hub import HfApi, HfFolder
64
- import os, re, sys
65
- name = sys.argv[1]
66
- repo = os.environ.get("SELF_HF_REPO_ID") or ""
67
- py = os.environ.get("PY_TAG") or ""
68
- cu = os.environ.get("CU_TAG") or ""
69
- tv = os.environ.get("TORCH_VER") or ""
70
- api = HfApi(token=os.getenv("HF_TOKEN") or HfFolder.get_token())
71
  try:
72
- files = api.list_repo_files(repo_id=repo, repo_type="model")
 
 
73
  except Exception:
74
- sys.exit(0)
75
- def pick(patts):
76
- c=[]
77
- for f in files:
78
- if f.endswith(".whl") and any(p.search(f) for p in patts):
79
- c.append(f)
80
- if not c: return None
81
- c.sort(reverse=True)
82
- return c[0]
83
- p_strict=[re.compile(rf"{re.escape(name)}.*{re.escape(py)}.*{re.escape(cu)}.*{re.escape(tv)}.*\.whl$")]
84
- p_relax =[re.compile(rf"{re.escape(name)}.*{re.escape(py)}.*{re.escape(cu)}.*\.whl$")]
85
- target = pick(p_strict) or pick(p_relax)
86
- if not target: sys.exit(0)
87
- local = api.hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
88
- print(local)
89
  PY
90
- }
91
-
92
 
93
  # ===== Checkers =====
94
- check_apex () { python - <<'PY'
95
- import importlib, sys
96
  try:
97
- import apex
98
- importlib.import_module("fused_layer_norm_cuda")
99
- sys.exit(0)
100
  except Exception:
101
- sys.exit(1)
 
102
  PY
103
  }
104
- check_q8 () { python - <<'PY'
105
- import importlib, sys
106
- for m in ("ltx_q8_kernels","q8_kernels"):
107
- try: importlib.import_module(m); sys.exit(0)
108
- except Exception: pass
109
- sys.exit(1)
110
  PY
111
  }
112
- check_flashattn () { python - <<'PY'
113
- import importlib, sys
 
 
 
 
 
 
 
 
 
 
 
114
  try:
115
- importlib.import_module("flash_attn._C")
116
- sys.exit(0)
117
  except Exception:
118
- sys.exit(1)
119
- PY
120
- }
121
- check_layernorm () { python - <<'PY'
122
- import importlib, sys
123
- try:
124
- importlib.import_module("layer_norm_cuda")
125
- sys.exit(0)
126
- except Exception:
127
- sys.exit(1)
128
  PY
129
  }
130
 
131
- # ===== Apex =====
132
- APEX_REPO="${APEX_REPO:-https://github.com/NVIDIA/apex.git}"
133
- APEX_COMMIT="${APEX_COMMIT:-master}"
134
  build_apex () {
135
  local SRC="/app/wheels/src/apex"
136
- rm -rf "$SRC"
137
- git clone --filter=blob:none "$APEX_REPO" "$SRC"
138
- git -C "$SRC" checkout "$APEX_COMMIT" || true
139
- export MAX_JOBS="${MAX_JOBS:-4}" APEX_CPP_EXT=1 APEX_CUDA_EXT=1
140
- log "[build] apex MAX_JOBS=$MAX_JOBS"
 
 
 
 
 
 
 
141
  python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
 
142
  local W="$(ls -t /app/wheels/apex-*.whl 2>/dev/null | head -n1 || true)"
143
- if [ -n "$W" ]; then python -m pip install -U --no-deps --no-build-isolation "$W" || true
144
- else (cd "$SRC" && python -m pip install -U --no-build-isolation .) || true
145
- fi
146
- }
147
- ensure_apex () {
148
- log "[flow] === apex ==="
149
- if check_apex; then log "[flow] apex OK"; return 0; fi
150
- local HFW="$(install_from_hf "apex" || true)"
151
- if [ -n "${HFW:-}" ]; then
152
- python -m pip install -U --no-build-isolation "$HFW" || true
153
- if check_apex; then log "[flow] apex via Hub OK"; return 0; fi
154
  fi
155
- build_apex
156
- if check_apex; then log "[flow] apex build OK"; return 0; fi
157
- log "[flow] apex falhou"; return 1
158
  }
159
 
160
- # ===== Q8 =====
161
- Q8_REPO="${Q8_REPO:-https://github.com/lynix94/llm-q8-kernels.git}"
162
- Q8_COMMIT="${Q8_COMMIT:-main}"
163
  build_q8 () {
164
- local SRC="/app/wheels/src/q8"
165
  rm -rf "$SRC"
166
  git clone --filter=blob:none "$Q8_REPO" "$SRC"
167
- git -C "$SRC" checkout "$Q8_COMMIT" || true
168
- export MAX_JOBS="${MAX_JOBS:-4}"
169
- log "[build] q8 — MAX_JOBS=$MAX_JOBS"
170
- python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
171
- local W="$(ls -t /app/wheels/*q8*kernels*-*.whl /app/wheels/*q8*kernel*-*.whl 2>/dev/null | head -n1 || true)"
172
- if [ -n "$W" ]; then python -m pip install -U --no-deps --no-build-isolation "$W" || true
173
- else (cd "$SRC" && python -m pip install -U --no-build-isolation .) || true
 
 
 
 
 
 
174
  fi
175
  }
176
- ensure_q8 () {
177
- log "[flow] === q8 ==="
178
- if check_q8; then log "[flow] q8 OK"; return 0; fi
179
- local HFW="$(install_from_hf "q8" || true)"; [ -z "${HFW:-}" ] && HFW="$(install_from_hf "kernels" || true)"
180
- if [ -n "${HFW:-}" ]; then
181
- python -m pip install -U --no-build-isolation "$HFW" || true
182
- if check_q8; then log "[flow] q8 via Hub OK"; return 0; fi
 
 
 
 
183
  fi
184
- build_q8
185
- if check_q8; then log "[flow] q8 build OK"; return 0; fi
186
- log "[flow] q8 falhou"; return 1
187
- }
188
 
189
- # ===== FlashAttention =====
190
- FLASHATTN_REPO="${FLASHATTN_REPO:-https://github.com/Dao-AILab/flash-attention.git}"
191
- FLASHATTN_REF="${FLASHATTN_REF:-v2.8.3}"
192
- build_flashattn_repo () {
193
- local SRC="/app/wheels/src/flash-attention"
194
- rm -rf "$SRC"
195
- git clone --filter=blob:none "$FLASHATTN_REPO" "$SRC"
196
- git -C "$SRC" checkout "$FLASHATTN_REF" || true
197
- git -C "$SRC" submodule update --init --recursive || true
198
- export MAX_JOBS="${MAX_JOBS:-4}"
199
- log "[build] flash-attn MAX_JOBS=$MAX_JOBS"
200
- python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
201
- local W="$(ls -t /app/wheels/flash_attn-*.whl 2>/dev/null | head -n1 || true)"
202
- if [ -n "$W" ]; then python -m pip install -U --no-deps --no-build-isolation "$W" || true
203
- else (cd "$SRC" && python -m pip install -U --no-build-isolation .) || true
204
  fi
205
- }
206
- ensure_flashattn () {
207
- log "[flow] === flash-attn ==="
208
- if check_flashattn; then log "[flow] flash-attn OK"; return 0; fi
209
- local HF1="$(install_from_hf "flash_attn" || true)"
210
- if [ -n "${HF1:-}" ]; then
211
- python -m pip install -U --no-build-isolation "$HF1" || true
212
- if check_flashattn; then log "[flow] flash-attn via Hub OK"; return 0; fi
213
  fi
214
- build_flashattn_repo
215
- if check_flashattn; then log "[flow] flash-attn build OK"; return 0; fi
216
- log "[flow] flash-attn falhou"; return 1
217
  }
218
 
 
219
  # ===== layer_norm (wheel separada) =====
220
  check_layernorm () { python - <<'PY'
221
  import importlib, sys
@@ -281,34 +264,32 @@ ensure_layernorm () {
281
  log "[flow] layer_norm falhou"; return 1
282
  }
283
 
284
- # ===== Execução =====
285
- ensure_apex || true
286
- ensure_q8 || true
287
- ensure_flashattn || true
288
- ensure_layernorm || true
289
 
290
 
291
- # ===== Upload para HF =====
292
- echo "[hub] upload de wheels para $SELF_HF_REPO_ID"
 
 
 
 
 
293
  python - <<'PY'
294
- import os, glob, json, time
295
- from huggingface_hub import HfApi, HfFolder, upload_file
296
- repo = os.environ.get("SELF_HF_REPO_ID")
297
- token = HfFolder.get_token()
298
- if not (repo and token):
299
- print("[hub] token/repo ausentes; pulando upload"); raise SystemExit(0)
300
- paths = sorted(glob.glob("/app/wheels/*.whl"))
301
- py = os.environ.get("PY_TAG"); cu = os.environ.get("CU_TAG"); torchv = os.environ.get("TORCH_VER")
302
- stamp = time.strftime("%Y%m%d-%H%M%S")
303
- for p in paths:
304
- dest = f"wheels/{py}-{cu}-torch{torchv}/" + os.path.basename(p)
305
- upload_file(path_or_fileobj=p, path_in_repo=dest, repo_id=repo, repo_type="model")
306
- print("[hub] uploaded:", dest)
307
- meta = {"py": py, "cuda": cu, "torch": torchv, "timestamp": stamp, "files": [os.path.basename(x) for x in paths]}
308
- upload_file(path_or_fileobj=bytes(json.dumps(meta, indent=2), "utf-8"),
309
- path_in_repo=f"wheels/{py}-{cu}-torch{torchv}/manifest-{stamp}.json",
310
- repo_id=repo, repo_type="model")
311
- print("[hub] manifest enviado")
312
  PY
313
 
314
- echo "[done] builder-10 concluído"
 
 
1
  #!/usr/bin/env bash
2
  set -euo pipefail
3
 
4
+ echo "🚀 Builder (Apex + Q8) — runtime, GPU visível, cache persistente"
5
 
6
  # ===== Persistência e caches =====
7
+ # Prioriza /data (HF Spaces) e mantém compatibilidade com /app
8
  if [ -d /data ]; then
9
  export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
10
  export TORCH_HOME="${TORCH_HOME:-/data/.cache/torch}"
 
14
  fi
15
  export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
16
  mkdir -p "$HF_HOME" "$HF_HUB_CACHE" "$TORCH_HOME"
17
+
18
+ # Symlink de compatibilidade (se scripts esperarem /app/.cache/huggingface)
19
+ mkdir -p /app/.cache
20
+ ln -sf "$HF_HOME" /app/.cache/huggingface
21
 
22
  # ===== Repositório de wheels no Hub =====
23
  export SELF_HF_REPO_ID="${SELF_HF_REPO_ID:-carlex3321/aduc-sdr}"
 
31
  chmod -R 777 /app/wheels || true
32
  export CUDA_CACHE_PATH="/app/cuda_cache"
33
 
34
+ # Licença (NVIDIA NGC) se presente
35
+ if [ -f "/NGC-DL-CONTAINER-LICENSE" ]; then
36
+ cp -f /NGC-DL-CONTAINER-LICENSE /app/wheels/NGC-DL-CONTAINER-LICENSE || true
37
+ fi
 
 
 
 
 
38
 
39
+ # ===== Dependências mínimas de build =====
40
+ python -m pip install -v -U pip build setuptools wheel hatchling hatch-vcs scikit-build-core cmake ninja packaging "huggingface_hub[hf_transfer]" || true
41
+
42
+ # ===== Tags de ambiente (Python/CUDA/Torch) =====
43
+ PY_TAG="$(python -c 'import sys; print(f"cp{sys.version_info[0]}{sys.version_info[1]}")' 2>/dev/null || echo cp310)"
44
+ TORCH_VER="$(python - <<'PY'
45
+ try:
46
+ import torch, re
47
+ v = torch.__version__
48
+ print(re.sub(r'\+.*$', '', v))
49
+ except Exception:
50
+ print("unknown")
51
  PY
52
  )"
53
+ CU_TAG="$(python - <<'PY'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  try:
55
+ import torch
56
+ cu = getattr(torch.version, "cuda", None)
57
+ print("cu"+cu.replace(".","")) if cu else print("")
58
  except Exception:
59
+ print("")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  PY
61
+ )"
62
+ echo "[env] PY_TAG=${PY_TAG} TORCH_VER=${TORCH_VER} CU_TAG=${CU_TAG}"
63
 
64
  # ===== Checkers =====
65
+ check_apex() {
66
+ python - <<'PY'
67
  try:
68
+ from apex.normalization import FusedLayerNorm, FusedRMSNorm
69
+ import importlib; importlib.import_module("fused_layer_norm_cuda")
70
+ ok = True
71
  except Exception:
72
+ ok = False
73
+ raise SystemExit(0 if ok else 1)
74
  PY
75
  }
76
+
77
+ check_q8() {
78
+ python - <<'PY'
79
+ import importlib.util
80
+ spec = importlib.util.find_spec("ltx_q8_kernels") or importlib.util.find_spec("q8_kernels")
81
+ raise SystemExit(0 if spec else 1)
82
  PY
83
  }
84
+
85
+ # ===== Download de wheels do Hub =====
86
+ install_from_hf () {
87
+ local PKG="$1" # 'apex' ou 'q8_kernels'
88
+ echo "[hub] Buscando wheel de ${PKG} em ${SELF_HF_REPO_ID} (py=${PY_TAG}, cu=${CU_TAG})"
89
+ python - "$PKG" "$PY_TAG" "$CU_TAG" <<'PY' || exit 0
90
+ import os, sys
91
+ from huggingface_hub import HfApi, hf_hub_download, HfFolder
92
+
93
+ pkg, py_tag, cu_tag = sys.argv[1], sys.argv[2], sys.argv[3]
94
+ repo = os.environ.get("SELF_HF_REPO_ID","carlex3321/aduc-sdr")
95
+ api = HfApi(token=os.getenv("HF_TOKEN") or HfFolder.get_token())
96
+
97
  try:
98
+ files = api.list_repo_files(repo_id=repo, repo_type="model")
 
99
  except Exception:
100
+ raise SystemExit(0)
101
+
102
+ cands = [f for f in files if f.endswith(".whl") and f.rsplit("/",1)[-1].startswith(pkg+"-") and py_tag in f]
103
+ pref = [f for f in cands if cu_tag and cu_tag in f] or cands
104
+ if not pref:
105
+ raise SystemExit(0)
106
+ target = sorted(pref, reverse=True)[0]
107
+ print(target)
108
+ path = hf_hub_download(repo_id=repo, filename=target, repo_type="model", local_dir="/app/wheels")
109
+ print(path)
110
  PY
111
  }
112
 
113
+ # ===== Builders =====
 
 
114
  build_apex () {
115
  local SRC="/app/wheels/src/apex"
116
+ echo "[build] Fonte Apex em ${SRC}"
117
+ if [ -d "$SRC/.git" ]; then
118
+ git -C "$SRC" fetch --all -p || true
119
+ git -C "$SRC" reset --hard HEAD || true
120
+ git -C "$SRC" clean -fdx || true
121
+ else
122
+ rm -rf "$SRC"
123
+ git clone --depth 1 https://github.com/NVIDIA/apex "$SRC"
124
+ fi
125
+
126
+ echo "[build] Compilando Apex -> wheel"
127
+ export APEX_CPP_EXT=1 APEX_CUDA_EXT=1 APEX_ALL_CONTRIB_EXT=0
128
  python -m pip wheel --no-build-isolation --no-deps "$SRC" -w /app/wheels || true
129
+
130
  local W="$(ls -t /app/wheels/apex-*.whl 2>/dev/null | head -n1 || true)"
131
+ if [ -n "${W}" ]; then
132
+ python -m pip install -U --no-deps "${W}" || true
133
+ echo "[build] Apex instalado da wheel: ${W}"
134
+ else
135
+ echo "[build] Nenhuma wheel Apex gerada; instalando do source"
136
+ python -m pip install --no-build-isolation "$SRC" || true
 
 
 
 
 
137
  fi
 
 
 
138
  }
139
 
140
+ Q8_REPO="${Q8_REPO:-https://github.com/Lightricks/LTX-Video-Q8-Kernels.git}"
141
+ Q8_COMMIT="${Q8_COMMIT:-f3066edea210082799ca5a2bbf9ef0321c5dd8fc}"
 
142
  build_q8 () {
143
+ local SRC="/app/wheels/src/q8_kernels"
144
  rm -rf "$SRC"
145
  git clone --filter=blob:none "$Q8_REPO" "$SRC"
146
+ git -C "$SRC" checkout "$Q8_COMMIT"
147
+ git -C "$SRC" submodule update --init --recursive
148
+
149
+ echo "[build] Compilando Q8 Kernels -> wheel"
150
+ python -m pip wheel --no-build-isolation "$SRC" -w /app/wheels || true
151
+
152
+ local W="$(ls -t /app/wheels/q8_kernels-*.whl 2>/dev/null | head -n1 || true)"
153
+ if [ -n "${W}" ]; then
154
+ python -m pip install -U --no-deps "${W}" || true
155
+ echo "[build] Q8 instalado da wheel: ${W}"
156
+ else
157
+ echo "[build] Nenhuma wheel q8_kernels gerada; instalando do source"
158
+ python -m pip install --no-build-isolation "$SRC" || true
159
  fi
160
  }
161
+
162
+ # ===== Pipeline genérico =====
163
+ ensure_pkg () {
164
+ local PKG="$1" # apex | q8_kernels
165
+ local CHECK_FN="$2" # check_apex | check_q8
166
+ local BUILD_FN="$3" # build_apex | build_q8
167
+ echo "[flow] === ${PKG} ==="
168
+
169
+ if ${CHECK_FN}; then
170
+ echo "[flow] ${PKG}: já instalado (import OK)"
171
+ return 0
172
  fi
 
 
 
 
173
 
174
+ echo "[flow] ${PKG}: tentando wheel do Hub (${SELF_HF_REPO_ID})"
175
+ HF_OUT="$(install_from_hf "$PKG" || true)"
176
+ if [ -n "${HF_OUT:-}" ]; then
177
+ WHEEL_PATH="$(printf "%s\n" "${HF_OUT}" | tail -n1)"
178
+ echo "[hub] Baixado: ${WHEEL_PATH}"
179
+ python -m pip install -U --no-build-isolation "${WHEEL_PATH}" || true
180
+ if ${CHECK_FN}; then
181
+ echo "[flow] ${PKG}: sucesso via Hub (${WHEEL_PATH})"
182
+ return 0
183
+ else
184
+ echo "[flow] ${PKG}: import falhou após wheel do Hub; compilando"
185
+ fi
186
+ else
187
+ echo "[hub] Nenhuma wheel compatível encontrada para ${PKG}"
 
188
  fi
189
+
190
+ echo "[flow] ${PKG}: compilando (fallback)"
191
+ ${BUILD_FN}
192
+ if ${CHECK_FN}; then
193
+ echo "[flow] ${PKG}: sucesso após compilação"
194
+ return 0
 
 
195
  fi
196
+
197
+ echo "[flow] ${PKG}: falhou após build; seguindo adiante"
198
+ return 1
199
  }
200
 
201
+
202
  # ===== layer_norm (wheel separada) =====
203
  check_layernorm () { python - <<'PY'
204
  import importlib, sys
 
264
  log "[flow] layer_norm falhou"; return 1
265
  }
266
 
 
 
 
 
 
267
 
268
 
269
+ # ===== Execução: Apex e Q8 =====
270
+
271
+ ensure_pkg "apex" check_apex build_apex || true
272
+ ensure_pkg "q8_kernels" check_q8 build_q8 || true
273
+ ensure_layernorm || true
274
+
275
+ # ===== Upload das wheels geradas (opcional) =====
276
  python - <<'PY'
277
+ import os
278
+ from huggingface_hub import HfApi, HfFolder
279
+ repo=os.environ.get("SELF_HF_REPO_ID","carlex3321/aduc-sdr")
280
+ token=os.getenv("HF_TOKEN") or HfFolder.get_token()
281
+ if not token:
282
+ raise SystemExit("HF_TOKEN ausente; upload desabilitado")
283
+ api=HfApi(token=token)
284
+ api.upload_folder(
285
+ folder_path="/app/wheels",
286
+ repo_id=repo,
287
+ repo_type="model",
288
+ allow_patterns=["*.whl","NGC-DL-CONTAINER-LICENSE"],
289
+ ignore_patterns=["**/src/**","**/*.log","**/logs/**",".git/**"],
290
+ )
291
+ print("Upload concluído (wheels + licença).")
 
 
 
292
  PY
293
 
294
+ chmod -R 777 /app/wheels || true
295
+ echo "✅ Builder finalizado."