Aduc-sdr-2_5s / start.sh
carlex3321's picture
Update start.sh
b0e5670 verified
raw
history blame
4.28 kB
#!/usr/bin/env bash
set -euo pipefail

# Startup banner.
printf '%s\n' \
  "=======================================================" \
  " ADUC-SDR — Start (VINCIE/SeedVR, 8× L40S)" \
  "======================================================="

# ---------------------- Base environment ----------------------
# Caller-overridable settings: a non-empty value already present in the
# environment wins; otherwise the default written here is assigned.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCH_DTYPE:=bfloat16}"

# SDPA / FlashAttention / xFormers toggles (also overridable).
: "${ENABLE_FLASH_SDP:=1}"
: "${ENABLE_MEMORY_EFFICIENT_SDP:=1}"
: "${ENABLE_MATH_SDP:=0}"
: "${FLASH_ATTENTION_DISABLE:=0}"
: "${XFORMERS_FORCE_DISABLE:=1}"

# Overridable CUDA connection and CPU threading knobs.
: "${CUDA_DEVICE_MAX_CONNECTIONS:=32}"
: "${OMP_NUM_THREADS:=8}"
: "${MKL_NUM_THREADS:=8}"

export CUDA_VISIBLE_DEVICES TORCH_DTYPE \
  ENABLE_FLASH_SDP ENABLE_MEMORY_EFFICIENT_SDP ENABLE_MATH_SDP \
  FLASH_ATTENTION_DISABLE XFORMERS_FORCE_DISABLE \
  CUDA_DEVICE_MAX_CONNECTIONS OMP_NUM_THREADS MKL_NUM_THREADS

# Forced CUDA / allocator settings: always overwritten, whatever the caller
# exported beforehand.
export \
  CUDA_MODULE_LOADING="LAZY" \
  CUDA_DEVICE_ORDER="PCI_BUS_ID" \
  PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:512,garbage_collection_threshold:0.8"

# Forced NCCL baseline (likewise not overridable from the outside).
export \
  NCCL_DEBUG="INFO" \
  NCCL_ASYNC_ERROR_HANDLING=1 \
  NCCL_P2P_DISABLE=0 \
  NCCL_IB_DISABLE=1 \
  NCCL_SOCKET_IFNAME="lo" \
  NCCL_BLOCKING_WAIT=1 \
  TORCH_NCCL_BLOCKING_WAIT=1 \
  NCCL_TIMEOUT=600
# ---------------------- HF/torch cache persistence ----------------------

#######################################
# Make a symlink point at a directory without ever nesting a link inside it.
# A plain `ln -sf target link` dereferences `link` when it is a directory (or
# a symlink to one) and creates `link/<basename target>` instead of replacing
# `link`; this helper avoids that.
# Arguments:
#   $1 - target directory the link should point to
#   $2 - path of the symlink to create or refresh
# Outputs:
#   A warning on stderr when $2 is a non-empty real directory.
# Returns:
#   0 when nothing had to be done or the link was left alone; otherwise the
#   exit status of ln.
#######################################
_link_cache_dir() {
  local target=$1 link=$2

  # Target and link are the same path: the directory is already in place.
  if [[ "$target" == "$link" ]]; then
    return 0
  fi

  # A real directory sits at the link path. rmdir only succeeds when it is
  # empty, so existing content is never discarded.
  if [[ -d "$link" && ! -L "$link" ]] && ! rmdir -- "$link" 2>/dev/null; then
    echo "⚠️ $link é um diretório não vazio; symlink para $target não criado." >&2
    return 0
  fi

  # -n: replace an existing symlink instead of following it into its target.
  ln -sfn -- "$target" "$link"
}

# Keep the caches on the /data volume when it exists; otherwise under /app.
if [[ -d /data ]]; then
  export HF_HOME="/data/.cache/huggingface"
  export TORCH_HOME="/data/.cache/torch"
else
  export HF_HOME="/app/.cache/huggingface"
  export TORCH_HOME="/app/.cache/torch"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE" "$TORCH_HOME"
mkdir -p /app/.cache

# Expose the selected cache at the fixed path /app/.cache/huggingface.
_link_cache_dir "$HF_HOME" /app/.cache/huggingface

unset TRANSFORMERS_CACHE
export HF_HUB_ENABLE_HF_TRANSFER=1
export HF_HUB_DOWNLOAD_TIMEOUT=60

# Model repository id and local checkpoint directory.
MODEL_REPO="ByteDance-Seed/VINCIE-3B"
CKPT_DIR="/app/ckpt/VINCIE-3B"
mkdir -p "$CKPT_DIR"
# ---------------------- Structured HF cache (persistent) ----------------------
# Place the cache on the persistent /data volume (1TB) when it exists;
# otherwise fall back to /app. Values already set in the environment are kept.
if [[ -d /data ]]; then
  export HF_HOME="${HF_HOME:-/data/.cache/huggingface}"
else
  export HF_HOME="${HF_HOME:-/app/.cache/huggingface}"
fi
export HF_HUB_CACHE="${HF_HUB_CACHE:-$HF_HOME/hub}"
mkdir -p "$HF_HUB_CACHE"
echo "📦 Cache HF: $HF_HUB_CACHE"

# Download through the structured cache (files are not duplicated), then
# expose the snapshot at the legacy checkpoint path via a symlink.
# The repo id and checkpoint path are handed to Python through the environment
# so they follow MODEL_REPO / CKPT_DIR when the script defines them and fall
# back to the same defaults otherwise.
# A failed download is reported but does not stop the script; a failed import
# of huggingface_hub does, because it happens outside the try block.
MODEL_REPO="${MODEL_REPO:-ByteDance-Seed/VINCIE-3B}" \
CKPT_DIR="${CKPT_DIR:-/app/ckpt/VINCIE-3B}" \
  python3 - <<'PY'
import os

from huggingface_hub import snapshot_download

repo_id = os.environ.get('MODEL_REPO') or 'ByteDance-Seed/VINCIE-3B'
ckpt_link = os.path.normpath(os.environ.get('CKPT_DIR') or '/app/ckpt/VINCIE-3B')
cache_dir = os.environ.get('HF_HUB_CACHE')
print(f'📥 Baixando VINCIE-3B para cache: {cache_dir}')
try:
    model_path = snapshot_download(
        repo_id=repo_id,
        cache_dir=cache_dir,   # structured cache; no local_dir, so nothing is copied out
        # NOTE(review): recent huggingface_hub releases deprecate this flag
        # (resuming is the default there) - confirm against the installed version.
        resume_download=True,
        max_workers=8,         # parallel file downloads
    )
    print(f'✅ Modelo em cache: {model_path}')

    # Legacy code expects the checkpoint at ckpt_link: point it at the snapshot.
    os.makedirs(os.path.dirname(ckpt_link) or '.', exist_ok=True)
    if os.path.islink(ckpt_link):
        # Stale link from a previous run.
        os.unlink(ckpt_link)
    elif os.path.isdir(ckpt_link) and not os.listdir(ckpt_link):
        # Empty placeholder directory (the script creates one with mkdir -p);
        # without removing it the symlink below would never be created.
        os.rmdir(ckpt_link)

    if not os.path.lexists(ckpt_link):
        os.symlink(model_path, ckpt_link)
        print(f'🔗 Symlink: {ckpt_link} -> {model_path}')
    else:
        # Real, non-empty directory or file: leave its content alone.
        print(f'⚠️ {ckpt_link} já existe e não está vazio; symlink não criado')
except Exception as e:
    # Best effort: report the failure and let the rest of the startup proceed.
    print(f'⚠️ Download falhou: {e}')
    import traceback
    traceback.print_exc()
PY
# ---------------------- Builder Apex/Q8 ----------------------

#######################################
# Decide whether the Apex/Q8 builder should run.
# Integer values keep the historic rule (only 0 enables the builder). The
# non-numeric spellings false/no/off (any case) also enable it; every other
# non-numeric value disables it. Previously a non-numeric value made the
# numeric test fail with "integer expression expected", so DISABLE_BUILDER=false
# ended up disabling the builder.
# Globals:
#   DISABLE_BUILDER (read; unset or empty is treated as 0)
# Returns:
#   0 when the builder should run, 1 when it is disabled.
#######################################
_builder_enabled() {
  local flag="${DISABLE_BUILDER:-0}"
  local rc=0

  # `[ x -eq 0 ]` yields 0 (integer zero), 1 (other integer) or 2 (not an integer).
  [ "$flag" -eq 0 ] 2>/dev/null || rc=$?
  case "$rc" in
    0) return 0 ;;
    1) return 1 ;;
  esac

  case "${flag,,}" in
    false | no | off) return 0 ;;
  esac
  return 1
}

if ! nvidia-smi >/dev/null 2>&1; then
  echo "GPU não visível, pulando builder Apex/Q8."
elif _builder_enabled; then
  echo "Executando builder Apex/Q8..."
  chmod +x /app/builder.sh || true
  # Builder failures and timeouts are tolerated; startup continues either way.
  # NOTE(review): the default timeout is 7200000 seconds - confirm this is
  # intended (7200 would be two hours).
  timeout "${BUILDER_TIMEOUT_SEC:-7200000}" bash -lc /app/builder.sh \
    || echo "Builder excedeu tempo/erro, prosseguindo."
else
  echo "Builder desabilitado por DISABLE_BUILDER=1"
fi

# Import check for flash_attn's rms_norm. The script runs under `set -e`, so a
# failed import here aborts startup.
python3 -c "from flash_attn.ops.rms_norm import rms_norm; print(rms_norm)"
# ---------------------- Diagnostics ----------------------
# Best effort: a failing or missing info script must not stop the startup.
/app/info.sh || true
#ls -la /app || true
#ls -R /app | head -n 2000 || true
# ---------------------- Starting the service ----------------------
echo "🚀 Subindo serviços..."
# Tip: VINCIE_DIRECT_TO_CKPT=1 can be exported to use the internal fallback.
# exec replaces this shell with the service process, so signals sent to the
# script's PID (e.g. SIGTERM on container stop) reach the application directly
# and its exit status becomes the script's exit status.
exec python /app/app_vince.py