carlex3321 committed
Commit 82e33b7 · verified · 1 Parent(s): e9eef92

Update scripts/download_models.py

Files changed (1)
  1. scripts/download_models.py +283 -31
scripts/download_models.py CHANGED
@@ -1,45 +1,297 @@
- # scripts/download_models.py
- import os, yaml
  from huggingface_hub import snapshot_download, login

- cfg_path = os.environ.get("CONFIG_PATH", "config.yaml")
- models_dir = os.environ.get("MODELS_DIR", "/app/models")
- hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

- if hf_token:
      try:
-         login(token=hf_token)
      except Exception:
-         pass

- os.makedirs(models_dir, exist_ok=True)

- with open(cfg_path, "r") as f:
-     data = yaml.safe_load(f)

- def collect_ids(d):
-     ids = set()
-     if isinstance(d, dict):
-         for k, v in d.items():
              if isinstance(v, dict):
                  mid = v.get("model_id")
                  if isinstance(mid, str) and mid.strip():
                      ids.add(mid.strip())
      return sorted(ids)

- # Try to collect at the root and/or under "specialists"
- model_ids = collect_ids(data)
- if not model_ids and isinstance(data.get("specialists"), dict):
-     model_ids = collect_ids(data["specialists"])
-
- for mid in model_ids:
-     local_dir = os.path.join(models_dir, mid.replace("/", "__"))
-     snapshot_download(
-         repo_id=mid,
-         local_dir=local_dir,
-         # local_dir_use_symlinks=False,
-         # resume_download=True,
-         token=hf_token
-     )
-
- print("Downloaded:", model_ids)
+ #!/usr/bin/env python3
+ import os
+ import sys
+ import json
+ import time
+ import yaml
+ import logging
+ import shutil
+ import traceback
+ from pathlib import Path
+ from typing import List, Dict, Set, Tuple
  from huggingface_hub import snapshot_download, login

+ # -------------------------
+ # Logging configuration
+ # -------------------------
+ VERBOSE = int(os.environ.get("VERBOSE", "1"))
+ LOG_LEVEL = logging.DEBUG if VERBOSE >= 2 else (logging.INFO if VERBOSE == 1 else logging.WARNING)
+ logging.basicConfig(
+     level=LOG_LEVEL,
+     format="%(asctime)s | %(levelname)s | %(message)s",
+     datefmt="%H:%M:%S",
+ )
+ log = logging.getLogger("download_models")

+ # -------------------------
+ # Environment variables
+ # -------------------------
+ CFG_PATH = os.environ.get("CONFIG_PATH", "/app/config.yaml")
+ MODELS_DIR = os.environ.get("MODELS_DIR", "/app/models")
+ HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
+ MAX_WORKERS = int(os.environ.get("MAX_WORKERS", "4"))  # reduced to save RAM
+ FAIL_FAST = os.environ.get("FAIL_FAST", "0") == "1"
+ DRY_RUN = os.environ.get("DRY_RUN", "0") == "1"
+ SKIP_EXISTING = os.environ.get("SKIP_EXISTING", "1") == "1"  # skip if the dir already exists and is not empty
+ INCLUDE_DIFFUSERS_DIRS = os.environ.get("INCLUDE_DIFFUSERS_DIRS", "1") == "1"
+ ALLOW_BIN = os.environ.get("ALLOW_BIN", "0") == "1"  # avoids heavy .bin files by default
+ EXTRA_ALLOW = [p.strip() for p in os.environ.get("EXTRA_ALLOW_PATTERNS", "").split(",") if p.strip()]
+ EXTRA_IGNORE = [p.strip() for p in os.environ.get("EXTRA_IGNORE_PATTERNS", "").split(",") if p.strip()]
+
+ # Suggested by the docs to speed things up when available
+ if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER") is None:
+     os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+ def bytes_to_human(n: int) -> str:
+     for unit in ["B", "KB", "MB", "GB", "TB"]:
+         if n < 1024:
+             return f"{n:.1f} {unit}"
+         n /= 1024
+     return f"{n:.1f} PB"
+
+ def dir_size(path: Path) -> int:
+     total = 0
+     if not path.exists():
+         return 0
+     for p in path.rglob("*"):
+         if p.is_file():
+             try:
+                 total += p.stat().st_size
+             except Exception:
+                 pass
+     return total
+
+ def disk_free(dirname: Path) -> int:
      try:
+         usage = shutil.disk_usage(dirname)
+         return usage.free
      except Exception:
+         return 0

+ def load_yaml(path: str) -> Dict:
+     with open(path, "r") as f:
+         return yaml.safe_load(f)

+ def collect_model_ids(cfg: Dict) -> List[str]:
+     ids: Set[str] = set()

+     def collect_from_section(d: Dict):
+         for _, v in d.items():
              if isinstance(v, dict):
                  mid = v.get("model_id")
                  if isinstance(mid, str) and mid.strip():
                      ids.add(mid.strip())
+
+     # Top level (as in the provided config.yaml)
+     if isinstance(cfg, dict):
+         collect_from_section(cfg)
+
+     # If a "specialists" section exists
+     if isinstance(cfg.get("specialists"), dict):
+         collect_from_section(cfg["specialists"])
+
      return sorted(ids)

+ def build_patterns() -> Tuple[List[str], List[str]]:
+     allow_patterns = [
+         # Weights and lightweight indexes
+         "*.safetensors",
+         "model.safetensors",
+         "model.safetensors.index.json",
+         "pytorch_model.bin.index.json",  # small, without pulling the large .bin files
+         # Metadata/configs
+         "*.json",
+         "config.json",
+         "generation_config.json",
+         "model_index.json",
+         "configs/*.yaml",
+         # Tokenizer
+         "tokenizer.json",
+         "tokenizer_config.json",
+         "special_tokens_map.json",
+         "vocab.json",
+         "merges.txt",
+         "*.model",  # sentencepiece
+         "tokenizer/*",
+     ]
+
+     if INCLUDE_DIFFUSERS_DIRS:
+         allow_patterns += [
+             # Typical Diffusers layout
+             "unet/*",
+             "vae/*",
+             "text_encoder/*",
+             "text_encoder_2/*",
+             "scheduler/*",
+             "feature_extractor/*",
+             "processor/*",
+             "preprocessor_config.json",
+         ]
+
+     # Optionally allow .bin (disabled by default to reduce the payload)
+     if ALLOW_BIN:
+         allow_patterns += [
+             "*.bin",  # careful: large
+         ]
+
+     # Ignore heavy/dispensable content
+     ignore_patterns = [
+         "*.h5",
+         "*.msgpack",
+         "*.onnx",
+         "*.npz",
+         "*.tar",
+         "*.zip",
+         "*.ckpt",
+         "*.pt",
+         "*.tflite",
+         "*.onnx_data",
+         "flax_model.msgpack",
+         "rust_model.ot",
+         # media and docs
+         "*.png", "*.jpg", "*.jpeg", "*.gif", "*.webp", "*.bmp", "*.svg",
+         "*.md", "README*", "LICENSE*", "docs/*", "images/*", "samples/*", "assets/*",
+         ".gitattributes", ".gitignore",
+         # heavy quantized variants (adjust as needed)
+         "int8/*", "int4/*", "fp16/*",
+     ]
+
+     # Extra patterns via environment
+     allow_patterns += EXTRA_ALLOW
+     ignore_patterns += EXTRA_IGNORE
+
+     # Deduplicate, preserving order
+     def dedup(seq):
+         seen = set()
+         out = []
+         for x in seq:
+             if x not in seen:
+                 seen.add(x)
+                 out.append(x)
+         return out
+
+     return dedup(allow_patterns), dedup(ignore_patterns)
+
+ def main():
+     start = time.time()
+     Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)
+
+     # Environment info
+     log.info(f"Config: CFG_PATH={CFG_PATH} MODELS_DIR={MODELS_DIR} MAX_WORKERS={MAX_WORKERS} DRY_RUN={DRY_RUN} FAIL_FAST={FAIL_FAST} SKIP_EXISTING={SKIP_EXISTING} VERBOSE={VERBOSE}")  # noqa
+     log.info(f"HF_HUB_ENABLE_HF_TRANSFER={os.environ.get('HF_HUB_ENABLE_HF_TRANSFER')}")  # noqa
+
+     # Optional login with token
+     if HF_TOKEN:
+         try:
+             login(token=HF_TOKEN)
+             log.info("Authenticated with the Hugging Face Hub (token provided).")
+         except Exception as e:
+             log.warning(f"HF login failed: {e}")
+
+     # Read config.yaml and collect model_ids
+     cfg = load_yaml(CFG_PATH)
+     model_ids = collect_model_ids(cfg)
+     if not model_ids:
+         log.warning("No model_id found in config.yaml; nothing to download.")
+         return
+
+     allow_patterns, ignore_patterns = build_patterns()
+     log.info("Allow patterns:")
+     for p in allow_patterns:
+         log.info(f"  - {p}")
+     log.info("Ignore patterns:")
+     for p in ignore_patterns:
+         log.info(f"  - {p}")
+
+     # Disk report
+     free_before = disk_free(Path(MODELS_DIR))
+     log.info(f"Free space before: {bytes_to_human(free_before)}")
+
+     downloaded: Dict[str, Dict] = {}
+     errors: Dict[str, str] = {}
+
+     for mid in model_ids:
+         safe_dir = mid.replace("/", "__")
+         local_dir = Path(MODELS_DIR) / safe_dir
+
+         if SKIP_EXISTING and local_dir.exists():
+             try:
+                 non_empty = any(local_dir.iterdir())
+             except Exception:
+                 non_empty = False
+             if non_empty:
+                 log.info(f"[skip] {mid} -> {local_dir} already exists and is not empty.")
+                 downloaded[mid] = {
+                     "local_dir": str(local_dir),
+                     "skipped": True,
+                     "size_bytes": dir_size(local_dir),
+                 }
+                 continue
+
+         log.info(f"[start] {mid} -> {local_dir}")
+         if DRY_RUN:
+             downloaded[mid] = {
+                 "local_dir": str(local_dir),
+                 "dry_run": True,
+                 "size_bytes": 0,
+             }
+             continue
+
+         try:
+             t0 = time.time()
+             out_dir = snapshot_download(
+                 repo_id=mid,
+                 local_dir=str(local_dir),
+                 local_dir_use_symlinks=False,
+                 resume_download=True,
+                 token=HF_TOKEN,
+                 max_workers=MAX_WORKERS,
+                 allow_patterns=allow_patterns,
+                 ignore_patterns=ignore_patterns,
+             )
+             elapsed = time.time() - t0
+             size_b = dir_size(Path(out_dir))
+             log.info(f"[done] {mid} downloaded in {elapsed:.1f}s | size {bytes_to_human(size_b)} | destination {out_dir}")
+             downloaded[mid] = {
+                 "local_dir": out_dir,
+                 "elapsed_sec": elapsed,
+                 "size_bytes": size_b,
+             }
+         except Exception as e:
+             tb = traceback.format_exc(limit=1)
+             msg = f"Error downloading {mid}: {e} | {tb}"
+             log.error(msg)
+             errors[mid] = str(e)
+             if FAIL_FAST:
+                 break
+
+     free_after = disk_free(Path(MODELS_DIR))
+     log.info(f"Free space after: {bytes_to_human(free_after)}")
+
+     report = {
+         "models_requested": model_ids,
+         "downloaded": downloaded,
+         "errors": errors,
+         "free_before": free_before,
+         "free_after": free_after,
+         "elapsed_total_sec": time.time() - start,
+         "patterns": {
+             "allow": allow_patterns,
+             "ignore": ignore_patterns,
+         },
+         "env": {
+             "MAX_WORKERS": MAX_WORKERS,
+             "ALLOW_BIN": ALLOW_BIN,
+             "INCLUDE_DIFFUSERS_DIRS": INCLUDE_DIFFUSERS_DIRS,
+             "SKIP_EXISTING": SKIP_EXISTING,
+             "DRY_RUN": DRY_RUN,
+             "VERBOSE": VERBOSE,
+         },
+     }
+     print(json.dumps(report, indent=2, ensure_ascii=False))
+
+     if errors and FAIL_FAST:
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
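
For reference, a minimal sketch of the config.yaml shape that collect_model_ids() picks up: any mapping value carrying a non-empty string "model_id", both at the top level and under "specialists". The section names and model IDs below are illustrative assumptions, not taken from the repository's actual config.yaml:

import yaml

# Hypothetical config; "text_generator" and "coder" are placeholder section
# names, and the model IDs are examples only.
SAMPLE_CFG = """
text_generator:
  model_id: gpt2
specialists:
  coder:
    model_id: bigcode/starcoderbase-1b
"""

cfg = yaml.safe_load(SAMPLE_CFG)

# Equivalent to the script's two passes: top level, then "specialists".
ids = set()
for section in (cfg, cfg.get("specialists") or {}):
    for v in section.values():
        if isinstance(v, dict):
            mid = v.get("model_id")
            if isinstance(mid, str) and mid.strip():
                ids.add(mid.strip())

print(sorted(ids))  # ['bigcode/starcoderbase-1b', 'gpt2']

With a real config in place, the new flags allow a no-download check of what would be fetched, e.g. DRY_RUN=1 CONFIG_PATH=./config.yaml python scripts/download_models.py.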