# syntax=docker/dockerfile:1
# =============================================================================
# DOCKERFILE - Complete AI Video Suite v2.0.0
# Optimized for 8x NVIDIA L40S GPUs (384GB Total VRAM)
# Production-Ready Multi-GPU Video Generation Suite
#
# The syntax directive on the first line pins a Dockerfile frontend that
# understands the heredoc (`RUN cat <<'EOF' ...`) steps used further down.
# Build with BuildKit enabled.
# =============================================================================

# CUDA 12.8.0 "devel" base image on Ubuntu 22.04.
FROM nvidia/cuda:12.8.0-devel-ubuntu22.04

# =============================================================================
# METADATA AND LABELS
# =============================================================================

# Image metadata. cuda_version must track the CUDA release in the FROM tag
# (12.8.0); it previously advertised 12.4.0.
LABEL maintainer="Complete AI Video Suite Team" \
      description="Multi-GPU AI Video Generation Suite with LTX FP8, Q8 Kernels, SeedVR, Wan2.2, VINCIE, MMAudio" \
      version="2.0.0" \
      build_date="2025-09-18" \
      cuda_version="12.8.0" \
      python_version="3.10" \
      pytorch_version="2.8.0+cu128" \
      architecture="amd64" \
      gpu_optimized="8x_L40S" \
      total_vram="384GB" \
      license="MIT"


# =============================================================================
# ENVIRONMENT VARIABLES - PRODUCTION OPTIMIZED
# =============================================================================

# Build-time only: keeps apt/debconf from prompting during `docker build`.
# Declared as ARG (visible to every later RUN in this stage) so the setting is
# not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# Timezone and locale defaults for the running container.
ENV TZ=UTC \
    LC_ALL=C.UTF-8 \
    LANG=C.UTF-8

# Python / pip behaviour:
#   - unbuffered stdout/stderr so logs show up immediately
#   - no .pyc files written
#   - UTF-8 for stdio
#   - pip keeps no download cache in the image
#   - pip's "new version available" check is disabled (the variable was
#     previously set to 0, which leaves the check enabled)
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONIOENCODING=utf-8 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# GPU runtime and CUDA settings for the 8x L40S target.
# TORCH_CUDA_ARCH_LIST restricts extension builds to compute capability 8.9;
# CUDA_CACHE_MAXSIZE is 2147483648 bytes (2 GiB).
ENV NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility,graphics \
    NVIDIA_REQUIRE_CUDA="cuda>=12.8" \
    CUDA_LAUNCH_BLOCKING=0 \
    TORCH_CUDA_ARCH_LIST="8.9" \
    CUDA_CACHE_MAXSIZE=2147483648

# Multi-GPU communication (NCCL).
# NCCL_DEBUG accepts VERSION, WARN, INFO or TRACE; the previous value "DEBUG"
# is not one of them. WARN reports NCCL errors without flooding the logs;
# override with `-e NCCL_DEBUG=INFO` when diagnosing communication problems.
ENV NCCL_DEBUG=WARN \
    NCCL_TREE_THRESHOLD=1 \
    NCCL_P2P_DISABLE=0 \
    NCCL_IB_DISABLE=0 \
    NCCL_NVLS_ENABLE=1 \
    NCCL_CROSS_NIC=1

# PyTorch allocator and backend switches.
# NOTE(review): the TORCH_BACKENDS_* names mirror torch.backends.* attributes;
# confirm the application reads them itself, as they may not be picked up
# automatically.
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512,roundup_power2_divisions:16 \
    TORCH_BACKENDS_CUDNN_BENCHMARK=1 \
    TORCH_BACKENDS_CUDA_MATMUL_ALLOW_TF32=1 \
    TORCH_BACKENDS_CUDNN_ALLOW_TF32=1

# Filesystem layout: application root, model/download caches, scratch space
# and generated output all live under /app.
ENV APP_HOME=/app \
    HF_HOME=/app/model_cache \
    HF_HUB_CACHE=/app/model_cache/hub \
    TRANSFORMERS_CACHE=/app/model_cache/transformers \
    TORCH_HOME=/app/model_cache/torch \
    TMPDIR=/app/tmp \
    OUTPUT_DIR=/app/outputs

# Cap the CPU thread pools of OpenMP, MKL, NumExpr and OpenBLAS at 8 threads each.
ENV OMP_NUM_THREADS=8 \
    MKL_NUM_THREADS=8 \
    NUMEXPR_NUM_THREADS=8 \
    OPENBLAS_NUM_THREADS=8

# =============================================================================
# SYSTEM PACKAGE INSTALLATION
# =============================================================================

# System packages (sorted alphabetically for diffability).
#
# The `python` / `python3` links created after this step point at the
# distribution's Python 3.10, and pip builds native extensions against that
# interpreter, so its headers (python3-dev) are requested explicitly instead of
# relying on them arriving as another package's recommended dependency.
# NOTE(review): the python3.11* packages are kept for compatibility, but nothing
# visible in this Dockerfile invokes python3.11 - confirm whether they are
# still needed.
#
# apt lists and temp files are removed in the same layer so they never reach
# the image.
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    curl \
    ffmpeg \
    git \
    git-lfs \
    htop \
    iotop \
    iproute2 \
    libavcodec-dev \
    libavformat-dev \
    libavutil-dev \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libglu1-mesa \
    libglu1-mesa-dev \
    libgomp1 \
    libsm6 \
    libswscale-dev \
    libxext6 \
    libxrender-dev \
    nano \
    net-tools \
    ninja-build \
    nvtop \
    pkg-config \
    python3-dev \
    python3-pip \
    python3.11 \
    python3.11-dev \
    python3.11-distutils \
    python3.11-venv \
    rsync \
    screen \
    tmux \
    tree \
    unzip \
    vim \
    wget \
    zip \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && rm -rf /tmp/* \
    && rm -rf /var/tmp/*

# =============================================================================
# PYTHON SETUP AND OPTIMIZATION
# =============================================================================

# Point both `python3` and `python` at the 3.10 interpreter, then pin the
# packaging toolchain (pip, setuptools, wheel) to fixed versions.
RUN set -e; \
    for link_name in python3 python; do \
        ln -sf /usr/bin/python3.10 "/usr/bin/${link_name}"; \
    done; \
    python3 -m pip install --upgrade \
        pip==24.2 \
        setuptools==70.0.0 \
        wheel==0.43.0

# Build helpers needed to compile native extensions.
# The version specifier is quoted: unquoted, the shell treats `>=1.24.3` as an
# output redirection (creating a file named "=1.24.3") and pip sees a bare,
# unconstrained `numpy`.
RUN pip install \
    packaging \
    ninja \
    cmake \
    pybind11 \
    scikit-build \
    cython \
    "numpy>=1.24.3"

# =============================================================================
# PYTORCH AND CUDA LIBRARIES
# =============================================================================

# PyTorch stack from the CUDA 12.8 wheel index.
# - The specifier is quoted so the shell does not parse `>=...` as a redirection
#   (which silently dropped the version constraint).
# - The `+cu128` local version label is omitted: ordered comparisons such as
#   `>=` do not accept local version labels, and the CUDA 12.8 build is already
#   selected by --index-url.
RUN pip install \
    "torch>=2.8.0" \
    torchvision \
    torchaudio \
    --index-url https://download.pytorch.org/whl/cu128

# torchao is installed from the default package index, after the torch wheels.
RUN python3 -m pip install torchao

# Build-time sanity check: import torch and print its version and CUDA view.
# The step fails the build only if the import (or one of the calls) raises.
RUN python3 - <<'PYTORCH_CHECK'
import torch

print(f'PyTorch: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
print(f'CUDA version: {torch.version.cuda}')
print(f'Device count: {torch.cuda.device_count()}')
PYTORCH_CHECK


# =============================================================================
# AI/ML LIBRARIES INSTALLATION
# =============================================================================

WORKDIR $APP_HOME

# Copy only the dependency manifest before installing, so this (slow) pip layer
# is rebuilt when requirements.txt changes rather than on every source edit.
# The full source tree is copied by the COPY step further down.
# NOTE(review): if requirements.txt references local paths (e.g. `-e .` or
# bundled wheels), copy those paths here as well.
COPY requirements.txt ./

RUN pip install -r requirements.txt

# =============================================================================
# APPLICATION STRUCTURE SETUP
# =============================================================================

# Create the application directory skeleton under $APP_HOME, then make the
# whole tree mode 755.
RUN set -e; \
    for subdir in \
        installer \
        monitoring \
        tools \
        configs \
        build_cache \
        model_cache/hub \
        model_cache/transformers \
        model_cache/torch \
        model_cache/ltx_models \
        tmp \
        outputs \
        logs \
    ; do \
        mkdir -p "$APP_HOME/$subdir"; \
    done; \
    chmod -R 755 "$APP_HOME"

# =============================================================================
# COPY APPLICATION SOURCES (nothing is downloaded in this section)
# =============================================================================


# Copy the build context into the current WORKDIR ($APP_HOME).
# `COPY . .` already brings in configs/, so the former separate
# `COPY configs/ ./configs/` step was redundant and has been dropped.
COPY . .


# Mark the launcher and every shell/Python script in the tree as executable.
# The explicit start.sh chmod makes the build fail early if the launcher is
# missing from the build context.
RUN chmod +x start.sh \
    && find . \( -name "*.sh" -o -name "*.py" \) -exec chmod +x {} \;

# =============================================================================
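# CREATE OPTIMIZATION PATCHES AND TOOLS (FIXED SYNTAX)
# =============================================================================
# NOTE(review): this section contains no build steps. The entrypoint script
# runs /app/tools/optimization_patch.py, so that file has to be supplied by the
# build context (COPY) - confirm it exists there.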


# =============================================================================
# CONFIGURATION FILES
# =============================================================================

# Create default LTX FP8 configuration.
# Writes $APP_HOME/configs/ltxv-13b-0.9.8-distilled-fp8.yaml from the heredoc
# below. The delimiter is quoted ('YAML_CONFIG'), so the shell performs no
# variable or command expansion on the payload; it is written verbatim and
# replaces any file of the same name copied from the build context.
# NOTE(review): confirm that the `target:` class paths in the payload resolve
# against the library versions actually installed in this image.
RUN cat <<'YAML_CONFIG' > $APP_HOME/configs/ltxv-13b-0.9.8-distilled-fp8.yaml
# LTX Video FP8 Distilled Configuration
# Optimized for 8x L40S GPUs (384GB VRAM)
model:
  target: "ltx_video.models.transformer_temporal.TransformerTemporalModel"
  params:
    transformer_additional_kwargs:
      attention_mode: "sdpa"
      enable_flash_attention: true
      memory_efficient_attention: true
    network_config:
      model_name: "ltxv-13b-0.9.8-distilled-fp8"
      fp8_optimization: true
      quantization: "fp8"
      ada_optimized: true
      multi_gpu_support: true

scheduler:
  target: "diffusers.LTXVideoScheduler"
  params:
    num_train_timesteps: 1000
    beta_start: 0.0001
    beta_end: 0.02
    beta_schedule: "scaled_linear"

vae:
  target: "diffusers.AutoencoderKLLTXVideo"
  params:
    force_upcast: false
    enable_slicing: true
    enable_tiling: true

text_encoder:
  target: "transformers.T5EncoderModel"
  params:
    torch_dtype: "bfloat16"

pipeline:
  target: "diffusers.LTXVideoPipeline"
  params:
    scheduler_type: "LTXVideoScheduler"
    num_inference_steps: 4
    guidance_scale: 1.0
    height: 704
    width: 1216
    num_frames: 121
    fps: 30
    enable_memory_efficient_attention: true
    enable_cpu_offload: false
    enable_model_cpu_offload: false
    max_batch_size: 4

multi_gpu:
  enabled: true
  num_gpus: 8
  distribution_strategy: "data_parallel"
  load_balancing: "memory_aware"
  synchronize_gpus: true
YAML_CONFIG

# Create multi-GPU optimization config.
# Writes $APP_HOME/configs/multi_gpu_config.yaml from the heredoc below. The
# quoted delimiter ('GPU_CONFIG') disables shell expansion, so the payload is
# written verbatim and replaces any file of the same name copied from the
# build context.
# NOTE(review): world_size / rank are static values here; presumably the
# launcher overrides them per process - verify against the consumer.
RUN cat <<'GPU_CONFIG' > $APP_HOME/configs/multi_gpu_config.yaml
# Multi-GPU Configuration for 8x L40S Setup
system:
  gpu_count: 8
  total_vram: "384GB"
  compute_capability: "8.9"
  architecture: "ADA_LOVELACE"

distributed_training:
  backend: "nccl"
  init_method: "env://"
  world_size: 8
  rank: 0

memory_optimization:
  gradient_checkpointing: true
  mixed_precision: "bf16"
  max_batch_size_per_gpu: 8
  gradient_accumulation_steps: 4
  memory_fraction: 0.95

performance:
  torch_compile: true
  cuda_graphs: true
  tensor_cores: true
  flash_attention: true
  memory_efficient_attention: true

load_balancing:
  strategy: "memory_aware"
  rebalance_interval: 30
  utilization_threshold: 0.8

thermal_management:
  max_temperature: 83
  fan_curve: "aggressive"
  throttle_threshold: 80
  monitoring_interval: 10

power_management:
  max_power_limit: 300
  efficiency_mode: false
  power_monitoring: true
GPU_CONFIG

# =============================================================================
# HEALTH CHECK SCRIPT
# =============================================================================

# Generate /app/healthcheck.py (run by the HEALTHCHECK instruction).
# The HTTP probe uses the standard library (urllib) rather than `requests`,
# so the check does not depend on a third-party package that no step in this
# Dockerfile installs explicitly. The quoted heredoc delimiter keeps the shell
# from expanding anything inside the Python source.
RUN cat <<'HEALTHCHECK_SCRIPT' > $APP_HOME/healthcheck.py
#!/usr/bin/env python3
"""
Health check script for Complete AI Video Suite

Exits 0 when every visible GPU completes a small matmul and the web service
on localhost:7860 answers with HTTP 200; exits 1 otherwise.
"""
import http.client
import logging
import sys
import urllib.error
import urllib.request

import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

WEB_SERVICE_URL = "http://localhost:7860/"
WEB_SERVICE_TIMEOUT = 10  # seconds


def check_cuda():
    """Check CUDA availability and GPU status.

    Returns True when CUDA is available and a small matmul succeeds on every
    device; logs the failure and returns False otherwise.
    """
    if not torch.cuda.is_available():
        logger.error("CUDA not available")
        return False

    gpu_count = torch.cuda.device_count()
    logger.info(f"CUDA available with {gpu_count} GPUs")

    for i in range(gpu_count):
        try:
            torch.cuda.set_device(i)
            props = torch.cuda.get_device_properties(i)
            # memory_allocated() covers this process's tensors on the current
            # device only, not the usage of other processes on the GPU.
            memory_allocated = torch.cuda.memory_allocated() / 1024**3
            memory_total = props.total_memory / 1024**3

            logger.info(f"GPU {i}: {props.name} ({memory_allocated:.2f}GB/{memory_total:.1f}GB)")

            # Small end-to-end compute test; synchronize so that asynchronous
            # CUDA errors surface inside this try block.
            x = torch.randn(100, 100, device=f'cuda:{i}')
            torch.matmul(x, x)
            torch.cuda.synchronize()

        except Exception as e:
            logger.error(f"GPU {i} test failed: {e}")
            return False

    return True


def check_web_service():
    """Check if web service is responding.

    Returns True only for an HTTP 200 answer from WEB_SERVICE_URL.
    """
    try:
        with urllib.request.urlopen(WEB_SERVICE_URL, timeout=WEB_SERVICE_TIMEOUT) as response:
            status_code = response.status
    except urllib.error.HTTPError as e:
        # urllib raises for 4xx/5xx answers; report them like any non-200 status.
        logger.error(f"Web service returned status code: {e.code}")
        return False
    except (OSError, http.client.HTTPException) as e:
        # OSError covers URLError, refused connections and socket timeouts.
        logger.error(f"Web service check failed: {e}")
        return False

    if status_code == 200:
        logger.info("Web service is responding")
        return True

    logger.error(f"Web service returned status code: {status_code}")
    return False


def main():
    """Main health check routine"""
    logger.info("Starting health check...")

    if not check_cuda():
        sys.exit(1)

    if not check_web_service():
        sys.exit(1)

    logger.info("All health checks passed")
    sys.exit(0)


if __name__ == "__main__":
    main()
HEALTHCHECK_SCRIPT

RUN chmod +x $APP_HOME/healthcheck.py

# =============================================================================
# USER SETUP AND SECURITY
# =============================================================================

# Create the unprivileged runtime user (UID 1000) and hand it the app tree.
#
# Least privilege:
#   - appuser is no longer added to the `sudo` group, which would grant blanket
#     sudo rights beyond the whitelist below;
#   - the sudoers rule whitelists only nvidia-smi; the former
#     /usr/bin/nvidia-ml-py entry was removed (nvidia-ml-py is a Python
#     package name, not an executable at that path);
#   - the sudoers drop-in gets the conventional 0440 mode.
# NOTE(review): no visible step installs the `sudo` package, so this rule only
# takes effect if sudo is added elsewhere - confirm whether it is needed at all.
RUN useradd -m -u 1000 -s /bin/bash appuser && \
    chown -R appuser:appuser $APP_HOME && \
    mkdir -p /etc/sudoers.d && \
    echo "appuser ALL=(ALL) NOPASSWD: /usr/bin/nvidia-smi" > /etc/sudoers.d/appuser && \
    chmod 0440 /etc/sudoers.d/appuser

# Everything from here on - remaining build steps and the running container -
# executes as the unprivileged user, inside the application directory.
USER appuser
WORKDIR ${APP_HOME}

# =============================================================================
# RUNTIME CONFIGURATION
# =============================================================================

# Documented container ports. 7860 is the port probed by /app/healthcheck.py;
# NOTE(review): the purpose of 8001, 8002 and 6006 is not visible here.
EXPOSE 7860
EXPOSE 8001
EXPOSE 8002
EXPOSE 6006

# Paths meant to be backed by volumes: model caches, outputs, logs, build cache.
VOLUME ["/app/model_cache"]
VOLUME ["/app/outputs"]
VOLUME ["/app/logs"]
VOLUME ["/app/build_cache"]

# Probe every 60s with a 30s timeout; the first 300s are a start-up grace
# period, and 3 consecutive failures mark the container unhealthy.
HEALTHCHECK --interval=60s --timeout=30s --start-period=300s --retries=3 \
    CMD python3 /app/healthcheck.py

# =============================================================================
# FINAL SETUP AND ENTRY POINT
# =============================================================================

# Generate the container entrypoint. The quoted heredoc delimiter keeps the
# build shell from expanding $(...) and "$@"; they are evaluated when the
# container starts. This step runs as appuser, which owns $APP_HOME.
RUN cat <<'ENTRYPOINT_SCRIPT' > $APP_HOME/docker-entrypoint.sh
#!/bin/bash
# Container entrypoint: prints a startup banner, applies the optimization patch
# script when present, prepares runtime directories, then replaces itself with
# /app/start.sh (forwarding all arguments) so the application runs as PID 1.
set -euo pipefail

echo "🚀 Complete AI Suite - Docker Container Starting..."
echo "🐳 Container: $(hostname)"
echo "👤 User: $(whoami)"

# The GPU banner is informational: a missing or failing nvidia-smi must neither
# abort start-up nor print the count twice (which `cmd | wc -l || echo 0` did
# under pipefail).
if command -v nvidia-smi >/dev/null 2>&1; then
    gpu_count="$( { nvidia-smi --list-gpus 2>/dev/null || true; } | wc -l )"
    echo "🎮 GPUs: ${gpu_count}"
    echo "💾 CUDA Memory:"
    nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader,nounits | nl \
        || echo "⚠️  nvidia-smi memory query failed" >&2
else
    echo "🎮 GPUs: 0"
fi

echo "🔧 Applying optimization patches..."
patch_script=/app/tools/optimization_patch.py
if [ -f "${patch_script}" ]; then
    # A patch script that exists but fails is still fatal (set -e).
    python3 "${patch_script}"
else
    echo "⚠️  ${patch_script} not found; skipping optimization patches" >&2
fi

echo "📁 Setting up permissions..."
# chmod can fail on bind-mounted paths owned by another UID; warn rather than
# abort, since the build already set these modes inside the image.
chmod -R 755 /app/installer /app/monitoring \
    || echo "⚠️  could not adjust permissions on /app/installer or /app/monitoring" >&2
chmod +x /app/start.sh \
    || echo "⚠️  could not mark /app/start.sh executable" >&2

mkdir -p /app/logs /app/outputs /app/tmp
# NOTE(review): 777 is kept for compatibility with existing bind mounts;
# consider tightening it once host-side access requirements are known.
chmod 777 /app/logs /app/outputs /app/tmp \
    || echo "⚠️  could not adjust permissions on runtime directories" >&2

echo "✅ Docker container initialization complete"
echo "🚀 Starting Complete AI Video Suite..."

exec /app/start.sh "$@"
ENTRYPOINT_SCRIPT

RUN chmod +x $APP_HOME/docker-entrypoint.sh

# Exec-form entrypoint; CMD supplies the default arguments that the script
# forwards to /app/start.sh and that `docker run <image> <args>` can override.
ENTRYPOINT ["/app/docker-entrypoint.sh"]

CMD ["--listen", "--multi-gpu", "--optimize"]

# =============================================================================
# FINAL METADATA
# =============================================================================

# Write /app/VERSION with build facts.
#   - CUDA matches the base image tag (12.8.0); it previously said 12.4.1.
#   - Build Date is the actual UTC build timestamp, not a hard-coded date
#     combined with the current time of day.
#   - The PyTorch version is read from the installed package.
RUN { \
        echo "Complete AI Video Suite v2.0.0"; \
        echo "Build Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)"; \
        echo "CUDA: 12.8.0"; \
        echo "PyTorch: $(python3 -c 'import torch; print(torch.__version__)')"; \
        echo "Optimized for: 8x NVIDIA L40S GPUs"; \
    } > /app/VERSION

# OCI image annotations (org.opencontainers.image.*).
# NOTE(review): `created` and `revision` are static values here; presumably
# they are meant to be injected by the build pipeline - confirm.
LABEL org.opencontainers.image.title="Complete AI Video Suite" \
      org.opencontainers.image.description="Production-ready multi-GPU video generation with LTX FP8, Q8 Kernels, and more" \
      org.opencontainers.image.version="2.0.0" \
      org.opencontainers.image.created="2025-09-18T17:42:00Z" \
      org.opencontainers.image.revision="main" \
      org.opencontainers.image.licenses="MIT"