#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""
import os
import logging
from typing import Dict, Any

from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API

load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ScalewayDeployment:
    """Scaleway deployment manager for LinguaCustodia API."""

    def __init__(self):
        """Initialize Scaleway client with credentials from .env."""
        self.access_key = os.getenv('SCW_ACCESS_KEY')
        self.secret_key = os.getenv('SCW_SECRET_KEY')
        self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        # Note: fr-par is the region; fr-par-2 is a zone within it.
        self.region = os.getenv('SCW_REGION', 'fr-par')  # Paris region
        self.zone = os.getenv('SCW_ZONE', 'fr-par-2')  # PARIS 2 zone for H100 availability
        if not all([self.access_key, self.secret_key, self.project_id]):
            raise ValueError("Missing required Scaleway credentials in .env file")
        self.client = Client(
            access_key=self.access_key,
            secret_key=self.secret_key,
            default_project_id=self.project_id,
            default_region=self.region,
            default_zone=self.zone
        )
        self.container_api = ContainerV1Beta1API(self.client)
        self.function_api = FunctionV1Beta1API(self.client)
        logger.info(f"Scaleway client initialized for project: {self.project_id}")

    def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
        """Get common environment variables for deployments."""
        base_vars = {
            "HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
            "HF_TOKEN": os.getenv('HF_TOKEN', ''),
            "APP_PORT": "7860",  # HuggingFace standard port
            "LOG_LEVEL": "INFO",
            "HF_HOME": "/data/.huggingface"  # Persistent storage for model caching
        }
        # Configure model-specific variables
        if model_size == "70b":
            base_vars.update({
                "MODEL_NAME": "llama3.1-70b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "128000",  # 128K context for v1.0 70B
                "BATCH_SIZE": "1",  # Conservative batch size for 70B
                "GPU_MEMORY_FRACTION": "0.95",  # Use 95% of GPU memory for BF16
                "VLLM_GPU_MEMORY_UTILIZATION": "0.95",
                "VLLM_MAX_MODEL_LEN": "128000",  # 128K context for v1.0
                "VLLM_DTYPE": "bfloat16",  # BF16 precision
                "VLLM_ENFORCE_EAGER": "true",  # Better memory management
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",  # Optimize for single GPU
                "VLLM_BLOCK_SIZE": "16",  # Optimize KV cache block size
                "VLLM_SWAP_SPACE": "4",  # 4GB swap space for memory overflow
                "VLLM_CPU_OFFLOAD_GBN": "1"  # CPU weight offload in GB (inference-time offload, not gradients)
            })
        elif model_size == "32b":
            base_vars.update({
                "MODEL_NAME": "qwen3-32b-v1.0",  # New 32B model
                "MAX_CONTEXT_LENGTH": "32768",  # Qwen 3 32B supports 32K context natively
                "BATCH_SIZE": "1",  # Conservative batch size for 32B
                "GPU_MEMORY_FRACTION": "0.9",  # Use 90% of GPU memory
                "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
                "VLLM_MAX_MODEL_LEN": "32768",
                "VLLM_DTYPE": "bfloat16",  # BF16 precision for 32B
                "VLLM_ENFORCE_EAGER": "true",
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
                "VLLM_BLOCK_SIZE": "16",
                "VLLM_SWAP_SPACE": "2",  # 2GB swap space
                "VLLM_CPU_OFFLOAD_GBN": "1"
            })
        elif model_size == "12b":
            base_vars.update({
                "MODEL_NAME": "gemma3-12b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "8192",  # Conservative 8K context for the 12B deployment
                "BATCH_SIZE": "2",
                "GPU_MEMORY_FRACTION": "0.85",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
                "VLLM_MAX_MODEL_LEN": "8192"
            })
        else:  # 8B and smaller
            base_vars.update({
                "MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'),  # Default to v1.0
                "MAX_CONTEXT_LENGTH": "32768",  # Default 32K (Llama 3.1 8B can use 128K)
                "BATCH_SIZE": "4",
                "GPU_MEMORY_FRACTION": "0.8",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.8",
                "VLLM_MAX_MODEL_LEN": "32768"
            })
        return base_vars
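
    # For reference, selected values returned by the helper above
    # (taken directly from the branches it defines):
    #   _get_environment_variables("70b")["MODEL_NAME"]         -> "llama3.1-70b-v1.0"
    #   _get_environment_variables("70b")["VLLM_MAX_MODEL_LEN"] -> "128000"
    #   _get_environment_variables()["MODEL_NAME"]              -> "qwen3-8b-v1.0" (unless MODEL_NAME is set)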

    def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
        """Create a container namespace for the LinguaCustodia API."""
        try:
            namespace = self.container_api.create_namespace(
                project_id=self.project_id,
                name=name,
                description="LinguaCustodia Financial AI API Container Namespace",
                environment_variables=self._get_environment_variables()
            )
            logger.info(f"Created container namespace: {namespace.id}")
            return {
                "namespace_id": namespace.id,
                "name": namespace.name,
                "status": "created"
            }
        except Exception as e:
            logger.error(f"Failed to create container namespace: {e}")
            raise

    def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a container with resources sized to the model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            # Configure resources based on model size
            if model_size == "70b":
                memory_limit = 65536  # 64GB for 70B models
                cpu_limit = 16000  # 16 vCPUs for 70B models
                timeout = "1800s"  # 30 minutes for model loading
                max_scale = 1  # Single instance for 70B (resource intensive)
            elif model_size == "12b":
                memory_limit = 32768  # 32GB for 12B models
                cpu_limit = 8000  # 8 vCPUs for 12B models
                timeout = "900s"  # 15 minutes for model loading
                max_scale = 2  # Limited scaling for 12B
            else:  # 8B and smaller
                memory_limit = 16384  # 16GB for 8B models
                cpu_limit = 4000  # 4 vCPUs for 8B models
                timeout = "600s"  # 10 minutes for model loading
                max_scale = 3  # Normal scaling for smaller models
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=max_scale,
                memory_limit=memory_limit,
                cpu_limit=cpu_limit,
                timeout=timeout,
                privacy="public",
                http_option="enabled",
                port=7860,  # HuggingFace standard port
                protocol="http1"
            )
            logger.info(f"Created container: {container.id}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create container: {e}")
            raise
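
    # Example call (sketch; the namespace_id comes from create_container_namespace above):
    #   result = deployment.deploy_container(ns["namespace_id"], "lingua-custodia-api", model_size="8b")
    #   # -> {"container_id": "...", "name": "lingua-custodia-api", "status": "created", "endpoint": ...}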

    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container for 70B models."""
        try:
            env_vars = self._get_environment_variables("70b")
            env_vars["PYTHONPATH"] = "/app"
            env_vars["GPU_TYPE"] = gpu_type
            # GPU-specific configuration for BF16 precision with Scaleway pricing
            gpu_configs = {
                "L40S": {
                    "memory_limit": 32768,  # 32GB RAM
                    "cpu_limit": 8000,  # 8 vCPUs
                    "gpu_memory": 48,  # 48GB VRAM
                    "context_length": 32768,  # Default 32K (Llama 3.1 8B can use 128K)
                    "max_model_size": "8B",  # L40S can only handle up to 8B models
                    "bf16_support": True,
                    "hourly_price": "€1.50",  # Estimated (not available in current pricing)
                    "monthly_price": "~€1,095"
                },
                "A100": {
                    "memory_limit": 131072,  # 128GB RAM
                    "cpu_limit": 32000,  # 32 vCPUs
                    "gpu_memory": 80,  # 80GB VRAM
                    "context_length": 32768,  # Default 32K (model-specific)
                    "max_model_size": "32B",  # A100 can handle 32B models with full context
                    "bf16_support": True,
                    "hourly_price": "€2.20",  # Estimated (not in current H100-focused pricing)
                    "monthly_price": "~€1,606"
                },
                "H100": {
                    "memory_limit": 131072,  # 128GB RAM (240GB actual)
                    "cpu_limit": 24000,  # 24 vCPUs (actual H100-1-80G specs)
                    "gpu_memory": 80,  # 80GB VRAM
                    "context_length": 128000,  # 128K context for Llama 3.1 70B
                    "max_model_size": "70B",  # H100 can handle 70B models with BF16
                    "bf16_support": True,
                    "hourly_price": "€2.73",
                    "monthly_price": "~€1,993"
                },
                "H100_DUAL": {
                    "memory_limit": 262144,  # 256GB RAM (480GB actual)
                    "cpu_limit": 48000,  # 48 vCPUs (actual H100-2-80G specs)
                    "gpu_memory": 160,  # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # Dual H100 can handle 70B BF16 with full context
                    "bf16_support": True,
                    "hourly_price": "€5.46",
                    "monthly_price": "~€3,986"
                },
                "H100_SXM_DUAL": {
                    "memory_limit": 131072,  # 128GB RAM (240GB actual)
                    "cpu_limit": 32000,  # 32 vCPUs (actual H100-SXM-2-80G specs)
                    "gpu_memory": 160,  # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # SXM version with better interconnect
                    "bf16_support": True,
                    "hourly_price": "€6.018",
                    "monthly_price": "~€4,393"
                },
                "H100_SXM_QUAD": {
                    "memory_limit": 262144,  # 256GB RAM (480GB actual)
                    "cpu_limit": 64000,  # 64 vCPUs (actual H100-SXM-4-80G specs)
                    "gpu_memory": 320,  # 320GB VRAM (4x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # Quad H100 optimal for BF16 70B
                    "bf16_support": True,
                    "hourly_price": "€11.61",
                    "monthly_price": "~€8,475"
                }
            }
            config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
            env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
            env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=1,  # Single instance for GPU workloads
                memory_limit=config["memory_limit"],
                cpu_limit=config["cpu_limit"],
                timeout="1800s",  # 30 minutes for model loading
                privacy="public",
                http_option="enabled",
                port=7860,
                protocol="http1"
            )
            logger.info(f"Created GPU container: {container.id} with {gpu_type}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "gpu_type": gpu_type,
                "gpu_memory": config["gpu_memory"],
                "context_length": config["context_length"],
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create GPU container: {e}")
            raise
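
    # Note: unknown gpu_type values fall back to the L40S profile via gpu_configs.get().
    # Example return shape for "H100_DUAL" (values from the table above):
    #   {"gpu_type": "H100_DUAL", "gpu_memory": 160, "context_length": 128000, ...}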

    def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a serverless function."""
        try:
            function = self.function_api.create_function(
                namespace_id=namespace_id,
                name=function_name,
                description="LinguaCustodia Financial AI API Serverless Function",
                environment_variables=self._get_environment_variables(),
                min_scale=0,
                max_scale=5,
                memory_limit=16384,  # 16GB for 8B models (was 1GB - insufficient)
                timeout="600s",  # 10 minutes for model loading (Scaleway expects a string with unit)
                privacy="public",
                http_option="enabled"
            )
            logger.info(f"Created function: {function.id}")
            return {
                "function_id": function.id,
                "name": function.name,
                "status": "created",
                "endpoint": getattr(function, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create function: {e}")
            raise

    def list_deployments(self) -> Dict[str, Any]:
        """List all existing deployments."""
        try:
            namespaces = self.container_api.list_namespaces()
            function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_functions": len(all_functions)
            }
        except Exception as e:
            logger.error(f"Failed to list deployments: {e}")
            raise

    def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
        """Get the status of a specific deployment."""
        try:
            if deployment_type == "container":
                container = self.container_api.get_container(deployment_id)
                return {
                    "id": container.id,
                    "name": container.name,
                    "status": container.status,
                    "endpoint": getattr(container, 'domain_name', None),
                    "memory_limit": container.memory_limit,
                    "cpu_limit": container.cpu_limit
                }
            elif deployment_type == "function":
                function = self.function_api.get_function(deployment_id)
                return {
                    "id": function.id,
                    "name": function.name,
                    "status": function.status,
                    "endpoint": getattr(function, 'domain_name', None),
                    "memory_limit": function.memory_limit
                }
            else:
                raise ValueError("deployment_type must be 'container' or 'function'")
        except Exception as e:
            logger.error(f"Failed to get deployment status: {e}")
            raise
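

# A minimal polling helper (a sketch, not part of the original API): repeatedly calls
# get_deployment_status() until the container leaves its transitional state or a
# deadline passes. The "ready"/"error" status strings are assumptions; adjust them
# to match the status values the Scaleway SDK actually reports.
def wait_for_deployment(deployment: "ScalewayDeployment", deployment_id: str,
                        timeout_s: int = 1800, poll_s: int = 15) -> Dict[str, Any]:
    """Poll a container deployment until it reports a terminal status."""
    import time
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        status = deployment.get_deployment_status(deployment_id, "container")
        if str(status["status"]).lower() in ("ready", "error"):
            return status
        time.sleep(poll_s)  # Avoid hammering the API while the model loads
    raise TimeoutError(f"Deployment {deployment_id} not ready after {timeout_s}s")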


def main():
    """Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
    try:
        deployment = ScalewayDeployment()
        deployments = deployment.list_deployments()
        logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
        # Create namespace for v1.0 models deployment
        namespace = deployment.create_container_namespace("lingua-custodia-v1.0")
        logger.info(f"Namespace created: {namespace['namespace_id']}")
        # Deploy 32B model on A100 (new model size)
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1.0-a100",
            "A100"
        )
        logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}")
        logger.info(f"GPU Type: {a100_32b_container['gpu_type']}")
        logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {a100_32b_container['context_length']} tokens")
        # Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context)
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1.0-h100-dual",
            "H100_DUAL"
        )
        logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}")
        logger.info(f"GPU Type: {h100_dual_container['gpu_type']}")
        logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB")
        logger.info(f"Context Length: {h100_dual_container['context_length']} tokens")
        # Deploy 8B v1.0 model on L40S (cost-effective option)
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1.0-l40s",
            "L40S"
        )
        logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}")
        logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}")
        logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens")
        logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!")
        logger.info("Region: Paris (fr-par), zone fr-par-2 - H100 availability")
        logger.info("Current Scaleway pricing (2024):")
        logger.info("  - L40S: €1.50/hour (~€1,095/month) - 8B models (estimated)")
        logger.info("  - A100-80G: €2.20/hour (~€1,606/month) - 32B models (estimated)")
        logger.info("  - H100-1-80G: €2.73/hour (~€1,993/month) - 32B models")
        logger.info("  - H100-2-80G: €5.46/hour (~€3,986/month) - 70B models")
        logger.info("  - H100-SXM-2-80G: €6.018/hour (~€4,393/month) - 70B models")
        logger.info("  - H100-SXM-4-80G: €11.61/hour (~€8,475/month) - 70B models")
        logger.info("v1.0 model VRAM requirements (BF16 weights, ~2 bytes/param):")
        logger.info("  - 8B models: ~16GB VRAM (L40S)")
        logger.info("  - 12B models: ~24GB VRAM (A100)")
        logger.info("  - 32B models: ~64GB VRAM (A100/H100)")
        logger.info("  - 70B models: ~140GB VRAM (dual H100)")
        logger.info("Context length: up to 128K tokens for the 70B v1.0 model (see per-model config)")
        logger.info("Precision: BF16 (bfloat16) - no quantization needed")
        logger.info("H100: up to ~3x faster than A100 on transformer workloads")
    except Exception as e:
        logger.error(f"Deployment failed: {e}")
        raise


if __name__ == "__main__":
    main()
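
# Running this module directly executes the full demo deployment in main() above
# (requires the .env credentials sketched near __init__). A single-model alternative
# using only methods defined in this file:
#   d = ScalewayDeployment()
#   ns = d.create_container_namespace("lingua-custodia-test")
#   d.deploy_gpu_container(ns["namespace_id"], "lingua-custodia-8b-l40s", "L40S")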