#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""
import os
import logging
from typing import Dict, Any

from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API

load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
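
# A minimal sizing sketch (not part of the deployment flow): BF16 stores
# ~2 bytes per parameter, which is what grounds the GPU choices below. The
# helper name and the 20% KV-cache/activation headroom are illustrative
# assumptions, not Scaleway or vLLM APIs.
def estimate_bf16_vram_gb(num_params_billions: float, overhead_fraction: float = 0.2) -> float:
    """Approximate VRAM (GB) needed to serve a BF16 model of the given size."""
    weights_gb = num_params_billions * 2  # 2 bytes per parameter
    return weights_gb * (1.0 + overhead_fraction)  # headroom for KV cache and activations

# Example: estimate_bf16_vram_gb(70) ~= 168 GB, hence the dual/quad H100
# configurations for the 70B model below.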


class ScalewayDeployment:
    """Scaleway deployment manager for LinguaCustodia API."""

    def __init__(self):
        """Initialize Scaleway client with credentials from .env."""
        self.access_key = os.getenv('SCW_ACCESS_KEY')
        self.secret_key = os.getenv('SCW_SECRET_KEY')
        self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        # 'fr-par-2' is a zone, not a region: the region is 'fr-par'
        self.region = os.getenv('SCW_REGION', 'fr-par')
        self.zone = os.getenv('SCW_ZONE', 'fr-par-2')  # PARIS 2 for H100 availability
        if not all([self.access_key, self.secret_key, self.project_id]):
            raise ValueError("Missing required Scaleway credentials in .env file")
        self.client = Client(
            access_key=self.access_key,
            secret_key=self.secret_key,
            default_project_id=self.project_id,
            default_region=self.region,
            default_zone=self.zone
        )
        self.container_api = ContainerV1Beta1API(self.client)
        self.function_api = FunctionV1Beta1API(self.client)
        logger.info(f"Scaleway client initialized for project: {self.project_id}")

    def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
        """Get common environment variables for deployments."""
        base_vars = {
            "HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
            "HF_TOKEN": os.getenv('HF_TOKEN', ''),
            "APP_PORT": "7860",  # HuggingFace standard port
            "LOG_LEVEL": "INFO",
            "HF_HOME": "/data/.huggingface"  # Persistent storage for model caching
        }
        # Configure model-specific variables
        if model_size == "70b":
            base_vars.update({
                "MODEL_NAME": "llama3.1-70b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "128000",  # 128K context for v1.0 70B
                "BATCH_SIZE": "1",  # Conservative batch size for 70B
                "GPU_MEMORY_FRACTION": "0.95",  # Use 95% of GPU memory for BF16
                "VLLM_GPU_MEMORY_UTILIZATION": "0.95",
                "VLLM_MAX_MODEL_LEN": "128000",  # 128K context for v1.0
                "VLLM_DTYPE": "bfloat16",  # BF16 precision
                "VLLM_ENFORCE_EAGER": "true",  # Better memory management
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",  # Optimize for single GPU
                "VLLM_BLOCK_SIZE": "16",  # Optimize KV cache block size
                "VLLM_SWAP_SPACE": "4",  # 4GB swap space for memory overflow
| "VLLM_CPU_OFFLOAD_GBN": "1" # CPU offload for gradient computation | |
            })
        elif model_size == "32b":
            base_vars.update({
                "MODEL_NAME": "qwen3-32b-v1.0",  # New 32B model
                "MAX_CONTEXT_LENGTH": "32768",  # Qwen 3 32B supports 32K context
                "BATCH_SIZE": "1",  # Conservative batch size for 32B
                "GPU_MEMORY_FRACTION": "0.9",  # Use 90% of GPU memory
                "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
                "VLLM_MAX_MODEL_LEN": "32768",
                "VLLM_DTYPE": "bfloat16",  # BF16 precision for 32B
                "VLLM_ENFORCE_EAGER": "true",
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
                "VLLM_BLOCK_SIZE": "16",
                "VLLM_SWAP_SPACE": "2",  # 2GB swap space
| "VLLM_CPU_OFFLOAD_GBN": "1" | |
            })
        elif model_size == "12b":
            base_vars.update({
                "MODEL_NAME": "gemma3-12b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "8192",  # Gemma 3 12B supports 8K context
                "BATCH_SIZE": "2",
                "GPU_MEMORY_FRACTION": "0.85",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
                "VLLM_MAX_MODEL_LEN": "8192"
            })
        else:  # 8B and smaller
            base_vars.update({
                "MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'),  # Default to v1.0
                "MAX_CONTEXT_LENGTH": "32768",  # Default 32K (Llama 3.1 8B can use 128K)
                "BATCH_SIZE": "4",
                "GPU_MEMORY_FRACTION": "0.8",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.8",
                "VLLM_MAX_MODEL_LEN": "32768"
            })
        return base_vars
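
    # Illustrative output (abridged, values taken from the branches above):
    #   _get_environment_variables("70b") returns the base vars merged with the
    #   70B overrides, e.g. {"APP_PORT": "7860", "MODEL_NAME": "llama3.1-70b-v1.0",
    #   "VLLM_MAX_MODEL_LEN": "128000", ...}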

    def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
        """Create a container namespace for the LinguaCustodia API."""
        try:
            namespace = self.container_api.create_namespace(
                project_id=self.project_id,
                name=name,
                description="LinguaCustodia Financial AI API Container Namespace",
                environment_variables=self._get_environment_variables()
            )
            logger.info(f"Created container namespace: {namespace.id}")
            return {
                "namespace_id": namespace.id,
                "name": namespace.name,
                "status": "created"
            }
        except Exception as e:
            logger.error(f"Failed to create container namespace: {e}")
            raise

    def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a container with resources sized for the model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            # Configure resources based on model size
            if model_size == "70b":
                memory_limit = 65536  # 64GB for 70B models
                cpu_limit = 16000  # 16 vCPUs for 70B models
                timeout = "1800s"  # 30 minutes for model loading
                max_scale = 1  # Single instance for 70B (resource intensive)
            elif model_size == "12b":
                memory_limit = 32768  # 32GB for 12B models
                cpu_limit = 8000  # 8 vCPUs for 12B models
                timeout = "900s"  # 15 minutes for model loading
                max_scale = 2  # Limited scaling for 12B
            else:  # 8B and smaller
                memory_limit = 16384  # 16GB for 8B models
                cpu_limit = 4000  # 4 vCPUs for 8B models
                timeout = "600s"  # 10 minutes for model loading
                max_scale = 3  # Normal scaling for smaller models
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=max_scale,
                memory_limit=memory_limit,
                cpu_limit=cpu_limit,
                timeout=timeout,
                privacy="public",
                http_option="enabled",
                port=7860,  # HuggingFace standard port
                protocol="http1"
            )
            logger.info(f"Created container: {container.id}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create container: {e}")
            raise

    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container sized for the target model."""
        try:
            env_vars = self._get_environment_variables(model_size)
| env_vars["PYTHONPATH"] = "/app" | |
| env_vars["GPU_TYPE"] = gpu_type | |
| # GPU-specific configuration for BF16 precision with Scaleway pricing | |
| gpu_configs = { | |
| "L40S": { | |
| "memory_limit": 32768, # 32GB RAM | |
| "cpu_limit": 8000, # 8 vCPUs | |
| "gpu_memory": 48, # 48GB VRAM | |
| "context_length": 32768, # Default 32K (Llama 3.1 8B can use 128K) | |
| "max_model_size": "8B", # L40S can only handle up to 8B models | |
| "bf16_support": True, | |
| "hourly_price": "β¬1.50", # Estimated (not available in current pricing) | |
| "monthly_price": "~β¬1,095" | |
| }, | |
| "A100": { | |
| "memory_limit": 131072, # 128GB RAM | |
| "cpu_limit": 32000, # 32 vCPUs | |
| "gpu_memory": 80, # 80GB VRAM | |
| "context_length": 32768, # Default 32K (model-specific) | |
| "max_model_size": "32B", # A100 can handle 32B models with full context | |
| "bf16_support": True, | |
| "hourly_price": "β¬2.20", # Estimated (not in current H100-focused pricing) | |
| "monthly_price": "~β¬1,606" | |
| }, | |
| "H100": { | |
| "memory_limit": 131072, # 128GB RAM (240GB actual) | |
| "cpu_limit": 24000, # 24 vCPUs (actual H100-1-80G specs) | |
| "gpu_memory": 80, # 80GB VRAM | |
| "context_length": 128000, # 128K context for Llama 3.1 70B | |
| "max_model_size": "70B", # H100 can handle 70B models with BF16 | |
| "bf16_support": True, | |
| "hourly_price": "β¬2.73", | |
| "monthly_price": "~β¬1,993" | |
| }, | |
| "H100_DUAL": { | |
| "memory_limit": 262144, # 256GB RAM (480GB actual) | |
| "cpu_limit": 48000, # 48 vCPUs (actual H100-2-80G specs) | |
| "gpu_memory": 160, # 160GB VRAM (2x80GB) | |
| "context_length": 128000, # Full context for BF16 70B models | |
| "max_model_size": "70B", # Dual H100 can handle 70B BF16 with full context | |
| "bf16_support": True, | |
| "hourly_price": "β¬5.46", | |
| "monthly_price": "~β¬3,986" | |
| }, | |
| "H100_SXM_DUAL": { | |
| "memory_limit": 131072, # 128GB RAM (240GB actual) | |
| "cpu_limit": 32000, # 32 vCPUs (actual H100-SXM-2-80G specs) | |
| "gpu_memory": 160, # 160GB VRAM (2x80GB) | |
| "context_length": 128000, # Full context for BF16 70B models | |
| "max_model_size": "70B", # SXM version with better interconnect | |
| "bf16_support": True, | |
| "hourly_price": "β¬6.018", | |
| "monthly_price": "~β¬4,393" | |
| }, | |
| "H100_SXM_QUAD": { | |
| "memory_limit": 262144, # 256GB RAM (480GB actual) | |
| "cpu_limit": 64000, # 64 vCPUs (actual H100-SXM-4-80G specs) | |
| "gpu_memory": 320, # 320GB VRAM (4x80GB) | |
| "context_length": 128000, # Full context for BF16 70B models | |
| "max_model_size": "70B", # Quad H100 optimal for BF16 70B | |
| "bf16_support": True, | |
| "hourly_price": "β¬11.61", | |
| "monthly_price": "~β¬8,475" | |
| } | |
| } | |
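            # Sanity check on the VRAM figures above: by the ~2 bytes/param BF16
            # rule (see estimate_bf16_vram_gb), 8B ~ 16GB, 32B ~ 64GB and
            # 70B ~ 140GB of weights alone, before KV cache.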
            config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
            env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
            env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
| description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})", | |
                environment_variables=env_vars,
                min_scale=1,
                max_scale=1,  # Single instance for GPU workloads
                memory_limit=config["memory_limit"],
                cpu_limit=config["cpu_limit"],
                timeout="1800s",  # 30 minutes for model loading
                privacy="public",
                http_option="enabled",
                port=7860,
                protocol="http1"
            )
            logger.info(f"Created GPU container: {container.id} with {gpu_type}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "gpu_type": gpu_type,
                "gpu_memory": config["gpu_memory"],
                "context_length": config["context_length"],
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create GPU container: {e}")
            raise

    def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a serverless function."""
        try:
            function = self.function_api.create_function(
                namespace_id=namespace_id,
                name=function_name,
                description="LinguaCustodia Financial AI API Serverless Function",
                environment_variables=self._get_environment_variables(),
                min_scale=0,
                max_scale=5,
                memory_limit=16384,  # 16GB for 8B models (was 1GB - insufficient)
                timeout="600s",  # 10 minutes for model loading (Scaleway expects a string with unit)
                privacy="public",
                http_option="enabled"
            )
            logger.info(f"Created function: {function.id}")
            return {
                "function_id": function.id,
                "name": function.name,
                "status": "created",
                "endpoint": getattr(function, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create function: {e}")
            raise

    def list_deployments(self) -> Dict[str, Any]:
        """List all existing deployments."""
        try:
            namespaces = self.container_api.list_namespaces()
            function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_functions": len(all_functions)
            }
        except Exception as e:
            logger.error(f"Failed to list deployments: {e}")
            raise
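
    # Illustrative return shape (hypothetical IDs):
    #   {"namespaces": [{"id": "ns-123", "name": "lingua-custodia"}],
    #    "functions": [], "total_namespaces": 1, "total_functions": 0}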

    def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
        """Get the status of a specific deployment."""
        try:
            if deployment_type == "container":
                container = self.container_api.get_container(container_id=deployment_id)
                return {
                    "id": container.id,
                    "name": container.name,
                    "status": container.status,
                    "endpoint": getattr(container, 'domain_name', None),
                    "memory_limit": container.memory_limit,
                    "cpu_limit": container.cpu_limit
                }
            elif deployment_type == "function":
                function = self.function_api.get_function(function_id=deployment_id)
                return {
                    "id": function.id,
                    "name": function.name,
                    "status": function.status,
                    "endpoint": getattr(function, 'domain_name', None),
                    "memory_limit": function.memory_limit
                }
            else:
                raise ValueError("deployment_type must be 'container' or 'function'")
        except Exception as e:
            logger.error(f"Failed to get deployment status: {e}")
            raise


def main():
    """Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
    try:
        deployment = ScalewayDeployment()
        deployments = deployment.list_deployments()
        logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
        # Create namespace for v1.0 models deployment
        # Scaleway serverless names allow lowercase letters, digits, and dashes only
        namespace = deployment.create_container_namespace("lingua-custodia-v1-0")
| logger.info(f"Namespace created: {namespace['namespace_id']}") | |
| # Deploy 32B model on A100 (new model size) | |
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1-0-a100",
            "A100",
            model_size="32b"
        )
| logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}") | |
| logger.info(f"GPU Type: {a100_32b_container['gpu_type']}") | |
| logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB") | |
| logger.info(f"Context Length: {a100_32b_container['context_length']} tokens") | |
| # Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context) | |
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1-0-h100-dual",
            "H100_DUAL",
            model_size="70b"
        )
| logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}") | |
| logger.info(f"GPU Type: {h100_dual_container['gpu_type']}") | |
| logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB") | |
| logger.info(f"Context Length: {h100_dual_container['context_length']} tokens") | |
| # Deploy 8B v1.0 model on L40S (cost-effective option) | |
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1-0-l40s",
            "L40S",
            model_size="8b"
        )
| logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}") | |
| logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}") | |
| logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB") | |
| logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens") | |
| logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!") | |
| logger.info("π Region: PARIS 2 (fr-par-2) - H100 availability") | |
| logger.info("π° Current Scaleway Pricing (2024):") | |
| logger.info(" - L40S: β¬1.50/hour (~β¬1,095/month) - 8B models") | |
| logger.info(" - A100-80G: β¬2.20/hour (~β¬1,606/month) - 32B models") | |
| logger.info(" - H100-1-80G: β¬2.73/hour (~β¬1,993/month) - 32B models") | |
| logger.info(" - H100-2-80G: β¬5.46/hour (~β¬3,986/month) - 70B models") | |
| logger.info(" - H100-SXM-2-80G: β¬6.018/hour (~β¬4,393/month) - 70B models") | |
| logger.info(" - H100-SXM-4-80G: β¬11.61/hour (~β¬8,475/month) - 70B models") | |
| logger.info("β οΈ v1.0 Model Requirements:") | |
| logger.info(" - 8B models: 8GB VRAM (L40S)") | |
| logger.info(" - 12B models: 12GB VRAM (A100)") | |
| logger.info(" - 32B models: 32GB VRAM (A100/H100)") | |
| logger.info(" - 70B models: 80GB VRAM (H100)") | |
| logger.info("β All v1.0 models support 128K context length") | |
| logger.info("π Precision: BF16 (bfloat16) - no quantization needed") | |
| logger.info("β‘ H100: 3x faster than A100 for transformer workloads") | |
    except Exception as e:
        logger.error(f"Deployment failed: {e}")
        raise


if __name__ == "__main__":
    main()
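
# Example follow-up (a sketch with a hypothetical container ID, not executed by
# this script): check a deployment's status and endpoint after running main().
#
#   deployment = ScalewayDeployment()
#   status = deployment.get_deployment_status("11111111-2222-3333-4444-555555555555", "container")
#   print(status["status"], status["endpoint"])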