#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""
import os
import logging
from typing import Dict, Any
from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ScalewayDeployment:
"""Scaleway deployment manager for LinguaCustodia API."""
def __init__(self):
"""Initialize Scaleway client with credentials from .env."""
self.access_key = os.getenv('SCW_ACCESS_KEY')
self.secret_key = os.getenv('SCW_SECRET_KEY')
self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        self.region = os.getenv('SCW_REGION', 'fr-par')
        self.zone = os.getenv('SCW_DEFAULT_ZONE', 'fr-par-2')  # PARIS 2 zone for H100 availability
if not all([self.access_key, self.secret_key, self.project_id]):
raise ValueError("Missing required Scaleway credentials in .env file")
self.client = Client(
access_key=self.access_key,
secret_key=self.secret_key,
default_project_id=self.project_id,
default_region=self.region,
            default_zone=self.zone
)
self.container_api = ContainerV1Beta1API(self.client)
self.function_api = FunctionV1Beta1API(self.client)
logger.info(f"Scaleway client initialized for project: {self.project_id}")
def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
"""Get common environment variables for deployments."""
base_vars = {
"HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
"HF_TOKEN": os.getenv('HF_TOKEN', ''),
"APP_PORT": "7860", # HuggingFace standard port
"LOG_LEVEL": "INFO",
"HF_HOME": "/data/.huggingface" # Persistent storage for model caching
}
# Configure model-specific variables
if model_size == "70b":
base_vars.update({
"MODEL_NAME": "llama3.1-70b-v1.0", # Use latest v1.0 model
"MAX_CONTEXT_LENGTH": "128000", # 128K context for v1.0 70B
"BATCH_SIZE": "1", # Conservative batch size for 70B
"GPU_MEMORY_FRACTION": "0.95", # Use 95% of GPU memory for BF16
"VLLM_GPU_MEMORY_UTILIZATION": "0.95",
"VLLM_MAX_MODEL_LEN": "128000", # 128K context for v1.0
"VLLM_DTYPE": "bfloat16", # BF16 precision
"VLLM_ENFORCE_EAGER": "true", # Better memory management
"VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true", # Optimize for single GPU
"VLLM_BLOCK_SIZE": "16", # Optimize KV cache block size
"VLLM_SWAP_SPACE": "4", # 4GB swap space for memory overflow
"VLLM_CPU_OFFLOAD_GBN": "1" # CPU offload for gradient computation
})
elif model_size == "32b":
base_vars.update({
"MODEL_NAME": "qwen3-32b-v1.0", # New 32B model
"MAX_CONTEXT_LENGTH": "32768", # Qwen 3 32B supports 32K context
"BATCH_SIZE": "1", # Conservative batch size for 32B
"GPU_MEMORY_FRACTION": "0.9", # Use 90% of GPU memory
"VLLM_GPU_MEMORY_UTILIZATION": "0.9",
"VLLM_MAX_MODEL_LEN": "32768",
"VLLM_DTYPE": "bfloat16", # BF16 precision for 32B
"VLLM_ENFORCE_EAGER": "true",
"VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
"VLLM_BLOCK_SIZE": "16",
"VLLM_SWAP_SPACE": "2", # 2GB swap space
"VLLM_CPU_OFFLOAD_GBN": "1"
})
elif model_size == "12b":
base_vars.update({
"MODEL_NAME": "gemma3-12b-v1.0", # Use latest v1.0 model
"MAX_CONTEXT_LENGTH": "8192", # Gemma 3 12B supports 8K context
"BATCH_SIZE": "2",
"GPU_MEMORY_FRACTION": "0.85",
"VLLM_GPU_MEMORY_UTILIZATION": "0.85",
"VLLM_MAX_MODEL_LEN": "8192"
})
else: # 8B and smaller
base_vars.update({
"MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'), # Default to v1.0
"MAX_CONTEXT_LENGTH": "32768", # Default 32K (Llama 3.1 8B can use 128K)
"BATCH_SIZE": "4",
"GPU_MEMORY_FRACTION": "0.8",
"VLLM_GPU_MEMORY_UTILIZATION": "0.8",
"VLLM_MAX_MODEL_LEN": "32768"
})
return base_vars
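    @staticmethod
    def build_vllm_command(env_vars: Dict[str, str]) -> list:
        """Hedged sketch: how a container entrypoint could turn the VLLM_* variables into a vLLM server command.

        Not part of the original deployment flow. The flags below are standard vLLM
        engine arguments; assumes MODEL_NAME resolves to a model repo the serving
        image can pull.
        """
        cmd = [
            "vllm", "serve", env_vars["MODEL_NAME"],
            "--port", env_vars.get("APP_PORT", "7860"),
            "--dtype", env_vars.get("VLLM_DTYPE", "bfloat16"),
            "--max-model-len", env_vars.get("VLLM_MAX_MODEL_LEN", "32768"),
            "--gpu-memory-utilization", env_vars.get("VLLM_GPU_MEMORY_UTILIZATION", "0.9"),
        ]
        if env_vars.get("VLLM_ENFORCE_EAGER") == "true":
            cmd.append("--enforce-eager")  # skip CUDA graph capture to save memory
        if env_vars.get("VLLM_DISABLE_CUSTOM_ALL_REDUCE") == "true":
            cmd.append("--disable-custom-all-reduce")
        if "VLLM_SWAP_SPACE" in env_vars:
            cmd += ["--swap-space", env_vars["VLLM_SWAP_SPACE"]]  # GB of CPU swap space for the KV cache
        if "VLLM_CPU_OFFLOAD_GB" in env_vars:
            cmd += ["--cpu-offload-gb", env_vars["VLLM_CPU_OFFLOAD_GB"]]  # offload weights to CPU RAM
        return cmd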
def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
"""Create a container namespace for the LinguaCustodia API."""
try:
namespace = self.container_api.create_namespace(
project_id=self.project_id,
name=name,
description="LinguaCustodia Financial AI API Container Namespace",
environment_variables=self._get_environment_variables()
)
logger.info(f"Created container namespace: {namespace.id}")
return {
"namespace_id": namespace.id,
"name": namespace.name,
"status": "created"
}
except Exception as e:
logger.error(f"Failed to create container namespace: {e}")
raise
def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
"""Deploy the LinguaCustodia API as a container with optimized resources for model size."""
try:
env_vars = self._get_environment_variables(model_size)
env_vars["PYTHONPATH"] = "/app"
# Configure resources based on model size
if model_size == "70b":
memory_limit = 65536 # 64GB for 70B models
cpu_limit = 16000 # 16 vCPUs for 70B models
timeout = "1800s" # 30 minutes for model loading
max_scale = 1 # Single instance for 70B (resource intensive)
elif model_size == "12b":
memory_limit = 32768 # 32GB for 12B models
cpu_limit = 8000 # 8 vCPUs for 12B models
timeout = "900s" # 15 minutes for model loading
max_scale = 2 # Limited scaling for 12B
else: # 8B and smaller
memory_limit = 16384 # 16GB for 8B models
cpu_limit = 4000 # 4 vCPUs for 8B models
timeout = "600s" # 10 minutes for model loading
max_scale = 3 # Normal scaling for smaller models
container = self.container_api.create_container(
namespace_id=namespace_id,
name=image_name,
description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
environment_variables=env_vars,
min_scale=1,
max_scale=max_scale,
memory_limit=memory_limit,
cpu_limit=cpu_limit,
timeout=timeout,
privacy="public",
http_option="enabled",
port=7860, # HuggingFace standard port
protocol="http1"
)
logger.info(f"Created container: {container.id}")
return {
"container_id": container.id,
"name": container.name,
"status": "created",
"endpoint": getattr(container, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create container: {e}")
raise
    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container sized for the target model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            env_vars["GPU_TYPE"] = gpu_type
# GPU-specific configuration for BF16 precision with Scaleway pricing
gpu_configs = {
"L40S": {
"memory_limit": 32768, # 32GB RAM
"cpu_limit": 8000, # 8 vCPUs
"gpu_memory": 48, # 48GB VRAM
"context_length": 32768, # Default 32K (Llama 3.1 8B can use 128K)
"max_model_size": "8B", # L40S can only handle up to 8B models
"bf16_support": True,
"hourly_price": "€1.50", # Estimated (not available in current pricing)
"monthly_price": "~€1,095"
},
"A100": {
"memory_limit": 131072, # 128GB RAM
"cpu_limit": 32000, # 32 vCPUs
"gpu_memory": 80, # 80GB VRAM
"context_length": 32768, # Default 32K (model-specific)
"max_model_size": "32B", # A100 can handle 32B models with full context
"bf16_support": True,
"hourly_price": "€2.20", # Estimated (not in current H100-focused pricing)
"monthly_price": "~€1,606"
},
"H100": {
"memory_limit": 131072, # 128GB RAM (240GB actual)
"cpu_limit": 24000, # 24 vCPUs (actual H100-1-80G specs)
"gpu_memory": 80, # 80GB VRAM
"context_length": 128000, # 128K context for Llama 3.1 70B
"max_model_size": "70B", # H100 can handle 70B models with BF16
"bf16_support": True,
"hourly_price": "€2.73",
"monthly_price": "~€1,993"
},
"H100_DUAL": {
"memory_limit": 262144, # 256GB RAM (480GB actual)
"cpu_limit": 48000, # 48 vCPUs (actual H100-2-80G specs)
"gpu_memory": 160, # 160GB VRAM (2x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # Dual H100 can handle 70B BF16 with full context
"bf16_support": True,
"hourly_price": "€5.46",
"monthly_price": "~€3,986"
},
"H100_SXM_DUAL": {
"memory_limit": 131072, # 128GB RAM (240GB actual)
"cpu_limit": 32000, # 32 vCPUs (actual H100-SXM-2-80G specs)
"gpu_memory": 160, # 160GB VRAM (2x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # SXM version with better interconnect
"bf16_support": True,
"hourly_price": "€6.018",
"monthly_price": "~€4,393"
},
"H100_SXM_QUAD": {
"memory_limit": 262144, # 256GB RAM (480GB actual)
"cpu_limit": 64000, # 64 vCPUs (actual H100-SXM-4-80G specs)
"gpu_memory": 320, # 320GB VRAM (4x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # Quad H100 optimal for BF16 70B
"bf16_support": True,
"hourly_price": "€11.61",
"monthly_price": "~€8,475"
}
}
config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
container = self.container_api.create_container(
namespace_id=namespace_id,
name=image_name,
description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})",
environment_variables=env_vars,
min_scale=1,
max_scale=1, # Single instance for GPU workloads
memory_limit=config["memory_limit"],
cpu_limit=config["cpu_limit"],
timeout="1800s", # 30 minutes for model loading
privacy="public",
http_option="enabled",
port=7860,
protocol="http1"
)
logger.info(f"Created GPU container: {container.id} with {gpu_type}")
return {
"container_id": container.id,
"name": container.name,
"status": "created",
"gpu_type": gpu_type,
"gpu_memory": config["gpu_memory"],
"context_length": config["context_length"],
"endpoint": getattr(container, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create GPU container: {e}")
raise
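    @staticmethod
    def estimate_bf16_vram_gb(params_billion: float, overhead_fraction: float = 0.1) -> float:
        """Hedged sketch of the sizing rule behind gpu_configs: BF16 weights take ~2 bytes per parameter.

        The 10% overhead for KV cache and activations is an assumption, not a
        measurement. Example: 70B -> ~140GB of weights alone, which is why 70B BF16
        needs dual H100 (160GB) rather than a single 80GB card.
        """
        weights_gb = params_billion * 2  # 2 bytes per parameter in bfloat16
        return weights_gb * (1 + overhead_fraction)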
def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
"""Deploy the LinguaCustodia API as a serverless function."""
try:
function = self.function_api.create_function(
namespace_id=namespace_id,
name=function_name,
description="LinguaCustodia Financial AI API Serverless Function",
environment_variables=self._get_environment_variables(),
min_scale=0,
max_scale=5,
memory_limit=16384, # 16GB for 8B models (was 1GB - insufficient)
timeout="600s", # 10 minutes for model loading (Scaleway expects string with unit)
privacy="public",
http_option="enabled"
)
logger.info(f"Created function: {function.id}")
return {
"function_id": function.id,
"name": function.name,
"status": "created",
"endpoint": getattr(function, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create function: {e}")
raise
def list_deployments(self) -> Dict[str, Any]:
"""List all existing deployments."""
try:
namespaces = self.container_api.list_namespaces()
function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            all_containers = []
            for ns in namespaces.namespaces:
                try:
                    containers = self.container_api.list_containers(namespace_id=ns.id)
                    all_containers.extend(containers.containers)
                except Exception as e:
                    logger.warning(f"Could not list containers for namespace {ns.id}: {e}")
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "containers": [{"id": c.id, "name": c.name} for c in all_containers],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_containers": len(all_containers),
                "total_functions": len(all_functions)
            }
except Exception as e:
logger.error(f"Failed to list deployments: {e}")
raise
def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
"""Get the status of a specific deployment."""
try:
if deployment_type == "container":
container = self.container_api.get_container(deployment_id)
return {
"id": container.id,
"name": container.name,
"status": container.status,
"endpoint": getattr(container, 'domain_name', None),
"memory_limit": container.memory_limit,
"cpu_limit": container.cpu_limit
}
elif deployment_type == "function":
function = self.function_api.get_function(deployment_id)
return {
"id": function.id,
"name": function.name,
"status": function.status,
"endpoint": getattr(function, 'domain_name', None),
"memory_limit": function.memory_limit
}
else:
raise ValueError("deployment_type must be 'container' or 'function'")
except Exception as e:
logger.error(f"Failed to get deployment status: {e}")
raise
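    def wait_until_ready(self, deployment_id: str, deployment_type: str = "container",
                         poll_seconds: int = 15, timeout_seconds: int = 1800) -> Dict[str, Any]:
        """Hedged sketch built on get_deployment_status: poll until a deployment reports ready.

        Assumes the status field eventually reads "ready"; adjust to the actual
        Scaleway status enum if it differs.
        """
        import time
        deadline = time.monotonic() + timeout_seconds
        while time.monotonic() < deadline:
            status = self.get_deployment_status(deployment_id, deployment_type)
            if str(status["status"]).lower() == "ready":
                return status
            time.sleep(poll_seconds)
        raise TimeoutError(f"{deployment_type} {deployment_id} not ready after {timeout_seconds}s")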
def main():
"""Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
try:
deployment = ScalewayDeployment()
deployments = deployment.list_deployments()
logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
# Create namespace for v1.0 models deployment
namespace = deployment.create_container_namespace("lingua-custodia-v1.0")
logger.info(f"Namespace created: {namespace['namespace_id']}")
# Deploy 32B model on A100 (new model size)
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1.0-a100",
            "A100",
            model_size="32b"
        )
logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}")
logger.info(f"GPU Type: {a100_32b_container['gpu_type']}")
logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB")
logger.info(f"Context Length: {a100_32b_container['context_length']} tokens")
# Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context)
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1.0-h100-dual",
            "H100_DUAL",
            model_size="70b"
        )
logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}")
logger.info(f"GPU Type: {h100_dual_container['gpu_type']}")
logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB")
logger.info(f"Context Length: {h100_dual_container['context_length']} tokens")
# Deploy 8B v1.0 model on L40S (cost-effective option)
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1.0-l40s",
            "L40S",
            model_size="8b"
        )
logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}")
logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}")
logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB")
logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens")
logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!")
logger.info("🌍 Region: PARIS 2 (fr-par-2) - H100 availability")
logger.info("πŸ’° Current Scaleway Pricing (2024):")
logger.info(" - L40S: €1.50/hour (~€1,095/month) - 8B models")
logger.info(" - A100-80G: €2.20/hour (~€1,606/month) - 32B models")
logger.info(" - H100-1-80G: €2.73/hour (~€1,993/month) - 32B models")
logger.info(" - H100-2-80G: €5.46/hour (~€3,986/month) - 70B models")
logger.info(" - H100-SXM-2-80G: €6.018/hour (~€4,393/month) - 70B models")
logger.info(" - H100-SXM-4-80G: €11.61/hour (~€8,475/month) - 70B models")
logger.info("⚠️ v1.0 Model Requirements:")
logger.info(" - 8B models: 8GB VRAM (L40S)")
logger.info(" - 12B models: 12GB VRAM (A100)")
logger.info(" - 32B models: 32GB VRAM (A100/H100)")
logger.info(" - 70B models: 80GB VRAM (H100)")
logger.info("βœ… All v1.0 models support 128K context length")
logger.info("πŸ“Š Precision: BF16 (bfloat16) - no quantization needed")
logger.info("⚑ H100: 3x faster than A100 for transformer workloads")
except Exception as e:
logger.error(f"Deployment failed: {e}")
raise
if __name__ == "__main__":
main()