#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""
import os
import logging
from typing import Dict, Any
from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ScalewayDeployment:
"""Scaleway deployment manager for LinguaCustodia API."""
def __init__(self):
"""Initialize Scaleway client with credentials from .env."""
self.access_key = os.getenv('SCW_ACCESS_KEY')
self.secret_key = os.getenv('SCW_SECRET_KEY')
self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        self.region = os.getenv('SCW_REGION', 'fr-par')
        self.zone = os.getenv('SCW_DEFAULT_ZONE', 'fr-par-2')  # PARIS 2 zone for H100 availability
if not all([self.access_key, self.secret_key, self.project_id]):
raise ValueError("Missing required Scaleway credentials in .env file")
self.client = Client(
access_key=self.access_key,
secret_key=self.secret_key,
default_project_id=self.project_id,
default_region=self.region,
            default_zone=self.zone
)
self.container_api = ContainerV1Beta1API(self.client)
self.function_api = FunctionV1Beta1API(self.client)
logger.info(f"Scaleway client initialized for project: {self.project_id}")
def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
"""Get common environment variables for deployments."""
base_vars = {
"HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
"HF_TOKEN": os.getenv('HF_TOKEN', ''),
"APP_PORT": "7860", # HuggingFace standard port
"LOG_LEVEL": "INFO",
"HF_HOME": "/data/.huggingface" # Persistent storage for model caching
}
# Configure model-specific variables
if model_size == "70b":
base_vars.update({
"MODEL_NAME": "llama3.1-70b-v1.0", # Use latest v1.0 model
"MAX_CONTEXT_LENGTH": "128000", # 128K context for v1.0 70B
"BATCH_SIZE": "1", # Conservative batch size for 70B
"GPU_MEMORY_FRACTION": "0.95", # Use 95% of GPU memory for BF16
"VLLM_GPU_MEMORY_UTILIZATION": "0.95",
"VLLM_MAX_MODEL_LEN": "128000", # 128K context for v1.0
"VLLM_DTYPE": "bfloat16", # BF16 precision
"VLLM_ENFORCE_EAGER": "true", # Better memory management
"VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true", # Optimize for single GPU
"VLLM_BLOCK_SIZE": "16", # Optimize KV cache block size
"VLLM_SWAP_SPACE": "4", # 4GB swap space for memory overflow
"VLLM_CPU_OFFLOAD_GBN": "1" # CPU offload for gradient computation
})
elif model_size == "32b":
base_vars.update({
"MODEL_NAME": "qwen3-32b-v1.0", # New 32B model
"MAX_CONTEXT_LENGTH": "32768", # Qwen 3 32B supports 32K context
"BATCH_SIZE": "1", # Conservative batch size for 32B
"GPU_MEMORY_FRACTION": "0.9", # Use 90% of GPU memory
"VLLM_GPU_MEMORY_UTILIZATION": "0.9",
"VLLM_MAX_MODEL_LEN": "32768",
"VLLM_DTYPE": "bfloat16", # BF16 precision for 32B
"VLLM_ENFORCE_EAGER": "true",
"VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
"VLLM_BLOCK_SIZE": "16",
"VLLM_SWAP_SPACE": "2", # 2GB swap space
"VLLM_CPU_OFFLOAD_GBN": "1"
})
elif model_size == "12b":
base_vars.update({
"MODEL_NAME": "gemma3-12b-v1.0", # Use latest v1.0 model
"MAX_CONTEXT_LENGTH": "8192", # Gemma 3 12B supports 8K context
"BATCH_SIZE": "2",
"GPU_MEMORY_FRACTION": "0.85",
"VLLM_GPU_MEMORY_UTILIZATION": "0.85",
"VLLM_MAX_MODEL_LEN": "8192"
})
else: # 8B and smaller
base_vars.update({
"MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'), # Default to v1.0
"MAX_CONTEXT_LENGTH": "32768", # Default 32K (Llama 3.1 8B can use 128K)
"BATCH_SIZE": "4",
"GPU_MEMORY_FRACTION": "0.8",
"VLLM_GPU_MEMORY_UTILIZATION": "0.8",
"VLLM_MAX_MODEL_LEN": "32768"
})
return base_vars
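    @staticmethod
    def build_vllm_command(env_vars: Dict[str, str]) -> list:
        """Hedged sketch: how a container entrypoint could turn the VLLM_* variables into a vLLM server command.

        Not part of the original deployment flow. The flags below are standard vLLM
        engine arguments; assumes MODEL_NAME resolves to a model repo the serving
        image can pull.
        """
        cmd = [
            "vllm", "serve", env_vars["MODEL_NAME"],
            "--port", env_vars.get("APP_PORT", "7860"),
            "--dtype", env_vars.get("VLLM_DTYPE", "bfloat16"),
            "--max-model-len", env_vars.get("VLLM_MAX_MODEL_LEN", "32768"),
            "--gpu-memory-utilization", env_vars.get("VLLM_GPU_MEMORY_UTILIZATION", "0.9"),
        ]
        if env_vars.get("VLLM_ENFORCE_EAGER") == "true":
            cmd.append("--enforce-eager")  # skip CUDA graph capture to save memory
        if env_vars.get("VLLM_DISABLE_CUSTOM_ALL_REDUCE") == "true":
            cmd.append("--disable-custom-all-reduce")
        if "VLLM_SWAP_SPACE" in env_vars:
            cmd += ["--swap-space", env_vars["VLLM_SWAP_SPACE"]]  # GB of CPU swap space for the KV cache
        if "VLLM_CPU_OFFLOAD_GB" in env_vars:
            cmd += ["--cpu-offload-gb", env_vars["VLLM_CPU_OFFLOAD_GB"]]  # offload weights to CPU RAM
        return cmd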
def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
"""Create a container namespace for the LinguaCustodia API."""
try:
namespace = self.container_api.create_namespace(
project_id=self.project_id,
name=name,
description="LinguaCustodia Financial AI API Container Namespace",
environment_variables=self._get_environment_variables()
)
logger.info(f"Created container namespace: {namespace.id}")
return {
"namespace_id": namespace.id,
"name": namespace.name,
"status": "created"
}
except Exception as e:
logger.error(f"Failed to create container namespace: {e}")
raise
def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
"""Deploy the LinguaCustodia API as a container with optimized resources for model size."""
try:
env_vars = self._get_environment_variables(model_size)
env_vars["PYTHONPATH"] = "/app"
# Configure resources based on model size
if model_size == "70b":
memory_limit = 65536 # 64GB for 70B models
cpu_limit = 16000 # 16 vCPUs for 70B models
timeout = "1800s" # 30 minutes for model loading
max_scale = 1 # Single instance for 70B (resource intensive)
elif model_size == "12b":
memory_limit = 32768 # 32GB for 12B models
cpu_limit = 8000 # 8 vCPUs for 12B models
timeout = "900s" # 15 minutes for model loading
max_scale = 2 # Limited scaling for 12B
else: # 8B and smaller
memory_limit = 16384 # 16GB for 8B models
cpu_limit = 4000 # 4 vCPUs for 8B models
timeout = "600s" # 10 minutes for model loading
max_scale = 3 # Normal scaling for smaller models
container = self.container_api.create_container(
namespace_id=namespace_id,
name=image_name,
description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
environment_variables=env_vars,
min_scale=1,
max_scale=max_scale,
memory_limit=memory_limit,
cpu_limit=cpu_limit,
timeout=timeout,
privacy="public",
http_option="enabled",
port=7860, # HuggingFace standard port
protocol="http1"
)
logger.info(f"Created container: {container.id}")
return {
"container_id": container.id,
"name": container.name,
"status": "created",
"endpoint": getattr(container, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create container: {e}")
raise
    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container sized for the target model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            env_vars["GPU_TYPE"] = gpu_type
# GPU-specific configuration for BF16 precision with Scaleway pricing
gpu_configs = {
"L40S": {
"memory_limit": 32768, # 32GB RAM
"cpu_limit": 8000, # 8 vCPUs
"gpu_memory": 48, # 48GB VRAM
"context_length": 32768, # Default 32K (Llama 3.1 8B can use 128K)
"max_model_size": "8B", # L40S can only handle up to 8B models
"bf16_support": True,
"hourly_price": "€1.50", # Estimated (not available in current pricing)
"monthly_price": "~€1,095"
},
"A100": {
"memory_limit": 131072, # 128GB RAM
"cpu_limit": 32000, # 32 vCPUs
"gpu_memory": 80, # 80GB VRAM
"context_length": 32768, # Default 32K (model-specific)
"max_model_size": "32B", # A100 can handle 32B models with full context
"bf16_support": True,
"hourly_price": "€2.20", # Estimated (not in current H100-focused pricing)
"monthly_price": "~€1,606"
},
"H100": {
"memory_limit": 131072, # 128GB RAM (240GB actual)
"cpu_limit": 24000, # 24 vCPUs (actual H100-1-80G specs)
"gpu_memory": 80, # 80GB VRAM
"context_length": 128000, # 128K context for Llama 3.1 70B
"max_model_size": "70B", # H100 can handle 70B models with BF16
"bf16_support": True,
"hourly_price": "€2.73",
"monthly_price": "~€1,993"
},
"H100_DUAL": {
"memory_limit": 262144, # 256GB RAM (480GB actual)
"cpu_limit": 48000, # 48 vCPUs (actual H100-2-80G specs)
"gpu_memory": 160, # 160GB VRAM (2x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # Dual H100 can handle 70B BF16 with full context
"bf16_support": True,
"hourly_price": "€5.46",
"monthly_price": "~€3,986"
},
"H100_SXM_DUAL": {
"memory_limit": 131072, # 128GB RAM (240GB actual)
"cpu_limit": 32000, # 32 vCPUs (actual H100-SXM-2-80G specs)
"gpu_memory": 160, # 160GB VRAM (2x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # SXM version with better interconnect
"bf16_support": True,
"hourly_price": "€6.018",
"monthly_price": "~€4,393"
},
"H100_SXM_QUAD": {
"memory_limit": 262144, # 256GB RAM (480GB actual)
"cpu_limit": 64000, # 64 vCPUs (actual H100-SXM-4-80G specs)
"gpu_memory": 320, # 320GB VRAM (4x80GB)
"context_length": 128000, # Full context for BF16 70B models
"max_model_size": "70B", # Quad H100 optimal for BF16 70B
"bf16_support": True,
"hourly_price": "€11.61",
"monthly_price": "~€8,475"
}
}
config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
container = self.container_api.create_container(
namespace_id=namespace_id,
name=image_name,
description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})",
environment_variables=env_vars,
min_scale=1,
max_scale=1, # Single instance for GPU workloads
memory_limit=config["memory_limit"],
cpu_limit=config["cpu_limit"],
timeout="1800s", # 30 minutes for model loading
privacy="public",
http_option="enabled",
port=7860,
protocol="http1"
)
logger.info(f"Created GPU container: {container.id} with {gpu_type}")
return {
"container_id": container.id,
"name": container.name,
"status": "created",
"gpu_type": gpu_type,
"gpu_memory": config["gpu_memory"],
"context_length": config["context_length"],
"endpoint": getattr(container, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create GPU container: {e}")
raise
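    @staticmethod
    def estimate_bf16_vram_gb(params_billion: float, overhead_fraction: float = 0.1) -> float:
        """Hedged sketch of the sizing rule behind gpu_configs: BF16 weights take ~2 bytes per parameter.

        The 10% overhead for KV cache and activations is an assumption, not a
        measurement. Example: 70B -> ~140GB of weights alone, which is why 70B BF16
        needs dual H100 (160GB) rather than a single 80GB card.
        """
        weights_gb = params_billion * 2  # 2 bytes per parameter in bfloat16
        return weights_gb * (1 + overhead_fraction)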
def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
"""Deploy the LinguaCustodia API as a serverless function."""
try:
function = self.function_api.create_function(
namespace_id=namespace_id,
name=function_name,
description="LinguaCustodia Financial AI API Serverless Function",
environment_variables=self._get_environment_variables(),
min_scale=0,
max_scale=5,
memory_limit=16384, # 16GB for 8B models (was 1GB - insufficient)
timeout="600s", # 10 minutes for model loading (Scaleway expects string with unit)
privacy="public",
http_option="enabled"
)
logger.info(f"Created function: {function.id}")
return {
"function_id": function.id,
"name": function.name,
"status": "created",
"endpoint": getattr(function, 'domain_name', None)
}
except Exception as e:
logger.error(f"Failed to create function: {e}")
raise
def list_deployments(self) -> Dict[str, Any]:
"""List all existing deployments."""
try:
namespaces = self.container_api.list_namespaces()
function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            all_containers = []
            for ns in namespaces.namespaces:
                try:
                    containers = self.container_api.list_containers(namespace_id=ns.id)
                    all_containers.extend(containers.containers)
                except Exception as e:
                    logger.warning(f"Could not list containers for namespace {ns.id}: {e}")
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "containers": [{"id": c.id, "name": c.name} for c in all_containers],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_containers": len(all_containers),
                "total_functions": len(all_functions)
            }
except Exception as e:
logger.error(f"Failed to list deployments: {e}")
raise
def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
"""Get the status of a specific deployment."""
try:
if deployment_type == "container":
container = self.container_api.get_container(deployment_id)
return {
"id": container.id,
"name": container.name,
"status": container.status,
"endpoint": getattr(container, 'domain_name', None),
"memory_limit": container.memory_limit,
"cpu_limit": container.cpu_limit
}
elif deployment_type == "function":
function = self.function_api.get_function(deployment_id)
return {
"id": function.id,
"name": function.name,
"status": function.status,
"endpoint": getattr(function, 'domain_name', None),
"memory_limit": function.memory_limit
}
else:
raise ValueError("deployment_type must be 'container' or 'function'")
except Exception as e:
logger.error(f"Failed to get deployment status: {e}")
raise
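    def wait_until_ready(self, deployment_id: str, deployment_type: str = "container",
                         poll_seconds: int = 15, timeout_seconds: int = 1800) -> Dict[str, Any]:
        """Hedged sketch built on get_deployment_status: poll until a deployment reports ready.

        Assumes the status field eventually reads "ready"; adjust to the actual
        Scaleway status enum if it differs.
        """
        import time
        deadline = time.monotonic() + timeout_seconds
        while time.monotonic() < deadline:
            status = self.get_deployment_status(deployment_id, deployment_type)
            if str(status["status"]).lower() == "ready":
                return status
            time.sleep(poll_seconds)
        raise TimeoutError(f"{deployment_type} {deployment_id} not ready after {timeout_seconds}s")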
def main():
"""Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
try:
deployment = ScalewayDeployment()
deployments = deployment.list_deployments()
logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
# Create namespace for v1.0 models deployment
namespace = deployment.create_container_namespace("lingua-custodia-v1.0")
logger.info(f"Namespace created: {namespace['namespace_id']}")
# Deploy 32B model on A100 (new model size)
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1.0-a100",
            "A100",
            model_size="32b"
        )
logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}")
logger.info(f"GPU Type: {a100_32b_container['gpu_type']}")
logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB")
logger.info(f"Context Length: {a100_32b_container['context_length']} tokens")
# Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context)
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1.0-h100-dual",
            "H100_DUAL",
            model_size="70b"
        )
logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}")
logger.info(f"GPU Type: {h100_dual_container['gpu_type']}")
logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB")
logger.info(f"Context Length: {h100_dual_container['context_length']} tokens")
# Deploy 8B v1.0 model on L40S (cost-effective option)
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1.0-l40s",
            "L40S",
            model_size="8b"
        )
logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}")
logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}")
logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB")
logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens")
logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!")
logger.info("🌍 Region: PARIS 2 (fr-par-2) - H100 availability")
logger.info("πŸ’° Current Scaleway Pricing (2024):")
logger.info(" - L40S: €1.50/hour (~€1,095/month) - 8B models")
logger.info(" - A100-80G: €2.20/hour (~€1,606/month) - 32B models")
logger.info(" - H100-1-80G: €2.73/hour (~€1,993/month) - 32B models")
logger.info(" - H100-2-80G: €5.46/hour (~€3,986/month) - 70B models")
logger.info(" - H100-SXM-2-80G: €6.018/hour (~€4,393/month) - 70B models")
logger.info(" - H100-SXM-4-80G: €11.61/hour (~€8,475/month) - 70B models")
logger.info("⚠️ v1.0 Model Requirements:")
logger.info(" - 8B models: 8GB VRAM (L40S)")
logger.info(" - 12B models: 12GB VRAM (A100)")
logger.info(" - 32B models: 32GB VRAM (A100/H100)")
logger.info(" - 70B models: 80GB VRAM (H100)")
logger.info("βœ… All v1.0 models support 128K context length")
logger.info("πŸ“Š Precision: BF16 (bfloat16) - no quantization needed")
logger.info("⚑ H100: 3x faster than A100 for transformer workloads")
except Exception as e:
logger.error(f"Deployment failed: {e}")
raise
if __name__ == "__main__":
main()