#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""
import os
import logging
from typing import Dict, Any

from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API

load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ScalewayDeployment:
    """Scaleway deployment manager for LinguaCustodia API."""

    def __init__(self):
        """Initialize Scaleway client with credentials from .env."""
        self.access_key = os.getenv('SCW_ACCESS_KEY')
        self.secret_key = os.getenv('SCW_SECRET_KEY')
        self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        # Note: fr-par is the region; fr-par-2 is a zone within it.
        self.region = os.getenv('SCW_REGION', 'fr-par')  # Paris region
        self.zone = os.getenv('SCW_ZONE', 'fr-par-2')  # PARIS 2 zone for H100 availability
        if not all([self.access_key, self.secret_key, self.project_id]):
            raise ValueError("Missing required Scaleway credentials in .env file")
        self.client = Client(
            access_key=self.access_key,
            secret_key=self.secret_key,
            default_project_id=self.project_id,
            default_region=self.region,
            default_zone=self.zone
        )
        self.container_api = ContainerV1Beta1API(self.client)
        self.function_api = FunctionV1Beta1API(self.client)
        logger.info(f"Scaleway client initialized for project: {self.project_id}")

    def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
        """Get common environment variables for deployments."""
        base_vars = {
            "HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
            "HF_TOKEN": os.getenv('HF_TOKEN', ''),
            "APP_PORT": "7860",  # HuggingFace standard port
            "LOG_LEVEL": "INFO",
            "HF_HOME": "/data/.huggingface"  # Persistent storage for model caching
        }
        # Configure model-specific variables
        if model_size == "70b":
            base_vars.update({
                "MODEL_NAME": "llama3.1-70b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "128000",  # 128K context for v1.0 70B
                "BATCH_SIZE": "1",  # Conservative batch size for 70B
                "GPU_MEMORY_FRACTION": "0.95",  # Use 95% of GPU memory for BF16
                "VLLM_GPU_MEMORY_UTILIZATION": "0.95",
                "VLLM_MAX_MODEL_LEN": "128000",  # 128K context for v1.0
                "VLLM_DTYPE": "bfloat16",  # BF16 precision
                "VLLM_ENFORCE_EAGER": "true",  # Better memory management
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",  # Optimize for single GPU
                "VLLM_BLOCK_SIZE": "16",  # Optimize KV cache block size
                "VLLM_SWAP_SPACE": "4",  # 4GB swap space for memory overflow
                "VLLM_CPU_OFFLOAD_GBN": "1"  # CPU weight offload in GB (inference-time offload, not gradients)
            })
        elif model_size == "32b":
            base_vars.update({
                "MODEL_NAME": "qwen3-32b-v1.0",  # New 32B model
                "MAX_CONTEXT_LENGTH": "32768",  # Qwen 3 32B supports 32K context natively
                "BATCH_SIZE": "1",  # Conservative batch size for 32B
                "GPU_MEMORY_FRACTION": "0.9",  # Use 90% of GPU memory
                "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
                "VLLM_MAX_MODEL_LEN": "32768",
                "VLLM_DTYPE": "bfloat16",  # BF16 precision for 32B
                "VLLM_ENFORCE_EAGER": "true",
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
                "VLLM_BLOCK_SIZE": "16",
                "VLLM_SWAP_SPACE": "2",  # 2GB swap space
                "VLLM_CPU_OFFLOAD_GBN": "1"
            })
        elif model_size == "12b":
            base_vars.update({
                "MODEL_NAME": "gemma3-12b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "8192",  # Conservative 8K context for the 12B deployment
                "BATCH_SIZE": "2",
                "GPU_MEMORY_FRACTION": "0.85",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
                "VLLM_MAX_MODEL_LEN": "8192"
            })
        else:  # 8B and smaller
            base_vars.update({
                "MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'),  # Default to v1.0
                "MAX_CONTEXT_LENGTH": "32768",  # Default 32K (Llama 3.1 8B can use 128K)
                "BATCH_SIZE": "4",
                "GPU_MEMORY_FRACTION": "0.8",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.8",
                "VLLM_MAX_MODEL_LEN": "32768"
            })
        return base_vars
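
    # For reference, selected values returned by the helper above
    # (taken directly from the branches it defines):
    #   _get_environment_variables("70b")["MODEL_NAME"]         -> "llama3.1-70b-v1.0"
    #   _get_environment_variables("70b")["VLLM_MAX_MODEL_LEN"] -> "128000"
    #   _get_environment_variables()["MODEL_NAME"]              -> "qwen3-8b-v1.0" (unless MODEL_NAME is set)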

    def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
        """Create a container namespace for the LinguaCustodia API."""
        try:
            namespace = self.container_api.create_namespace(
                project_id=self.project_id,
                name=name,
                description="LinguaCustodia Financial AI API Container Namespace",
                environment_variables=self._get_environment_variables()
            )
            logger.info(f"Created container namespace: {namespace.id}")
            return {
                "namespace_id": namespace.id,
                "name": namespace.name,
                "status": "created"
            }
        except Exception as e:
            logger.error(f"Failed to create container namespace: {e}")
            raise

    def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a container with resources sized to the model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            # Configure resources based on model size
            if model_size == "70b":
                memory_limit = 65536  # 64GB for 70B models
                cpu_limit = 16000  # 16 vCPUs for 70B models
                timeout = "1800s"  # 30 minutes for model loading
                max_scale = 1  # Single instance for 70B (resource intensive)
            elif model_size == "12b":
                memory_limit = 32768  # 32GB for 12B models
                cpu_limit = 8000  # 8 vCPUs for 12B models
                timeout = "900s"  # 15 minutes for model loading
                max_scale = 2  # Limited scaling for 12B
            else:  # 8B and smaller
                memory_limit = 16384  # 16GB for 8B models
                cpu_limit = 4000  # 4 vCPUs for 8B models
                timeout = "600s"  # 10 minutes for model loading
                max_scale = 3  # Normal scaling for smaller models
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=max_scale,
                memory_limit=memory_limit,
                cpu_limit=cpu_limit,
                timeout=timeout,
                privacy="public",
                http_option="enabled",
                port=7860,  # HuggingFace standard port
                protocol="http1"
            )
            logger.info(f"Created container: {container.id}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create container: {e}")
            raise
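
    # Example call (sketch; the namespace_id comes from create_container_namespace above):
    #   result = deployment.deploy_container(ns["namespace_id"], "lingua-custodia-api", model_size="8b")
    #   # -> {"container_id": "...", "name": "lingua-custodia-api", "status": "created", "endpoint": ...}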

    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container for 70B models."""
        try:
            env_vars = self._get_environment_variables("70b")
            env_vars["PYTHONPATH"] = "/app"
            env_vars["GPU_TYPE"] = gpu_type
            # GPU-specific configuration for BF16 precision with Scaleway pricing
            gpu_configs = {
                "L40S": {
                    "memory_limit": 32768,  # 32GB RAM
                    "cpu_limit": 8000,  # 8 vCPUs
                    "gpu_memory": 48,  # 48GB VRAM
                    "context_length": 32768,  # Default 32K (Llama 3.1 8B can use 128K)
                    "max_model_size": "8B",  # L40S can only handle up to 8B models
                    "bf16_support": True,
                    "hourly_price": "€1.50",  # Estimated (not available in current pricing)
                    "monthly_price": "~€1,095"
                },
                "A100": {
                    "memory_limit": 131072,  # 128GB RAM
                    "cpu_limit": 32000,  # 32 vCPUs
                    "gpu_memory": 80,  # 80GB VRAM
                    "context_length": 32768,  # Default 32K (model-specific)
                    "max_model_size": "32B",  # A100 can handle 32B models with full context
                    "bf16_support": True,
                    "hourly_price": "€2.20",  # Estimated (not in current H100-focused pricing)
                    "monthly_price": "~€1,606"
                },
                "H100": {
                    "memory_limit": 131072,  # 128GB RAM (240GB actual)
                    "cpu_limit": 24000,  # 24 vCPUs (actual H100-1-80G specs)
                    "gpu_memory": 80,  # 80GB VRAM
                    "context_length": 128000,  # 128K context for Llama 3.1 70B
                    "max_model_size": "70B",  # H100 can handle 70B models with BF16
                    "bf16_support": True,
                    "hourly_price": "€2.73",
                    "monthly_price": "~€1,993"
                },
                "H100_DUAL": {
                    "memory_limit": 262144,  # 256GB RAM (480GB actual)
                    "cpu_limit": 48000,  # 48 vCPUs (actual H100-2-80G specs)
                    "gpu_memory": 160,  # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # Dual H100 can handle 70B BF16 with full context
                    "bf16_support": True,
                    "hourly_price": "€5.46",
                    "monthly_price": "~€3,986"
                },
                "H100_SXM_DUAL": {
                    "memory_limit": 131072,  # 128GB RAM (240GB actual)
                    "cpu_limit": 32000,  # 32 vCPUs (actual H100-SXM-2-80G specs)
                    "gpu_memory": 160,  # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # SXM version with better interconnect
                    "bf16_support": True,
                    "hourly_price": "€6.018",
                    "monthly_price": "~€4,393"
                },
                "H100_SXM_QUAD": {
                    "memory_limit": 262144,  # 256GB RAM (480GB actual)
                    "cpu_limit": 64000,  # 64 vCPUs (actual H100-SXM-4-80G specs)
                    "gpu_memory": 320,  # 320GB VRAM (4x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",  # Quad H100 optimal for BF16 70B
                    "bf16_support": True,
                    "hourly_price": "€11.61",
                    "monthly_price": "~€8,475"
                }
            }
            config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
            env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
            env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=1,  # Single instance for GPU workloads
                memory_limit=config["memory_limit"],
                cpu_limit=config["cpu_limit"],
                timeout="1800s",  # 30 minutes for model loading
                privacy="public",
                http_option="enabled",
                port=7860,
                protocol="http1"
            )
            logger.info(f"Created GPU container: {container.id} with {gpu_type}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "gpu_type": gpu_type,
                "gpu_memory": config["gpu_memory"],
                "context_length": config["context_length"],
                "endpoint": getattr(container, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create GPU container: {e}")
            raise
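
    # Note: unknown gpu_type values fall back to the L40S profile via gpu_configs.get().
    # Example return shape for "H100_DUAL" (values from the table above):
    #   {"gpu_type": "H100_DUAL", "gpu_memory": 160, "context_length": 128000, ...}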

    def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a serverless function."""
        try:
            function = self.function_api.create_function(
                namespace_id=namespace_id,
                name=function_name,
                description="LinguaCustodia Financial AI API Serverless Function",
                environment_variables=self._get_environment_variables(),
                min_scale=0,
                max_scale=5,
                memory_limit=16384,  # 16GB for 8B models (was 1GB - insufficient)
                timeout="600s",  # 10 minutes for model loading (Scaleway expects a string with unit)
                privacy="public",
                http_option="enabled"
            )
            logger.info(f"Created function: {function.id}")
            return {
                "function_id": function.id,
                "name": function.name,
                "status": "created",
                "endpoint": getattr(function, 'domain_name', None)
            }
        except Exception as e:
            logger.error(f"Failed to create function: {e}")
            raise

    def list_deployments(self) -> Dict[str, Any]:
        """List all existing deployments."""
        try:
            namespaces = self.container_api.list_namespaces()
            function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_functions": len(all_functions)
            }
        except Exception as e:
            logger.error(f"Failed to list deployments: {e}")
            raise

    def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
        """Get the status of a specific deployment."""
        try:
            if deployment_type == "container":
                container = self.container_api.get_container(deployment_id)
                return {
                    "id": container.id,
                    "name": container.name,
                    "status": container.status,
                    "endpoint": getattr(container, 'domain_name', None),
                    "memory_limit": container.memory_limit,
                    "cpu_limit": container.cpu_limit
                }
            elif deployment_type == "function":
                function = self.function_api.get_function(deployment_id)
                return {
                    "id": function.id,
                    "name": function.name,
                    "status": function.status,
                    "endpoint": getattr(function, 'domain_name', None),
                    "memory_limit": function.memory_limit
                }
            else:
                raise ValueError("deployment_type must be 'container' or 'function'")
        except Exception as e:
            logger.error(f"Failed to get deployment status: {e}")
            raise
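

# A minimal polling helper (a sketch, not part of the original API): repeatedly calls
# get_deployment_status() until the container leaves its transitional state or a
# deadline passes. The "ready"/"error" status strings are assumptions; adjust them
# to match the status values the Scaleway SDK actually reports.
def wait_for_deployment(deployment: "ScalewayDeployment", deployment_id: str,
                        timeout_s: int = 1800, poll_s: int = 15) -> Dict[str, Any]:
    """Poll a container deployment until it reports a terminal status."""
    import time
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        status = deployment.get_deployment_status(deployment_id, "container")
        if str(status["status"]).lower() in ("ready", "error"):
            return status
        time.sleep(poll_s)  # Avoid hammering the API while the model loads
    raise TimeoutError(f"Deployment {deployment_id} not ready after {timeout_s}s")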


def main():
    """Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
    try:
        deployment = ScalewayDeployment()
        deployments = deployment.list_deployments()
        logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
        # Create namespace for v1.0 models deployment
        namespace = deployment.create_container_namespace("lingua-custodia-v1.0")
        logger.info(f"Namespace created: {namespace['namespace_id']}")
        # Deploy 32B model on A100 (new model size)
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1.0-a100",
            "A100"
        )
        logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}")
        logger.info(f"GPU Type: {a100_32b_container['gpu_type']}")
        logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {a100_32b_container['context_length']} tokens")
        # Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context)
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1.0-h100-dual",
            "H100_DUAL"
        )
        logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}")
        logger.info(f"GPU Type: {h100_dual_container['gpu_type']}")
        logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB")
        logger.info(f"Context Length: {h100_dual_container['context_length']} tokens")
        # Deploy 8B v1.0 model on L40S (cost-effective option)
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1.0-l40s",
            "L40S"
        )
        logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}")
        logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}")
        logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens")
        logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!")
        logger.info("Region: Paris (fr-par), zone fr-par-2 - H100 availability")
        logger.info("Current Scaleway pricing (2024):")
        logger.info("  - L40S: €1.50/hour (~€1,095/month) - 8B models (estimated)")
        logger.info("  - A100-80G: €2.20/hour (~€1,606/month) - 32B models (estimated)")
        logger.info("  - H100-1-80G: €2.73/hour (~€1,993/month) - 32B models")
        logger.info("  - H100-2-80G: €5.46/hour (~€3,986/month) - 70B models")
        logger.info("  - H100-SXM-2-80G: €6.018/hour (~€4,393/month) - 70B models")
        logger.info("  - H100-SXM-4-80G: €11.61/hour (~€8,475/month) - 70B models")
        logger.info("v1.0 model VRAM requirements (BF16 weights, ~2 bytes/param):")
        logger.info("  - 8B models: ~16GB VRAM (L40S)")
        logger.info("  - 12B models: ~24GB VRAM (A100)")
        logger.info("  - 32B models: ~64GB VRAM (A100/H100)")
        logger.info("  - 70B models: ~140GB VRAM (dual H100)")
        logger.info("Context length: up to 128K tokens for the 70B v1.0 model (see per-model config)")
        logger.info("Precision: BF16 (bfloat16) - no quantization needed")
        logger.info("H100: up to ~3x faster than A100 on transformer workloads")
    except Exception as e:
        logger.error(f"Deployment failed: {e}")
        raise


if __name__ == "__main__":
    main()
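
# Running this module directly executes the full demo deployment in main() above
# (requires the .env credentials sketched near __init__). A single-model alternative
# using only methods defined in this file:
#   d = ScalewayDeployment()
#   ns = d.create_container_namespace("lingua-custodia-test")
#   d.deploy_gpu_container(ns["namespace_id"], "lingua-custodia-8b-l40s", "L40S")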