#!/usr/bin/env python3
"""
Scaleway Deployment Configuration for LinguaCustodia Financial AI API
"""

import os
import logging
from typing import Dict, Any
from dotenv import load_dotenv
from scaleway import Client
from scaleway.container.v1beta1 import ContainerV1Beta1API
from scaleway.function.v1beta1 import FunctionV1Beta1API

load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ScalewayDeployment:
    """Scaleway deployment manager for LinguaCustodia API."""
    
    def __init__(self):
        """Initialize Scaleway client with credentials from .env."""
        self.access_key = os.getenv('SCW_ACCESS_KEY')
        self.secret_key = os.getenv('SCW_SECRET_KEY')
        self.project_id = os.getenv('SCW_DEFAULT_PROJECT_ID')
        self.region = os.getenv('SCW_REGION', 'fr-par')
        self.zone = os.getenv('SCW_ZONE', 'fr-par-2')  # PAR2 zone for H100 availability
        
        if not all([self.access_key, self.secret_key, self.project_id]):
            raise ValueError("Missing required Scaleway credentials in .env file")
        
        self.client = Client(
            access_key=self.access_key,
            secret_key=self.secret_key,
            default_project_id=self.project_id,
            default_region=self.region,
            default_zone=f"{self.region}-1"
        )
        
        self.container_api = ContainerV1Beta1API(self.client)
        self.function_api = FunctionV1Beta1API(self.client)
        
        logger.info(f"Scaleway client initialized for project: {self.project_id}")
    
    def _get_environment_variables(self, model_size: str = "8b") -> Dict[str, str]:
        """Get common environment variables for deployments."""
        base_vars = {
            "HF_TOKEN_LC": os.getenv('HF_TOKEN_LC', ''),
            "HF_TOKEN": os.getenv('HF_TOKEN', ''),
            "APP_PORT": "7860",  # HuggingFace standard port
            "LOG_LEVEL": "INFO",
            "HF_HOME": "/data/.huggingface"  # Persistent storage for model caching
        }
        
        # Configure model-specific variables
        if model_size == "70b":
            base_vars.update({
                "MODEL_NAME": "llama3.1-70b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "128000",  # 128K context for v1.0 70B
                "BATCH_SIZE": "1",  # Conservative batch size for 70B
                "GPU_MEMORY_FRACTION": "0.95",  # Use 95% of GPU memory for BF16
                "VLLM_GPU_MEMORY_UTILIZATION": "0.95",
                "VLLM_MAX_MODEL_LEN": "128000",  # 128K context for v1.0
                "VLLM_DTYPE": "bfloat16",  # BF16 precision
                "VLLM_ENFORCE_EAGER": "true",  # Better memory management
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",  # Optimize for single GPU
                "VLLM_BLOCK_SIZE": "16",  # Optimize KV cache block size
                "VLLM_SWAP_SPACE": "4",  # 4GB swap space for memory overflow
                "VLLM_CPU_OFFLOAD_GBN": "1"  # CPU offload for gradient computation
            })
        elif model_size == "32b":
            base_vars.update({
                "MODEL_NAME": "qwen3-32b-v1.0",  # New 32B model
                "MAX_CONTEXT_LENGTH": "32768",   # Qwen 3 32B supports 32K context
                "BATCH_SIZE": "1",  # Conservative batch size for 32B
                "GPU_MEMORY_FRACTION": "0.9",  # Use 90% of GPU memory
                "VLLM_GPU_MEMORY_UTILIZATION": "0.9",
                "VLLM_MAX_MODEL_LEN": "32768",
                "VLLM_DTYPE": "bfloat16",  # BF16 precision for 32B
                "VLLM_ENFORCE_EAGER": "true",
                "VLLM_DISABLE_CUSTOM_ALL_REDUCE": "true",
                "VLLM_BLOCK_SIZE": "16",
                "VLLM_SWAP_SPACE": "2",  # 2GB swap space
                "VLLM_CPU_OFFLOAD_GBN": "1"
            })
        elif model_size == "12b":
            base_vars.update({
                "MODEL_NAME": "gemma3-12b-v1.0",  # Use latest v1.0 model
                "MAX_CONTEXT_LENGTH": "8192",     # Gemma 3 12B supports 8K context
                "BATCH_SIZE": "2",
                "GPU_MEMORY_FRACTION": "0.85",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.85",
                "VLLM_MAX_MODEL_LEN": "8192"
            })
        else:  # 8B and smaller
            base_vars.update({
                "MODEL_NAME": os.getenv('MODEL_NAME', 'qwen3-8b-v1.0'),  # Default to v1.0
                "MAX_CONTEXT_LENGTH": "32768",   # Default 32K (Llama 3.1 8B can use 128K)
                "BATCH_SIZE": "4",
                "GPU_MEMORY_FRACTION": "0.8",
                "VLLM_GPU_MEMORY_UTILIZATION": "0.8",
                "VLLM_MAX_MODEL_LEN": "32768"
            })
        
        return base_vars
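
    # The VLLM_* entries above are application-level settings; the static
    # method below is a hedged sketch of how a serving entrypoint might
    # translate them into vLLM's OpenAI-compatible server flags. The flag
    # names are real vLLM CLI options, but this mapping is an assumption
    # about the runtime image, not something this script requires.
    @staticmethod
    def vllm_args_from_env(env_vars: Dict[str, str]) -> list:
        """Sketch: map VLLM_* environment variables to `vllm serve` arguments."""
        args = [
            env_vars["MODEL_NAME"],  # vllm serve takes the model positionally
            "--dtype", env_vars.get("VLLM_DTYPE", "auto"),
            "--max-model-len", env_vars.get("VLLM_MAX_MODEL_LEN", "32768"),
            "--gpu-memory-utilization", env_vars.get("VLLM_GPU_MEMORY_UTILIZATION", "0.9"),
            "--swap-space", env_vars.get("VLLM_SWAP_SPACE", "4"),
            "--block-size", env_vars.get("VLLM_BLOCK_SIZE", "16"),
        ]
        if env_vars.get("VLLM_ENFORCE_EAGER") == "true":
            args.append("--enforce-eager")
        return args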
    
    def create_container_namespace(self, name: str = "lingua-custodia") -> Dict[str, Any]:
        """Create a container namespace for the LinguaCustodia API."""
        try:
            namespace = self.container_api.create_namespace(
                project_id=self.project_id,
                name=name,
                description="LinguaCustodia Financial AI API Container Namespace",
                environment_variables=self._get_environment_variables()
            )
            
            logger.info(f"Created container namespace: {namespace.id}")
            return {
                "namespace_id": namespace.id,
                "name": namespace.name,
                "status": "created"
            }
            
        except Exception as e:
            logger.error(f"Failed to create container namespace: {e}")
            raise
    
    def deploy_container(self, namespace_id: str, image_name: str = "lingua-custodia-api", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a container with optimized resources for model size."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            
            # Configure resources based on model size
            if model_size == "70b":
                memory_limit = 65536  # 64GB for 70B models
                cpu_limit = 16000     # 16 vCPUs for 70B models
                timeout = "1800s"     # 30 minutes for model loading
                max_scale = 1         # Single instance for 70B (resource intensive)
            elif model_size == "12b":
                memory_limit = 32768  # 32GB for 12B models
                cpu_limit = 8000      # 8 vCPUs for 12B models
                timeout = "900s"      # 15 minutes for model loading
                max_scale = 2         # Limited scaling for 12B
            else:  # 8B and smaller
                memory_limit = 16384  # 16GB for 8B models
                cpu_limit = 4000      # 4 vCPUs for 8B models
                timeout = "600s"      # 10 minutes for model loading
                max_scale = 3         # Normal scaling for smaller models
            
            # NOTE: a production deployment must also pass registry_image=...
            # pointing at the image pushed to Scaleway's Container Registry;
            # it is not configured here.
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API ({model_size.upper()} Model)",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=max_scale,
                memory_limit=memory_limit,
                cpu_limit=cpu_limit,
                timeout=timeout,
                privacy="public",
                http_option="enabled",
                port=7860,  # HuggingFace standard port
                protocol="http1"
            )
            
            logger.info(f"Created container: {container.id}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "endpoint": getattr(container, 'domain_name', None)
            }
            
        except Exception as e:
            logger.error(f"Failed to create container: {e}")
            raise
    
    def deploy_gpu_container(self, namespace_id: str, image_name: str = "lingua-custodia-gpu", gpu_type: str = "L40S", model_size: str = "70b") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a GPU-enabled container sized for the given model."""
        try:
            env_vars = self._get_environment_variables(model_size)
            env_vars["PYTHONPATH"] = "/app"
            env_vars["GPU_TYPE"] = gpu_type
            
            # GPU-specific configuration for BF16 precision with Scaleway pricing
            gpu_configs = {
                "L40S": {
                    "memory_limit": 32768,  # 32GB RAM
                    "cpu_limit": 8000,      # 8 vCPUs
                    "gpu_memory": 48,       # 48GB VRAM
                    "context_length": 32768,  # Default 32K (Llama 3.1 8B can use 128K)
                    "max_model_size": "8B",   # L40S can only handle up to 8B models
                    "bf16_support": True,
                    "hourly_price": "€1.50",  # Estimated (not available in current pricing)
                    "monthly_price": "~€1,095"
                },
                "A100": {
                    "memory_limit": 131072, # 128GB RAM
                    "cpu_limit": 32000,     # 32 vCPUs
                    "gpu_memory": 80,       # 80GB VRAM
                    "context_length": 32768,  # Default 32K (model-specific)
                    "max_model_size": "32B",  # A100 can handle 32B models with full context
                    "bf16_support": True,
                    "hourly_price": "€2.20",  # Estimated (not in current H100-focused pricing)
                    "monthly_price": "~€1,606"
                },
                "H100": {
                    "memory_limit": 131072, # 128GB RAM (240GB actual)
                    "cpu_limit": 24000,     # 24 vCPUs (actual H100-1-80G specs)
                    "gpu_memory": 80,       # 80GB VRAM
                    "context_length": 128000,  # 128K context for Llama 3.1 70B
                    "max_model_size": "70B",  # H100 can handle 70B models with BF16
                    "bf16_support": True,
                    "hourly_price": "€2.73",
                    "monthly_price": "~€1,993"
                },
                "H100_DUAL": {
                    "memory_limit": 262144, # 256GB RAM (480GB actual)
                    "cpu_limit": 48000,     # 48 vCPUs (actual H100-2-80G specs)
                    "gpu_memory": 160,      # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",   # Dual H100 can handle 70B BF16 with full context
                    "bf16_support": True,
                    "hourly_price": "€5.46",
                    "monthly_price": "~€3,986"
                },
                "H100_SXM_DUAL": {
                    "memory_limit": 131072, # 128GB RAM (240GB actual)
                    "cpu_limit": 32000,     # 32 vCPUs (actual H100-SXM-2-80G specs)
                    "gpu_memory": 160,      # 160GB VRAM (2x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",   # SXM version with better interconnect
                    "bf16_support": True,
                    "hourly_price": "€6.018",
                    "monthly_price": "~€4,393"
                },
                "H100_SXM_QUAD": {
                    "memory_limit": 262144, # 256GB RAM (480GB actual)
                    "cpu_limit": 64000,     # 64 vCPUs (actual H100-SXM-4-80G specs)
                    "gpu_memory": 320,      # 320GB VRAM (4x80GB)
                    "context_length": 128000,  # Full context for BF16 70B models
                    "max_model_size": "70B",   # Quad H100 optimal for BF16 70B
                    "bf16_support": True,
                    "hourly_price": "€11.61",
                    "monthly_price": "~€8,475"
                }
            }
            
            config = gpu_configs.get(gpu_type, gpu_configs["L40S"])
            env_vars["GPU_MEMORY_GB"] = str(config["gpu_memory"])
            env_vars["MAX_CONTEXT_LENGTH"] = str(config["context_length"])
            
            container = self.container_api.create_container(
                namespace_id=namespace_id,
                name=image_name,
                description=f"LinguaCustodia Financial AI API (70B Model on {gpu_type})",
                environment_variables=env_vars,
                min_scale=1,
                max_scale=1,  # Single instance for GPU workloads
                memory_limit=config["memory_limit"],
                cpu_limit=config["cpu_limit"],
                timeout="1800s",  # 30 minutes for model loading
                privacy="public",
                http_option="enabled",
                port=7860,
                protocol="http1"
            )
            
            logger.info(f"Created GPU container: {container.id} with {gpu_type}")
            return {
                "container_id": container.id,
                "name": container.name,
                "status": "created",
                "gpu_type": gpu_type,
                "gpu_memory": config["gpu_memory"],
                "context_length": config["context_length"],
                "endpoint": getattr(container, 'domain_name', None)
            }
            
        except Exception as e:
            logger.error(f"Failed to create GPU container: {e}")
            raise
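
    # Back-of-the-envelope sizing behind the gpu_configs table above: BF16
    # stores 2 bytes per parameter, so weights alone cost ~2GB per billion
    # parameters. The 20% overhead for KV cache and activations is a rough
    # assumption to tune per workload, not a measured figure.
    @staticmethod
    def estimate_bf16_vram_gb(params_billion: float, overhead_fraction: float = 0.2) -> float:
        """Estimate the VRAM (GB) needed to serve a BF16 model of a given size."""
        weights_gb = params_billion * 2  # 2 bytes/param -> 2GB per 1B params
        # e.g. 8B -> ~19GB (fits L40S), 32B -> ~77GB (tight on one 80GB GPU),
        # 70B -> ~168GB (needs H100_DUAL or larger)
        return weights_gb * (1 + overhead_fraction)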
    
    def deploy_function(self, namespace_id: str, function_name: str = "lingua-custodia-api") -> Dict[str, Any]:
        """Deploy the LinguaCustodia API as a serverless function."""
        try:
            function = self.function_api.create_function(
                namespace_id=namespace_id,
                name=function_name,
                description="LinguaCustodia Financial AI API Serverless Function",
                environment_variables=self._get_environment_variables(),
                runtime="python311",  # required by the Functions API; match your handler's runtime
                handler="handler.handle",  # required entrypoint (module.function); adjust to your code
                min_scale=0,
                max_scale=5,
                memory_limit=16384,  # 16GB for 8B models (was 1GB - insufficient); note this may exceed Scaleway's serverless-function memory cap, in which case use containers instead
                timeout="600s",  # 10 minutes for model loading (Scaleway expects a string with a unit)
                privacy="public",
                http_option="enabled"
            )
            
            logger.info(f"Created function: {function.id}")
            return {
                "function_id": function.id,
                "name": function.name,
                "status": "created",
                "endpoint": getattr(function, 'domain_name', None)
            }
            
        except Exception as e:
            logger.error(f"Failed to create function: {e}")
            raise
    
    def list_deployments(self) -> Dict[str, Any]:
        """List all existing deployments."""
        try:
            namespaces = self.container_api.list_namespaces()
            function_namespaces = self.function_api.list_namespaces()
            all_functions = []
            
            for func_ns in function_namespaces.namespaces:
                try:
                    functions = self.function_api.list_functions(namespace_id=func_ns.id)
                    all_functions.extend(functions.functions)
                except Exception as e:
                    logger.warning(f"Could not list functions for namespace {func_ns.id}: {e}")
            
            return {
                "namespaces": [{"id": ns.id, "name": ns.name} for ns in namespaces.namespaces],
                "functions": [{"id": func.id, "name": func.name} for func in all_functions],
                "total_namespaces": len(namespaces.namespaces),
                "total_functions": len(all_functions)
            }
            
        except Exception as e:
            logger.error(f"Failed to list deployments: {e}")
            raise
    
    def get_deployment_status(self, deployment_id: str, deployment_type: str = "container") -> Dict[str, Any]:
        """Get the status of a specific deployment."""
        try:
            if deployment_type == "container":
                container = self.container_api.get_container(container_id=deployment_id)
                return {
                    "id": container.id,
                    "name": container.name,
                    "status": container.status,
                    "endpoint": getattr(container, 'domain_name', None),
                    "memory_limit": container.memory_limit,
                    "cpu_limit": container.cpu_limit
                }
            elif deployment_type == "function":
                function = self.function_api.get_function(function_id=deployment_id)
                return {
                    "id": function.id,
                    "name": function.name,
                    "status": function.status,
                    "endpoint": getattr(function, 'domain_name', None),
                    "memory_limit": function.memory_limit
                }
            else:
                raise ValueError("deployment_type must be 'container' or 'function'")
                
        except Exception as e:
            logger.error(f"Failed to get deployment status: {e}")
            raise
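

# Hedged post-deploy smoke test. The /health path is an assumption about the
# API image (it is not defined in this file); point it at whatever endpoint
# the container actually serves. Standard library only, so the module's
# imports stay unchanged.
def wait_for_health(endpoint: str, timeout_s: int = 600, interval_s: int = 15) -> bool:
    """Poll a deployed container's (hypothetical) /health route until it returns 200."""
    import time
    import urllib.error
    import urllib.request

    url = f"https://{endpoint}/health"  # endpoint comes from deploy_*()["endpoint"]
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=10) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, OSError):
            pass  # model may still be loading; keep polling
        time.sleep(interval_s)
    return False
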

def main():
    """Main function to demonstrate Scaleway deployment for LinguaCustodia v1.0 models."""
    try:
        deployment = ScalewayDeployment()
        
        deployments = deployment.list_deployments()
        logger.info(f"Found {deployments['total_namespaces']} namespaces and {deployments['total_functions']} functions")
        
        # Create namespace for v1.0 models deployment
        namespace = deployment.create_container_namespace("lingua-custodia-v1-0")  # dashes only: dots are not valid in Scaleway resource names
        logger.info(f"Namespace created: {namespace['namespace_id']}")
        
        # Deploy 32B model on A100 (new model size)
        a100_32b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-32b-v1-0-a100",
            "A100",
            model_size="32b"
        )
        logger.info(f"A100 32B Container created: {a100_32b_container['container_id']}")
        logger.info(f"GPU Type: {a100_32b_container['gpu_type']}")
        logger.info(f"GPU Memory: {a100_32b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {a100_32b_container['context_length']} tokens")
        
        # Deploy 70B v1.0 model on H100_DUAL (recommended for 128K context)
        h100_dual_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-70b-v1-0-h100-dual",
            "H100_DUAL",
            model_size="70b"
        )
        logger.info(f"H100 Dual 70B Container created: {h100_dual_container['container_id']}")
        logger.info(f"GPU Type: {h100_dual_container['gpu_type']}")
        logger.info(f"GPU Memory: {h100_dual_container['gpu_memory']}GB")
        logger.info(f"Context Length: {h100_dual_container['context_length']} tokens")
        
        # Deploy 8B v1.0 model on L40S (cost-effective option)
        l40s_8b_container = deployment.deploy_gpu_container(
            namespace['namespace_id'],
            "lingua-custodia-8b-v1-0-l40s",
            "L40S",
            model_size="8b"
        )
        logger.info(f"L40S 8B Container created: {l40s_8b_container['container_id']}")
        logger.info(f"GPU Type: {l40s_8b_container['gpu_type']}")
        logger.info(f"GPU Memory: {l40s_8b_container['gpu_memory']}GB")
        logger.info(f"Context Length: {l40s_8b_container['context_length']} tokens")
        
        logger.info("Scaleway LinguaCustodia v1.0 models deployment completed successfully!")
        logger.info("🌍 Region: PARIS 2 (fr-par-2) - H100 availability")
        logger.info("πŸ’° Current Scaleway Pricing (2024):")
        logger.info("   - L40S: €1.50/hour (~€1,095/month) - 8B models")
        logger.info("   - A100-80G: €2.20/hour (~€1,606/month) - 32B models")
        logger.info("   - H100-1-80G: €2.73/hour (~€1,993/month) - 32B models")
        logger.info("   - H100-2-80G: €5.46/hour (~€3,986/month) - 70B models")
        logger.info("   - H100-SXM-2-80G: €6.018/hour (~€4,393/month) - 70B models")
        logger.info("   - H100-SXM-4-80G: €11.61/hour (~€8,475/month) - 70B models")
        logger.info("⚠️  v1.0 Model Requirements:")
        logger.info("   - 8B models: 8GB VRAM (L40S)")
        logger.info("   - 12B models: 12GB VRAM (A100)")
        logger.info("   - 32B models: 32GB VRAM (A100/H100)")
        logger.info("   - 70B models: 80GB VRAM (H100)")
        logger.info("βœ… All v1.0 models support 128K context length")
        logger.info("πŸ“Š Precision: BF16 (bfloat16) - no quantization needed")
        logger.info("⚑ H100: 3x faster than A100 for transformer workloads")
        
    except Exception as e:
        logger.error(f"Deployment failed: {e}")
        raise

if __name__ == "__main__":
    main()