Trouter-Library committed on
Commit
96e2cf8
·
verified ·
1 Parent(s): 64d1fc1

Create autotrain.py

Browse files
Files changed (1) hide show
  1. autotrain.py +712 -0
autotrain.py ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helion-V1 Auto Training Handler
3
+ Robust training script with comprehensive error handling for HuggingFace
4
+ Handles HTTP errors, upload issues, authentication, and training failures
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import time
10
+ import json
11
+ import logging
12
+ import traceback
13
+ from typing import Optional, Dict, List, Any
14
+ from dataclasses import dataclass
15
+ from pathlib import Path
16
+ import requests
17
+ from requests.adapters import HTTPAdapter
18
+ from urllib3.util.retry import Retry
19
+
20
# Configure root logging once, at import time: every record is written both to
# a local ``training.log`` file and to stdout.
logging.basicConfig(
    handlers=[
        logging.FileHandler('training.log'),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by every class and function in this file.
logger = logging.getLogger(__name__)
30
+
31
+
32
@dataclass
class TrainingConfig:
    """Settings consumed by the auto-training pipeline.

    Every field has a default, so ``TrainingConfig()`` is valid on its own.
    Field order matters for positional construction and must not change.
    """

    # --- Model / dataset / Hub identifiers ---
    model_name: str = "DeepXR/Helion-V1"
    base_model: str = "meta-llama/Llama-2-7b-hf"
    dataset_name: str = "your-dataset-name"
    output_dir: str = "./helion-v1-output"
    hub_model_id: str = "DeepXR/Helion-V1"
    # Falls back to the HF_TOKEN environment variable when left as None.
    hf_token: Optional[str] = None

    # --- Optimisation hyperparameters ---
    num_epochs: int = 3
    batch_size: int = 4
    gradient_accumulation: int = 8
    learning_rate: float = 2e-5
    warmup_steps: int = 100
    max_seq_length: int = 4096

    # --- LoRA adapter settings ---
    use_lora: bool = True
    lora_r: int = 64
    lora_alpha: int = 128
    lora_dropout: float = 0.05

    # --- Retry / upload behaviour ---
    max_retries: int = 5
    retry_delay: int = 60  # seconds
    upload_chunk_size: int = 5 * 1024 ** 2  # 5 MiB, in bytes
60
+
61
+
62
class HuggingFaceErrorHandler:
    """Classify HuggingFace HTTP and training errors.

    Both entry points are static: they log (or describe) the failure and tell
    the caller whether retrying is worthwhile.
    """

    # Human-readable explanation for each HTTP status code handled explicitly.
    ERROR_CODES = {
        400: "Bad Request - Check your input data format",
        401: "Unauthorized - Invalid or missing HuggingFace token",
        403: "Forbidden - Check repository permissions",
        404: "Not Found - Model or dataset doesn't exist",
        408: "Request Timeout - Server took too long to respond",
        413: "Payload Too Large - File size exceeds limits",
        422: "Unprocessable Entity - Validation error in request",
        429: "Rate Limited - Too many requests, will retry",
        500: "Internal Server Error - HuggingFace server issue",
        502: "Bad Gateway - Service temporarily unavailable",
        503: "Service Unavailable - Server overloaded",
        504: "Gateway Timeout - Request took too long"
    }

    # Status codes treated as transient, i.e. worth retrying.
    RECOVERABLE_CODES = frozenset({408, 429, 500, 502, 503, 504})

    @staticmethod
    def handle_http_error(error: Exception, context: str = "") -> bool:
        """
        Log an error and report whether the operation may be retried.

        Args:
            error: The exception that occurred. If it carries a non-None
                ``response`` attribute, that object's ``status_code`` decides
                recoverability.
            context: Additional context about what was being done; used as
                the prefix of the log message.

        Returns:
            True if the status code is in ``RECOVERABLE_CODES``; False for any
            other status code or when the error has no response attached.
        """
        response = getattr(error, "response", None)
        if response is None:
            logger.error(f"{context} - {type(error).__name__}: {str(error)}")
            return False

        status_code = response.status_code
        error_msg = HuggingFaceErrorHandler.ERROR_CODES.get(
            status_code,
            f"Unknown error (code {status_code})"
        )
        logger.error(f"{context} - HTTP {status_code}: {error_msg}")

        # The response body is only a debugging aid: failing to read it must
        # not change the outcome, but the failure is recorded, not swallowed.
        try:
            logger.debug(f"Response content: {response.text}")
        except Exception as read_error:
            logger.debug(f"Could not read response content: {read_error}")

        return status_code in HuggingFaceErrorHandler.RECOVERABLE_CODES

    @staticmethod
    def handle_training_error(error: Exception) -> Dict[str, Any]:
        """
        Describe a training failure and suggest a remedy.

        Args:
            error: The exception raised during preparation or training.

        Returns:
            A dict with keys ``error_type``, ``error_message``, ``traceback``,
            ``recoverable`` and ``suggestion``. Only out-of-memory errors are
            marked recoverable.
        """
        import re  # local import: this file's import block does not provide it

        error_info = {
            "error_type": type(error).__name__,
            "error_message": str(error),
            # Format the exception that was passed in rather than "whatever is
            # currently being handled", so this also works outside ``except``.
            "traceback": "".join(
                traceback.format_exception(type(error), error, error.__traceback__)
            ),
            "recoverable": False,
            "suggestion": ""
        }

        error_str = str(error).lower()

        # Match "oom" as a whole word only, so that messages containing
        # "room", "zoom", etc. are not misclassified as out-of-memory.
        if "out of memory" in error_str or re.search(r"\boom\b", error_str):
            error_info["recoverable"] = True
            error_info["suggestion"] = (
                "Reduce batch_size, enable gradient_checkpointing, "
                "or use smaller model/sequence length"
            )
        elif "cuda" in error_str:
            error_info["suggestion"] = "Check CUDA installation and GPU availability"
        elif "token" in error_str and "invalid" in error_str:
            error_info["suggestion"] = "Check HuggingFace token validity"
        elif "permission" in error_str:
            error_info["suggestion"] = "Verify repository write permissions"
        elif "dataset" in error_str:
            error_info["suggestion"] = "Check dataset name and format"
        elif "disk" in error_str or "space" in error_str:
            error_info["suggestion"] = "Free up disk space"

        return error_info
146
+
147
+
148
class RobustHFUploader:
    """Upload files to the HuggingFace Hub, retrying on transient failures."""

    def __init__(self, token: str, max_retries: int = 5):
        """
        Args:
            token: HuggingFace access token used for every upload.
            max_retries: Maximum number of upload attempts per file; also the
                retry budget of the HTTP session created here.
        """
        self.token = token
        self.max_retries = max_retries
        self.session = self._create_session()

    def _create_session(self) -> requests.Session:
        """Create a ``requests`` session that retries transient HTTP errors.

        NOTE(review): ``upload_file_chunked`` goes through ``HfApi`` and does
        not use this session; it is kept as a public attribute.
        """
        session = requests.Session()

        retry_strategy = Retry(
            total=self.max_retries,
            backoff_factor=2,
            status_forcelist=[408, 429, 500, 502, 503, 504],
            allowed_methods=["HEAD", "GET", "PUT", "POST", "PATCH"]
        )

        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        return session

    def upload_file_chunked(
        self,
        file_path: str,
        repo_id: str,
        path_in_repo: str,
        chunk_size: int = 5 * 1024 * 1024
    ) -> bool:
        """
        Upload one file to a Hub repository, retrying recoverable errors.

        Between attempts the method sleeps ``(2 ** attempt) * 30`` seconds.
        No sleep happens after the final attempt.

        Args:
            file_path: Local file path
            repo_id: HuggingFace repo ID
            path_in_repo: Path in repository
            chunk_size: Accepted for interface compatibility; the current
                implementation hands the whole file to ``HfApi.upload_file``
                and does not use this value.

        Returns:
            True if successful, False otherwise (including when the local
            file cannot be read or ``huggingface_hub`` is unavailable).
        """
        try:
            from huggingface_hub import HfApi

            api = HfApi(token=self.token)
            file_size = os.path.getsize(file_path)

            logger.info(f"Uploading {file_path} ({file_size / 1024 / 1024:.2f} MB)")

            for attempt in range(self.max_retries):
                try:
                    api.upload_file(
                        path_or_fileobj=file_path,
                        path_in_repo=path_in_repo,
                        repo_id=repo_id,
                        token=self.token
                    )
                except Exception as e:
                    recoverable = HuggingFaceErrorHandler.handle_http_error(
                        e,
                        f"Upload attempt {attempt + 1}/{self.max_retries}"
                    )
                    if not recoverable:
                        logger.error(f"Non-recoverable error: {e}")
                        return False

                    # Nothing follows the last attempt, so waiting here would
                    # only delay the failure report.
                    if attempt + 1 >= self.max_retries:
                        break

                    wait_time = (2 ** attempt) * 30
                    logger.warning(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)
                else:
                    logger.info(f"✅ Successfully uploaded {path_in_repo}")
                    return True

            logger.error(f"Failed to upload after {self.max_retries} attempts")
            return False

        except Exception as e:
            # Covers setup failures such as a missing local file or a missing
            # huggingface_hub installation.
            logger.error(f"Upload error: {e}")
            return False
229
+
230
+
231
class HelionAutoTrainer:
    """Drive the Helion-V1 fine-tuning pipeline end to end.

    Steps: verify prerequisites, load tokenizer/model/dataset, train with
    retries, upload the output directory to the Hub, and persist a JSON
    record of the run (``training_state.json`` in the output directory).
    """

    def __init__(self, config: "TrainingConfig"):
        """
        Args:
            config: Pipeline settings.

        Raises:
            ValueError: If neither ``config.hf_token`` nor the ``HF_TOKEN``
                environment variable provides a token.
        """
        self.config = config
        self.error_handler = HuggingFaceErrorHandler()

        # Get HuggingFace token (explicit config wins over the environment)
        self.hf_token = config.hf_token or os.getenv("HF_TOKEN")
        if not self.hf_token:
            raise ValueError(
                "HuggingFace token not found. Set HF_TOKEN environment variable "
                "or pass token in config"
            )

        self.uploader = RobustHFUploader(self.hf_token, config.max_retries)

        # Training state, serialised by save_training_state()
        self.training_state = {
            "status": "initialized",
            "current_epoch": 0,
            "total_steps": 0,
            "errors": [],
            "checkpoints": []
        }

    def verify_setup(self) -> bool:
        """Run every prerequisite check and log a pass/fail line for each.

        All checks are executed even if an earlier one fails, so the log shows
        the full picture.

        Returns:
            True only if every check passed.
        """
        logger.info("Verifying setup...")

        checks = {
            "HuggingFace Token": self._check_token(),
            "CUDA Available": self._check_cuda(),
            "Base Model Access": self._check_model_access(),
            "Dataset Access": self._check_dataset_access(),
            "Disk Space": self._check_disk_space(),
            "Repository Permissions": self._check_repo_permissions()
        }

        for check_name, result in checks.items():
            status = "✅" if result else "❌"
            logger.info(f"{status} {check_name}")

        return all(checks.values())

    def _check_token(self) -> bool:
        """Verify HuggingFace token is valid by calling ``whoami``."""
        try:
            from huggingface_hub import HfApi
            HfApi(token=self.hf_token).whoami()
            return True
        except Exception as e:
            logger.error(f"Token validation failed: {e}")
            return False

    def _check_cuda(self) -> bool:
        """Check CUDA availability and log the visible devices."""
        try:
            import torch
            available = torch.cuda.is_available()
            if available:
                device_count = torch.cuda.device_count()
                logger.info(f"CUDA devices: {device_count}")
                for i in range(device_count):
                    logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            return available
        except Exception as e:
            # e.g. torch not installed or a broken driver
            logger.error(f"CUDA check failed: {e}")
            return False

    def _check_model_access(self) -> bool:
        """Check if base model is accessible with the current token."""
        try:
            from huggingface_hub import HfApi
            HfApi(token=self.hf_token).model_info(self.config.base_model)
            return True
        except Exception as e:
            logger.error(f"Cannot access base model: {e}")
            return False

    def _check_dataset_access(self) -> bool:
        """Check if dataset is accessible with the current token."""
        try:
            from huggingface_hub import HfApi
            HfApi(token=self.hf_token).dataset_info(self.config.dataset_name)
            return True
        except Exception as e:
            logger.warning(f"Cannot access dataset: {e}")
            return False

    def _check_disk_space(self, required_gb: int = 50) -> bool:
        """Check that the filesystem holding ``output_dir`` has enough room.

        Args:
            required_gb: Minimum free space, in GiB.

        Returns:
            True if at least ``required_gb`` GiB are free, False otherwise or
            if the free space cannot be determined.
        """
        import shutil

        try:
            probe = Path(self.config.output_dir).resolve()
            # disk_usage() needs an existing path, and before the first run
            # the output directory usually does not exist yet. Measure the
            # nearest existing ancestor instead of failing the check.
            while not probe.exists() and probe != probe.parent:
                probe = probe.parent
            free_bytes = shutil.disk_usage(str(probe)).free
        except OSError as e:
            logger.error(f"Disk space check failed: {e}")
            return False

        available_gb = free_bytes / (1024 ** 3)
        logger.info(f"Available disk space: {available_gb:.2f} GB")
        return available_gb >= required_gb

    def _check_repo_permissions(self) -> bool:
        """Check if we can write to the repository.

        Side effect: creates the (public) repository if it does not exist.
        """
        try:
            from huggingface_hub import HfApi
            api = HfApi(token=self.hf_token)
            api.create_repo(
                self.config.hub_model_id,
                exist_ok=True,
                private=False
            )
            return True
        except Exception as e:
            logger.error(f"Repository permission check failed: {e}")
            return False

    def prepare_training(self):
        """Load tokenizer, model, and dataset, and tokenize the dataset.

        On success sets ``self.tokenizer``, ``self.model``, ``self.dataset``,
        ``self.tokenized_dataset`` and ``self.data_collator``.

        Returns:
            True on success; False if any step raised (the error is recorded
            in ``self.training_state["errors"]``).
        """
        logger.info("Preparing training environment...")

        try:
            # Heavy third-party imports are deferred until actually needed.
            import torch
            from transformers import (
                AutoTokenizer,
                AutoModelForCausalLM,
                DataCollatorForLanguageModeling
            )
            from datasets import load_dataset
            from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

            # Load tokenizer
            logger.info("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.config.base_model,
                token=self.hf_token
            )

            # Padding requires a pad token; reuse EOS when none is defined.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Load model, retrying because downloads can fail transiently.
            logger.info("Loading base model...")
            for attempt in range(self.config.max_retries):
                try:
                    # SECURITY NOTE: trust_remote_code=True executes Python
                    # shipped with the model repository; only use base models
                    # from sources you trust.
                    self.model = AutoModelForCausalLM.from_pretrained(
                        self.config.base_model,
                        torch_dtype=torch.bfloat16,
                        device_map="auto",
                        token=self.hf_token,
                        trust_remote_code=True
                    )
                    break
                except Exception as e:
                    if attempt < self.config.max_retries - 1:
                        logger.warning(f"Model load attempt {attempt + 1} failed: {e}")
                        time.sleep(self.config.retry_delay)
                    else:
                        raise

            # Apply LoRA if enabled
            if self.config.use_lora:
                logger.info("Applying LoRA configuration...")

                peft_config = LoraConfig(
                    r=self.config.lora_r,
                    lora_alpha=self.config.lora_alpha,
                    lora_dropout=self.config.lora_dropout,
                    bias="none",
                    task_type="CAUSAL_LM",
                    target_modules=[
                        "q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"
                    ]
                )

                # NOTE(review): the model above is loaded in bf16 without
                # quantization; confirm prepare_model_for_kbit_training is
                # intended for that case.
                self.model = prepare_model_for_kbit_training(self.model)
                self.model = get_peft_model(self.model, peft_config)
                self.model.print_trainable_parameters()

            # Load dataset
            logger.info("Loading dataset...")
            self.dataset = load_dataset(
                self.config.dataset_name,
                token=self.hf_token
            )

            # Tokenize the "text" column, padding every example to the
            # configured maximum length.
            def preprocess_function(examples):
                return self.tokenizer(
                    examples["text"],
                    truncation=True,
                    max_length=self.config.max_seq_length,
                    padding="max_length"
                )

            logger.info("Preprocessing dataset...")
            self.tokenized_dataset = self.dataset.map(
                preprocess_function,
                batched=True,
                remove_columns=self.dataset["train"].column_names
            )

            # Causal-LM collator (mlm=False)
            self.data_collator = DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer,
                mlm=False
            )

            logger.info("✅ Training preparation complete")
            return True

        except Exception as e:
            error_info = self.error_handler.handle_training_error(e)
            logger.error(f"Preparation failed: {error_info}")
            self.training_state["errors"].append(error_info)
            return False

    def train(self) -> bool:
        """Run training, retrying on failure, then save the final model.

        Out-of-memory ``RuntimeError``s halve the per-device batch size and
        double gradient accumulation before the next attempt. Other
        ``RuntimeError``s abort. Any other exception is retried after a
        growing delay. After a successful run the model is written to
        ``config.output_dir`` so ``upload_to_hub`` can find it.

        Returns:
            True if training and the final save succeeded, False otherwise
            (``training_state["status"]`` is then ``"failed"``).
        """
        logger.info("Starting training...")
        self.training_state["status"] = "training"

        try:
            from transformers import TrainingArguments, Trainer

            has_validation = "validation" in self.tokenized_dataset

            # Training arguments
            training_args = TrainingArguments(
                output_dir=self.config.output_dir,
                num_train_epochs=self.config.num_epochs,
                per_device_train_batch_size=self.config.batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation,
                learning_rate=self.config.learning_rate,
                warmup_steps=self.config.warmup_steps,
                logging_steps=10,
                save_steps=500,
                save_total_limit=3,
                fp16=False,
                bf16=True,
                gradient_checkpointing=True,
                optim="adamw_torch",
                report_to=["tensorboard"],
                push_to_hub=False,  # We'll handle upload manually
                hub_token=self.hf_token,
                # Picking a "best" checkpoint needs evaluation; without a
                # validation split evaluation is off, so this must be off too.
                load_best_model_at_end=has_validation,
                save_strategy="steps",
                evaluation_strategy="steps" if has_validation else "no",
                eval_steps=500 if has_validation else None
            )

            def build_trainer():
                # Called again after training_args is adjusted for OOM.
                return Trainer(
                    model=self.model,
                    args=training_args,
                    train_dataset=self.tokenized_dataset["train"],
                    eval_dataset=self.tokenized_dataset.get("validation"),
                    data_collator=self.data_collator,
                    tokenizer=self.tokenizer
                )

            trainer = build_trainer()
            trained = False

            # Train with error recovery
            for attempt in range(self.config.max_retries):
                try:
                    logger.info(f"Training attempt {attempt + 1}/{self.config.max_retries}")
                    trainer.train()
                    trained = True
                    break

                except RuntimeError as e:
                    error_info = self.error_handler.handle_training_error(e)
                    self.training_state["errors"].append(error_info)

                    if "out of memory" not in str(e).lower():
                        logger.error(f"Non-recoverable error: {error_info}")
                        break

                    logger.warning("OOM error - reducing batch size")
                    if training_args.per_device_train_batch_size < 2:
                        logger.error("Cannot reduce batch size further")
                        break

                    # Keep the effective batch size constant.
                    training_args.per_device_train_batch_size //= 2
                    training_args.gradient_accumulation_steps *= 2
                    trainer = build_trainer()

                except Exception as e:
                    error_info = self.error_handler.handle_training_error(e)
                    logger.error(f"Unexpected error: {error_info}")
                    self.training_state["errors"].append(error_info)

                    if attempt >= self.config.max_retries - 1:
                        break

                    wait_time = self.config.retry_delay * (attempt + 1)
                    logger.info(f"Retrying in {wait_time}s...")
                    time.sleep(wait_time)

            if not trained:
                self.training_state["status"] = "failed"
                return False

            # Trainer only writes checkpoint-N/ subdirectories during
            # training; save the final model at the top of output_dir, which
            # is where upload_to_hub() looks for files.
            try:
                trainer.save_model(self.config.output_dir)
            except Exception as e:
                error_info = self.error_handler.handle_training_error(e)
                logger.error(f"Saving final model failed: {error_info}")
                self.training_state["errors"].append(error_info)
                self.training_state["status"] = "failed"
                return False

            logger.info("✅ Training completed successfully")
            self.training_state["status"] = "completed"
            return True

        except Exception as e:
            error_info = self.error_handler.handle_training_error(e)
            logger.error(f"Training initialization failed: {error_info}")
            self.training_state["errors"].append(error_info)
            self.training_state["status"] = "failed"
            return False

    def upload_to_hub(self) -> bool:
        """Upload the files at the top level of ``output_dir`` to the Hub.

        Only ``*.json``, ``*.bin``, ``*.safetensors`` and ``*.txt`` files are
        uploaded; subdirectories are not traversed.

        Returns:
            True if at least one file was found and every file uploaded;
            False otherwise (status is then ``"upload_failed"``).
        """
        logger.info("Uploading model to HuggingFace Hub...")
        self.training_state["status"] = "uploading"

        try:
            from huggingface_hub import HfApi

            api = HfApi(token=self.hf_token)

            # Create repo if doesn't exist
            logger.info(f"Creating/updating repository: {self.config.hub_model_id}")
            api.create_repo(
                self.config.hub_model_id,
                exist_ok=True,
                private=False
            )

            output_path = Path(self.config.output_dir)
            patterns = ("*.json", "*.bin", "*.safetensors", "*.txt")
            files_to_upload = [
                found
                for pattern in patterns
                for found in output_path.glob(pattern)
            ]

            # An empty upload is not a success: nothing reached the Hub.
            if not files_to_upload:
                logger.error(f"No files to upload found in {output_path}")
                self.training_state["status"] = "upload_failed"
                return False

            upload_success = True
            for file_path in files_to_upload:
                logger.info(f"Uploading {file_path.name}...")

                success = self.uploader.upload_file_chunked(
                    str(file_path),
                    self.config.hub_model_id,
                    file_path.name
                )

                if not success:
                    logger.error(f"Failed to upload {file_path.name}")
                    upload_success = False

            if not upload_success:
                logger.error("Some files failed to upload")
                self.training_state["status"] = "upload_failed"
                return False

            logger.info("✅ Model uploaded successfully")
            self.training_state["status"] = "uploaded"
            return True

        except Exception as e:
            self.error_handler.handle_http_error(e, "Hub upload")
            self.training_state["status"] = "upload_failed"
            return False

    def save_training_state(self):
        """Write ``self.training_state`` to ``<output_dir>/training_state.json``.

        The output directory is created if necessary. Values that are not
        JSON-serialisable are written as their ``str()`` form.
        """
        state_file = Path(self.config.output_dir) / "training_state.json"
        state_file.parent.mkdir(parents=True, exist_ok=True)

        with open(state_file, 'w', encoding='utf-8') as f:
            json.dump(self.training_state, f, indent=2, default=str)

        logger.info(f"Training state saved to {state_file}")

    def _persist_state(self):
        """Save the training state without letting a save failure escape."""
        try:
            self.save_training_state()
        except Exception as e:
            logger.error(f"Could not save training state: {e}")

    def run_full_pipeline(self) -> bool:
        """Run verify -> prepare -> train -> upload and record the outcome.

        The training state is persisted on every exit path, including early
        failures and interruptions. A failed upload is logged as a warning
        but does not fail the pipeline, because the model is still on disk.

        Returns:
            True if verification, preparation and training succeeded.
        """
        logger.info("="*60)
        logger.info("Starting Helion-V1 Auto Training Pipeline")
        logger.info("="*60)

        try:
            # Steps 1-3 are mandatory; the first failure aborts the pipeline.
            mandatory_steps = (
                (self.verify_setup, "Setup verification failed"),
                (self.prepare_training, "Training preparation failed"),
                (self.train, "Training failed"),
            )
            for step, failure_message in mandatory_steps:
                if not step():
                    logger.error(failure_message)
                    self.training_state["status"] = "failed"
                    return False

            # Step 4: Upload to hub (best effort)
            if not self.upload_to_hub():
                logger.warning("Upload failed, but model is saved locally")

            logger.info("="*60)
            logger.info("✅ Training pipeline completed successfully!")
            logger.info("="*60)
            return True

        except KeyboardInterrupt:
            logger.warning("Training interrupted by user")
            self.training_state["status"] = "interrupted"
            return False

        except Exception as e:
            formatted_traceback = traceback.format_exc()
            logger.error(f"Pipeline failed: {e}")
            logger.error(formatted_traceback)
            self.training_state["status"] = "failed"
            self.training_state["errors"].append({
                "error": str(e),
                "traceback": formatted_traceback
            })
            return False

        finally:
            # Step 5: Save state, whatever happened above.
            self._persist_state()
670
+
671
+
672
def main(argv: Optional[List[str]] = None):
    """Main entry point for auto training.

    Parses command-line options, builds a ``TrainingConfig``, runs the full
    pipeline and terminates the process via ``sys.exit``.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Exit status:
        0 if the pipeline succeeded, 1 if it failed or could not be started
        (e.g. no HuggingFace token), 2 on invalid arguments (argparse).
    """
    import argparse

    parser = argparse.ArgumentParser(description="Helion-V1 Auto Trainer")
    parser.add_argument("--base-model", default="meta-llama/Llama-2-7b-hf")
    parser.add_argument("--dataset", required=True, help="Dataset name on HuggingFace")
    parser.add_argument("--output-dir", default="./helion-v1-output")
    parser.add_argument("--hub-model-id", default="DeepXR/Helion-V1")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch-size", type=int, default=4)
    parser.add_argument("--learning-rate", type=float, default=2e-5)
    parser.add_argument("--max-seq-length", type=int, default=4096)
    parser.add_argument("--no-lora", action="store_true", help="Disable LoRA")
    parser.add_argument("--token", help="HuggingFace token (or use HF_TOKEN env var)")

    args = parser.parse_args(argv)

    # Create config
    config = TrainingConfig(
        base_model=args.base_model,
        dataset_name=args.dataset,
        output_dir=args.output_dir,
        hub_model_id=args.hub_model_id,
        num_epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        max_seq_length=args.max_seq_length,
        use_lora=not args.no_lora,
        hf_token=args.token
    )

    # The trainer raises ValueError when no token is available; report that
    # as a normal startup failure instead of an uncaught traceback.
    try:
        trainer = HelionAutoTrainer(config)
    except ValueError as e:
        logger.error(f"Cannot start training: {e}")
        sys.exit(1)

    # Run training
    success = trainer.run_full_pipeline()

    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()