|
|
""" |
|
|
Helion-V1-Embeddings Training Script.

Train a lightweight embedding model for semantic similarity and retrieval.
|
|
""" |
|
|
|
|
|
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
# Root logging is configured once, at import time: INFO and above, with a
# timestamp and the level name in front of every message.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger shared by the trainer class and the CLI entry point.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class EmbeddingsTrainer:
    """Fine-tune and evaluate a sentence-embedding model for Helion-V1-Embeddings.

    The third-party dependencies (``sentence_transformers`` and ``torch``) are
    imported lazily inside the methods that need them, so the class can be
    constructed and its data/config helpers used without them.

    Attributes:
        base_model: Identifier passed to ``SentenceTransformer`` when training.
        output_path: Directory (created on construction) that receives the
            trained model and the configuration files.
    """

    def __init__(
        self,
        base_model: str = "sentence-transformers/all-MiniLM-L6-v2",
        output_path: str = "./helion-embeddings-output"
    ):
        """Store the settings and make sure the output directory exists.

        Args:
            base_model: Identifier passed to ``SentenceTransformer``.
            output_path: Directory for all artifacts; missing parent
                directories are created.
        """
        self.base_model = base_model
        self.output_path = Path(output_path)
        self.output_path.mkdir(parents=True, exist_ok=True)

    def prepare_training_data(self) -> List[Dict]:
        """Return the built-in scored sentence pairs.

        Returns:
            A new list of dicts with the keys ``sentence1``, ``sentence2`` and
            ``score`` (the target similarity of the two sentences).
        """
        training_examples = [
            # Near-paraphrases: high target similarity.
            {
                "sentence1": "How do I reset my password?",
                "sentence2": "What's the password reset process?",
                "score": 0.95
            },
            {
                "sentence1": "Machine learning training methods",
                "sentence2": "How to train ML models",
                "score": 0.90
            },
            {
                "sentence1": "Python programming tutorial",
                "sentence2": "Learn Python coding",
                "score": 0.88
            },
            # Related but not equivalent: medium target similarity.
            {
                "sentence1": "Install Python on Windows",
                "sentence2": "Python setup guide",
                "score": 0.70
            },
            {
                "sentence1": "Best restaurants in Paris",
                "sentence2": "Where to eat in France",
                "score": 0.65
            },
            # Unrelated topics: low target similarity.
            {
                "sentence1": "How to bake cookies",
                "sentence2": "Machine learning algorithms",
                "score": 0.10
            },
            {
                "sentence1": "Weather forecast tomorrow",
                "sentence2": "Stock market analysis",
                "score": 0.05
            },
        ]

        logger.info("Prepared %d training examples", len(training_examples))
        return training_examples

    def create_contrastive_pairs(self) -> List[Tuple[str, str]]:
        """Return the built-in ``(anchor, positive)`` pairs for contrastive learning.

        Returns:
            A new list of two-string tuples.
        """
        pairs = [
            ("What is machine learning?", "Machine learning explained simply"),
            ("How to learn Python?", "Python learning resources"),
            ("Best coding practices", "Software development best practices"),
            ("Data science tutorial", "Learn data science basics"),
            ("Natural language processing", "NLP fundamentals guide"),
            ("Deep learning introduction", "Getting started with deep learning"),
            ("Web development guide", "How to build websites"),
            ("Database design principles", "SQL database design tutorial"),
            ("Cloud computing basics", "Introduction to cloud services"),
            ("API development guide", "How to create REST APIs"),
        ]

        logger.info("Created %d contrastive pairs", len(pairs))
        return pairs

    def train_model(
        self,
        train_examples: Optional[List[Dict]] = None,
        epochs: int = 3,
        batch_size: int = 16,
        warmup_steps: int = 100
    ):
        """Fine-tune ``base_model`` on scored sentence pairs.

        Args:
            train_examples: Dicts with ``sentence1``, ``sentence2`` and
                ``score`` keys. ``None`` selects the built-in examples from
                ``prepare_training_data``.
            epochs: Number of training epochs.
            batch_size: Batch size of the training ``DataLoader``.
            warmup_steps: Warmup steps for the learning rate.

        Returns:
            The trained ``SentenceTransformer``, or ``None`` when there is
            nothing to train on, a dependency is missing, or training raised.
            Failures are logged rather than propagated.
        """
        if train_examples is None:
            train_examples = self.prepare_training_data()
        if not train_examples:
            # Fail fast with a clear message instead of an opaque error from
            # deep inside the data loader.
            logger.error("Training failed: no training examples provided")
            return None

        # Only the imports sit in this try, so an ImportError raised later
        # (e.g. while loading the model) is not misreported as a missing package.
        try:
            from sentence_transformers import InputExample, SentenceTransformer, losses
            from torch.utils.data import DataLoader
        except ImportError:
            logger.error("sentence-transformers not installed. Install with: pip install sentence-transformers")
            return None

        try:
            logger.info("Loading base model...")
            model = SentenceTransformer(self.base_model)

            train_data = [
                InputExample(
                    texts=[example["sentence1"], example["sentence2"]],
                    # JSON data may carry integer scores (0 or 1); the cosine
                    # similarity loss needs floating-point labels.
                    label=float(example["score"]),
                )
                for example in train_examples
            ]
            train_dataloader = DataLoader(
                train_data,
                shuffle=True,
                batch_size=batch_size,
            )
            train_loss = losses.CosineSimilarityLoss(model)

            logger.info("Starting training...")
            model.fit(
                train_objectives=[(train_dataloader, train_loss)],
                epochs=epochs,
                warmup_steps=warmup_steps,
                output_path=str(self.output_path),
                show_progress_bar=True,
                save_best_model=True,
            )
        except Exception as e:
            # Best-effort contract: report the failure (with traceback) and
            # signal it to the caller through the None return value.
            logger.exception("Training failed: %s", e)
            return None

        logger.info("✅ Training complete! Model saved to %s", self.output_path)
        return model

    def evaluate_model(
        self,
        model,
        test_pairs: Optional[List[Tuple[str, str, float]]] = None
    ) -> Optional[float]:
        """Compare the model's cosine similarities with expected values.

        Args:
            model: Object with an ``encode`` method (a trained
                ``SentenceTransformer``).
            test_pairs: ``(sentence1, sentence2, expected_similarity)`` tuples.
                ``None`` selects three built-in pairs.

        Returns:
            The mean absolute difference between the predicted and expected
            similarity, or ``None`` when ``test_pairs`` is empty.
        """
        if test_pairs is None:
            test_pairs = [
                ("How to code?", "Coding tutorial", 0.85),
                ("Weather today", "Stock prices", 0.1),
                ("Machine learning", "AI and ML", 0.95),
            ]
        if not test_pairs:
            # Avoids a ZeroDivisionError when averaging below.
            logger.warning("No test pairs provided; skipping evaluation")
            return None

        from sentence_transformers import util

        logger.info("Evaluating model...")

        total_error = 0.0
        for sent1, sent2, expected in test_pairs:
            emb1 = model.encode(sent1)
            emb2 = model.encode(sent2)
            # cos_sim returns a matrix; [0][0] is the single pairwise score.
            similarity = float(util.cos_sim(emb1, emb2)[0][0])
            error = abs(similarity - expected)
            total_error += error

            logger.info("'%s' <-> '%s'", sent1, sent2)
            logger.info(
                "  Expected: %.2f, Got: %.2f, Error: %.2f",
                expected, similarity, error,
            )

        avg_error = total_error / len(test_pairs)
        logger.info("Average error: %.3f", avg_error)

        return avg_error

    def create_config_files(self, overwrite: bool = False):
        """Write fallback sentence-transformers configuration files.

        Creates ``config_sentence_transformers.json`` and ``modules.json`` in
        ``output_path``. Their content is hard-coded here (including the
        version numbers), not derived from a trained model.

        NOTE(review): ``model.fit(..., output_path=...)`` appears to save its
        own copies of these files — confirm against the installed
        sentence-transformers version. Existing files are therefore kept
        unless ``overwrite`` is true.

        Args:
            overwrite: Replace files that already exist in ``output_path``.
        """
        config = {
            "__version__": {
                "sentence_transformers": "2.2.2",
                "transformers": "4.36.0",
                "pytorch": "2.0.0"
            },
            "prompts": {},
            "default_prompt_name": None,
            "similarity_fn_name": "cosine",
            "max_seq_length": 256,
            "do_lower_case": False
        }

        # Module pipeline description: transformer -> pooling -> normalize.
        modules = [
            {
                "idx": 0,
                "name": "0",
                "path": "",
                "type": "sentence_transformers.models.Transformer"
            },
            {
                "idx": 1,
                "name": "1",
                "path": "1_Pooling",
                "type": "sentence_transformers.models.Pooling"
            },
            {
                "idx": 2,
                "name": "2",
                "path": "2_Normalize",
                "type": "sentence_transformers.models.Normalize"
            },
        ]

        wrote_any = False
        for filename, payload in (
            ("config_sentence_transformers.json", config),
            ("modules.json", modules),
        ):
            if self._write_json(filename, payload, overwrite):
                wrote_any = True

        if wrote_any:
            logger.info("✅ Configuration files created")
        else:
            logger.info("Configuration files already present; nothing written")

    def _write_json(self, filename: str, payload, overwrite: bool) -> bool:
        """Dump *payload* as JSON to ``output_path / filename``.

        Returns:
            True when the file was written, False when an existing file was
            kept because *overwrite* is false.
        """
        target = self.output_path / filename
        if target.exists() and not overwrite:
            logger.info("Keeping existing %s", target)
            return False
        with target.open("w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        return True
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: train, evaluate and package the model.

    Parses the CLI options, optionally loads training examples from a JSON
    file, trains through ``EmbeddingsTrainer``, then evaluates the model,
    writes the configuration files and prints usage instructions.

    Raises:
        SystemExit: With status 1 when training does not produce a model
            (argparse also exits on invalid arguments).
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Train Helion-V1-Embeddings model"
    )
    parser.add_argument(
        "--base-model",
        default="sentence-transformers/all-MiniLM-L6-v2",
        help="Base model to fine-tune"
    )
    parser.add_argument(
        "--output",
        default="./helion-embeddings-output",
        help="Output directory"
    )
    parser.add_argument(
        "--epochs",
        type=int,
        default=3,
        help="Number of training epochs"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=16,
        help="Batch size"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        help="Path to training data JSON file"
    )

    args = parser.parse_args()

    trainer = EmbeddingsTrainer(
        base_model=args.base_model,
        output_path=args.output
    )

    # Without --data-file the trainer falls back to its built-in examples.
    train_examples = None
    if args.data_file:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(args.data_file, 'r', encoding='utf-8') as f:
            train_examples = json.load(f)
        logger.info(f"Loaded {len(train_examples)} examples from {args.data_file}")

    model = trainer.train_model(
        train_examples=train_examples,
        epochs=args.epochs,
        batch_size=args.batch_size
    )

    # train_model signals failure with None; test identity instead of the
    # object's truthiness, and report the failure through the exit status.
    if model is None:
        logger.error("Training did not produce a model; see the log output above")
        sys.exit(1)

    trainer.evaluate_model(model)
    trainer.create_config_files()

    print("\n" + "="*60)
    print("✅ Helion-V1-Embeddings Training Complete!")
    print("="*60)
    print(f"📁 Model saved to: {args.output}")
    print("\n💡 Test your model:")
    print("```python")
    print("from sentence_transformers import SentenceTransformer")
    print(f"model = SentenceTransformer('{args.output}')")
    print("embeddings = model.encode(['Hello world'])")
    print("```")
    print("="*60)
|
|
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()