"""
Helion-V1-Embeddings Training Data Generator
Generates sentence pairs for training an embeddings model,
optimized for semantic similarity and retrieval tasks.
"""
import json
import logging
import random
from typing import List, Dict
from pathlib import Path
from datetime import datetime
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class EmbeddingsDataGenerator:
"""Generate training data for embeddings model."""
def __init__(self, output_dir: str = "./embeddings_training_data"):
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
def generate_paraphrase_pairs(self) -> List[Dict]:
"""
Generate paraphrase pairs (high similarity).
Score: 0.85-1.0
"""
pairs = [
# Technical questions
{
"sentence1": "How do I install Python on Windows?",
"sentence2": "What's the process to set up Python on a Windows computer?",
"score": 0.95
},
{
"sentence1": "What is machine learning?",
"sentence2": "Can you explain machine learning to me?",
"score": 0.92
},
{
"sentence1": "How to fix a bug in my code?",
"sentence2": "What's the best way to debug my program?",
"score": 0.88
},
{
"sentence1": "Reset password instructions",
"sentence2": "How do I reset my password?",
"score": 0.93
},
{
"sentence1": "Database connection error",
"sentence2": "Can't connect to the database",
"score": 0.90
},
# General knowledge
{
"sentence1": "What is the capital of France?",
"sentence2": "Tell me the capital city of France",
"score": 0.96
},
{
"sentence1": "Best restaurants in New York",
"sentence2": "Where to eat in New York City",
"score": 0.89
},
{
"sentence1": "Weather forecast for tomorrow",
"sentence2": "What will the weather be like tomorrow?",
"score": 0.91
},
{
"sentence1": "How to learn a new language",
"sentence2": "Tips for learning foreign languages",
"score": 0.87
},
{
"sentence1": "Symptoms of the flu",
"sentence2": "What are flu symptoms?",
"score": 0.94
},
# Product/service queries
{
"sentence1": "How to cancel my subscription",
"sentence2": "Steps to unsubscribe from the service",
"score": 0.90
},
{
"sentence1": "Return policy for products",
"sentence2": "How do I return an item?",
"score": 0.86
},
{
"sentence1": "Customer support contact",
"sentence2": "How to reach customer service",
"score": 0.92
},
{
"sentence1": "Shipping tracking information",
"sentence2": "Where is my order?",
"score": 0.85
},
{
"sentence1": "Update payment method",
"sentence2": "Change my credit card information",
"score": 0.88
},
]
logger.info(f"Generated {len(pairs)} paraphrase pairs")
return pairs
def generate_similar_pairs(self) -> List[Dict]:
"""
Generate semantically similar pairs (medium-high similarity).
Score: 0.60-0.85
"""
pairs = [
# Related concepts
{
"sentence1": "Machine learning algorithms",
"sentence2": "Neural network architectures",
"score": 0.75
},
{
"sentence1": "Python programming language",
"sentence2": "JavaScript coding tutorial",
"score": 0.68
},
{
"sentence1": "Data science career path",
"sentence2": "Becoming a data analyst",
"score": 0.72
},
{
"sentence1": "Cloud computing services",
"sentence2": "AWS infrastructure guide",
"score": 0.70
},
{
"sentence1": "Web development frameworks",
"sentence2": "React and Vue.js comparison",
"score": 0.74
},
# Related questions
{
"sentence1": "How to lose weight?",
"sentence2": "Healthy eating habits",
"score": 0.65
},
{
"sentence1": "Best laptops for programming",
"sentence2": "Computer hardware for developers",
"score": 0.71
},
{
"sentence1": "Learning guitar for beginners",
"sentence2": "Music theory basics",
"score": 0.62
},
{
"sentence1": "Travel tips for Europe",
"sentence2": "Budget travel guide",
"score": 0.67
},
{
"sentence1": "Home workout routines",
"sentence2": "Fitness exercises without equipment",
"score": 0.73
},
# Professional context
{
"sentence1": "Project management best practices",
"sentence2": "Agile methodology guide",
"score": 0.69
},
{
"sentence1": "Resume writing tips",
"sentence2": "Job interview preparation",
"score": 0.64
},
{
"sentence1": "Team collaboration tools",
"sentence2": "Remote work software solutions",
"score": 0.72
},
{
"sentence1": "Time management techniques",
"sentence2": "Productivity improvement strategies",
"score": 0.76
},
{
"sentence1": "Business communication skills",
"sentence2": "Professional email etiquette",
"score": 0.66
},
]
logger.info(f"Generated {len(pairs)} similar pairs")
return pairs
def generate_dissimilar_pairs(self) -> List[Dict]:
"""
Generate unrelated pairs (low similarity).
Score: 0.0-0.30
"""
pairs = [
# Completely unrelated
{
"sentence1": "How to bake chocolate cake",
"sentence2": "Installing Linux operating system",
"score": 0.05
},
{
"sentence1": "Football match results",
"sentence2": "Quantum physics equations",
"score": 0.02
},
{
"sentence1": "Dog training tips",
"sentence2": "Stock market analysis",
"score": 0.08
},
{
"sentence1": "Car repair manual",
"sentence2": "Ancient Roman history",
"score": 0.04
},
{
"sentence1": "Gardening for beginners",
"sentence2": "Cryptocurrency trading strategies",
"score": 0.06
},
# Different domains
{
"sentence1": "Piano lessons online",
"sentence2": "Chemical engineering degree",
"score": 0.10
},
{
"sentence1": "Knitting patterns",
"sentence2": "Cybersecurity threats",
"score": 0.03
},
{
"sentence1": "Mediterranean diet recipes",
"sentence2": "Smartphone app development",
"score": 0.07
},
{
"sentence1": "Yoga poses for flexibility",
"sentence2": "Legal contract templates",
"score": 0.05
},
{
"sentence1": "Movie reviews 2024",
"sentence2": "Database optimization techniques",
"score": 0.09
},
# Topic mismatch
{
"sentence1": "Wedding planning checklist",
"sentence2": "Machine learning deployment",
"score": 0.04
},
{
"sentence1": "Child development stages",
"sentence2": "Network security protocols",
"score": 0.06
},
{
"sentence1": "Photography lighting techniques",
"sentence2": "Tax filing requirements",
"score": 0.08
},
{
"sentence1": "Fashion trends 2024",
"sentence2": "Docker container orchestration",
"score": 0.02
},
{
"sentence1": "Scuba diving certification",
"sentence2": "Financial portfolio management",
"score": 0.07
},
]
logger.info(f"Generated {len(pairs)} dissimilar pairs")
return pairs
def generate_question_answer_pairs(self) -> List[Dict]:
"""
Generate question-answer pairs for retrieval training.
Score: 0.80-0.95
"""
pairs = [
{
"sentence1": "What is Python?",
"sentence2": "Python is a high-level programming language known for its simplicity and versatility.",
"score": 0.88
},
{
"sentence1": "How does HTTP work?",
"sentence2": "HTTP is a protocol that enables communication between web browsers and servers.",
"score": 0.85
},
{
"sentence1": "What is artificial intelligence?",
"sentence2": "AI is the simulation of human intelligence by machines and computer systems.",
"score": 0.90
},
{
"sentence1": "Define cloud computing",
"sentence2": "Cloud computing delivers computing services over the internet including storage and processing.",
"score": 0.87
},
{
"sentence1": "What is a database?",
"sentence2": "A database is an organized collection of structured information stored electronically.",
"score": 0.89
},
{
"sentence1": "Explain REST API",
"sentence2": "REST API is an architectural style for building web services using HTTP requests.",
"score": 0.84
},
{
"sentence1": "What is version control?",
"sentence2": "Version control is a system that tracks changes to files over time.",
"score": 0.86
},
{
"sentence1": "Define responsive design",
"sentence2": "Responsive design ensures websites work well on all devices and screen sizes.",
"score": 0.88
},
{
"sentence1": "What is encryption?",
"sentence2": "Encryption is the process of encoding information to prevent unauthorized access.",
"score": 0.91
},
{
"sentence1": "Explain agile methodology",
"sentence2": "Agile is an iterative approach to project management focused on flexibility.",
"score": 0.83
},
]
logger.info(f"Generated {len(pairs)} question-answer pairs")
return pairs
def generate_domain_specific_pairs(self) -> List[Dict]:
"""
Generate domain-specific sentence pairs.
Score: Various
"""
pairs = [
# Programming
{
"sentence1": "Python list comprehension",
"sentence2": "Creating lists in Python efficiently",
"score": 0.86
},
{
"sentence1": "Git merge conflicts",
"sentence2": "Resolving version control conflicts",
"score": 0.84
},
{
"sentence1": "React component lifecycle",
"sentence2": "Understanding React hooks",
"score": 0.72
},
# Healthcare
{
"sentence1": "Blood pressure medication",
"sentence2": "Treating hypertension",
"score": 0.78
},
{
"sentence1": "Physical therapy exercises",
"sentence2": "Rehabilitation program",
"score": 0.80
},
# Finance
{
"sentence1": "Investment portfolio diversification",
"sentence2": "Managing financial risk",
"score": 0.75
},
{
"sentence1": "Mortgage interest rates",
"sentence2": "Home loan options",
"score": 0.82
},
# Education
{
"sentence1": "Online course platforms",
"sentence2": "E-learning systems",
"score": 0.88
},
{
"sentence1": "Study techniques for exams",
"sentence2": "Test preparation strategies",
"score": 0.85
},
# E-commerce
{
"sentence1": "Product recommendation system",
"sentence2": "Personalized shopping suggestions",
"score": 0.83
},
{
"sentence1": "Shopping cart abandonment",
"sentence2": "Incomplete purchase behavior",
"score": 0.86
},
]
logger.info(f"Generated {len(pairs)} domain-specific pairs")
return pairs
def format_for_training(self, pairs: List[Dict]) -> List[Dict]:
"""
Format sentence pairs for training.
Args:
pairs: List of sentence pair dictionaries
Returns:
Formatted training examples
"""
formatted = []
for pair in pairs:
formatted.append({
"sentence1": pair["sentence1"],
"sentence2": pair["sentence2"],
"score": pair["score"]
})
return formatted
def create_contrastive_examples(self, pairs: List[Dict]) -> List[Dict]:
"""
Create contrastive examples (anchor, positive, negative).
Args:
pairs: Sentence pairs with scores
Returns:
Triplet examples
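        Example (illustrative output shape, using pairs generated in this module):
            {"anchor": "What is Python?",
             "positive": "Python is a high-level programming language known for its simplicity and versatility.",
             "negative": "Knitting patterns"}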
"""
contrastive = []
high_sim = [p for p in pairs if p["score"] >= 0.80]
low_sim = [p for p in pairs if p["score"] <= 0.30]
for positive_pair in high_sim[:20]: # Take first 20
# Select random negative
if low_sim:
negative_pair = random.choice(low_sim)
contrastive.append({
"anchor": positive_pair["sentence1"],
"positive": positive_pair["sentence2"],
"negative": negative_pair["sentence2"]
})
logger.info(f"Created {len(contrastive)} contrastive examples")
return contrastive
def save_data(self, data: List[Dict], filename: str, format: str = "json"):
"""Save training data to file."""
filepath = self.output_dir / filename
if format == "json":
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
elif format == "jsonl":
with open(filepath, 'w', encoding='utf-8') as f:
for item in data:
f.write(json.dumps(item, ensure_ascii=False) + '\n')
logger.info(f"Saved {len(data)} examples to {filepath}")
def generate_full_dataset(self, format: str = "json") -> str:
"""
Generate complete embeddings training dataset.
Args:
format: Output format ('json' or 'jsonl')
Returns:
Output directory path
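        Example (illustrative):
            generator = EmbeddingsDataGenerator("./embeddings_training_data")
            output_dir = generator.generate_full_dataset(format="jsonl")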
"""
logger.info("Generating embeddings training dataset...")
# Collect all pairs
all_pairs = []
paraphrase_pairs = self.generate_paraphrase_pairs()
all_pairs.extend(paraphrase_pairs)
similar_pairs = self.generate_similar_pairs()
all_pairs.extend(similar_pairs)
dissimilar_pairs = self.generate_dissimilar_pairs()
all_pairs.extend(dissimilar_pairs)
qa_pairs = self.generate_question_answer_pairs()
all_pairs.extend(qa_pairs)
domain_pairs = self.generate_domain_specific_pairs()
all_pairs.extend(domain_pairs)
# Shuffle
random.shuffle(all_pairs)
# Split train/validation
split_idx = int(len(all_pairs) * 0.9)
train_pairs = all_pairs[:split_idx]
val_pairs = all_pairs[split_idx:]
logger.info(f"Train: {len(train_pairs)} pairs")
logger.info(f"Validation: {len(val_pairs)} pairs")
# Format data
train_data = self.format_for_training(train_pairs)
val_data = self.format_for_training(val_pairs)
# Save sentence pair format
self.save_data(train_data, f"train_pairs.{format}", format)
self.save_data(val_data, f"validation_pairs.{format}", format)
# Create contrastive examples
contrastive_data = self.create_contrastive_examples(all_pairs)
self.save_data(contrastive_data, f"contrastive_triplets.{format}", format)
# Generate statistics
stats = {
"total_pairs": len(all_pairs),
"train_size": len(train_pairs),
"validation_size": len(val_pairs),
"contrastive_triplets": len(contrastive_data),
"paraphrase_pairs": len(paraphrase_pairs),
"similar_pairs": len(similar_pairs),
"dissimilar_pairs": len(dissimilar_pairs),
"qa_pairs": len(qa_pairs),
"domain_pairs": len(domain_pairs),
"score_distribution": {
"high (0.8-1.0)": len([p for p in all_pairs if p["score"] >= 0.8]),
"medium (0.5-0.8)": len([p for p in all_pairs if 0.5 <= p["score"] < 0.8]),
"low (0.0-0.5)": len([p for p in all_pairs if p["score"] < 0.5])
},
"generated_at": datetime.now().isoformat(),
"format": format
}
self.save_data(stats, "embeddings_dataset_stats.json", "json")
logger.info("="*60)
logger.info("β
Embeddings dataset generation complete!")
logger.info(f"Total pairs: {len(all_pairs)}")
logger.info(f"Output directory: {self.output_dir}")
logger.info("="*60)
return str(self.output_dir)
def main():
"""Main function for data generation."""
import argparse
parser = argparse.ArgumentParser(
description="Generate training data for Helion-V1-Embeddings"
)
parser.add_argument(
"--output-dir",
default="./embeddings_training_data",
help="Output directory for training data"
)
parser.add_argument(
"--format",
choices=["json", "jsonl"],
default="json",
help="Output format"
)
args = parser.parse_args()
# Generate dataset
generator = EmbeddingsDataGenerator(output_dir=args.output_dir)
output_path = generator.generate_full_dataset(format=args.format)
print("\n" + "="*60)
print("π― Embeddings Training Data Ready!")
print("="*60)
print(f"π Location: {output_path}")
print(f"π Format: {args.format}")
print("\nπ Files created:")
print(f" β’ train_pairs.{args.format} - Training sentence pairs")
print(f" β’ validation_pairs.{args.format} - Validation pairs")
print(f" β’ contrastive_triplets.{args.format} - Triplet examples")
print(" β’ embeddings_dataset_stats.json - Dataset statistics")
print("\nπ‘ Training data includes:")
print(" β’ Paraphrase pairs (high similarity)")
print(" β’ Similar concept pairs (medium similarity)")
print(" β’ Dissimilar pairs (low similarity)")
print(" β’ Question-answer pairs")
print(" β’ Domain-specific examples")
print("\nπ Next step:")
print(f" python train_embeddings.py --data-file {output_path}/train_pairs.{args.format}")
print("="*60)
if __name__ == "__main__":
    main()