File size: 21,880 Bytes
3d8d9e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
"""
Helion-V1-Embeddings Training Data Generator
Generate sentence pairs for training embeddings model
Optimized for semantic similarity and retrieval tasks
"""

import json
import logging
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Root logging: INFO and above, every record stamped with time and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-wide logger shared by the generator class below.
logger = logging.getLogger(__name__)


class EmbeddingsDataGenerator:
    """Generate training data for embeddings model."""
    
    def __init__(self, output_dir: str = "./embeddings_training_data"):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
    
    def generate_paraphrase_pairs(self) -> List[Dict]:
        """
        Generate paraphrase pairs (high similarity).
        Score: 0.85-1.0
        """
        pairs = [
            # Technical questions
            {
                "sentence1": "How do I install Python on Windows?",
                "sentence2": "What's the process to set up Python on a Windows computer?",
                "score": 0.95
            },
            {
                "sentence1": "What is machine learning?",
                "sentence2": "Can you explain machine learning to me?",
                "score": 0.92
            },
            {
                "sentence1": "How to fix a bug in my code?",
                "sentence2": "What's the best way to debug my program?",
                "score": 0.88
            },
            {
                "sentence1": "Reset password instructions",
                "sentence2": "How do I reset my password?",
                "score": 0.93
            },
            {
                "sentence1": "Database connection error",
                "sentence2": "Can't connect to the database",
                "score": 0.90
            },
            
            # General knowledge
            {
                "sentence1": "What is the capital of France?",
                "sentence2": "Tell me the capital city of France",
                "score": 0.96
            },
            {
                "sentence1": "Best restaurants in New York",
                "sentence2": "Where to eat in New York City",
                "score": 0.89
            },
            {
                "sentence1": "Weather forecast for tomorrow",
                "sentence2": "What will the weather be like tomorrow?",
                "score": 0.91
            },
            {
                "sentence1": "How to learn a new language",
                "sentence2": "Tips for learning foreign languages",
                "score": 0.87
            },
            {
                "sentence1": "Symptoms of the flu",
                "sentence2": "What are flu symptoms?",
                "score": 0.94
            },
            
            # Product/service queries
            {
                "sentence1": "How to cancel my subscription",
                "sentence2": "Steps to unsubscribe from the service",
                "score": 0.90
            },
            {
                "sentence1": "Return policy for products",
                "sentence2": "How do I return an item?",
                "score": 0.86
            },
            {
                "sentence1": "Customer support contact",
                "sentence2": "How to reach customer service",
                "score": 0.92
            },
            {
                "sentence1": "Shipping tracking information",
                "sentence2": "Where is my order?",
                "score": 0.85
            },
            {
                "sentence1": "Update payment method",
                "sentence2": "Change my credit card information",
                "score": 0.88
            },
        ]
        
        logger.info(f"Generated {len(pairs)} paraphrase pairs")
        return pairs
    
    def generate_similar_pairs(self) -> List[Dict]:
        """
        Generate semantically similar pairs (medium-high similarity).
        Score: 0.60-0.85
        """
        pairs = [
            # Related concepts
            {
                "sentence1": "Machine learning algorithms",
                "sentence2": "Neural network architectures",
                "score": 0.75
            },
            {
                "sentence1": "Python programming language",
                "sentence2": "JavaScript coding tutorial",
                "score": 0.68
            },
            {
                "sentence1": "Data science career path",
                "sentence2": "Becoming a data analyst",
                "score": 0.72
            },
            {
                "sentence1": "Cloud computing services",
                "sentence2": "AWS infrastructure guide",
                "score": 0.70
            },
            {
                "sentence1": "Web development frameworks",
                "sentence2": "React and Vue.js comparison",
                "score": 0.74
            },
            
            # Related questions
            {
                "sentence1": "How to lose weight?",
                "sentence2": "Healthy eating habits",
                "score": 0.65
            },
            {
                "sentence1": "Best laptops for programming",
                "sentence2": "Computer hardware for developers",
                "score": 0.71
            },
            {
                "sentence1": "Learning guitar for beginners",
                "sentence2": "Music theory basics",
                "score": 0.62
            },
            {
                "sentence1": "Travel tips for Europe",
                "sentence2": "Budget travel guide",
                "score": 0.67
            },
            {
                "sentence1": "Home workout routines",
                "sentence2": "Fitness exercises without equipment",
                "score": 0.73
            },
            
            # Professional context
            {
                "sentence1": "Project management best practices",
                "sentence2": "Agile methodology guide",
                "score": 0.69
            },
            {
                "sentence1": "Resume writing tips",
                "sentence2": "Job interview preparation",
                "score": 0.64
            },
            {
                "sentence1": "Team collaboration tools",
                "sentence2": "Remote work software solutions",
                "score": 0.72
            },
            {
                "sentence1": "Time management techniques",
                "sentence2": "Productivity improvement strategies",
                "score": 0.76
            },
            {
                "sentence1": "Business communication skills",
                "sentence2": "Professional email etiquette",
                "score": 0.66
            },
        ]
        
        logger.info(f"Generated {len(pairs)} similar pairs")
        return pairs
    
    def generate_dissimilar_pairs(self) -> List[Dict]:
        """
        Generate unrelated pairs (low similarity).
        Score: 0.0-0.30
        """
        pairs = [
            # Completely unrelated
            {
                "sentence1": "How to bake chocolate cake",
                "sentence2": "Installing Linux operating system",
                "score": 0.05
            },
            {
                "sentence1": "Football match results",
                "sentence2": "Quantum physics equations",
                "score": 0.02
            },
            {
                "sentence1": "Dog training tips",
                "sentence2": "Stock market analysis",
                "score": 0.08
            },
            {
                "sentence1": "Car repair manual",
                "sentence2": "Ancient Roman history",
                "score": 0.04
            },
            {
                "sentence1": "Gardening for beginners",
                "sentence2": "Cryptocurrency trading strategies",
                "score": 0.06
            },
            
            # Different domains
            {
                "sentence1": "Piano lessons online",
                "sentence2": "Chemical engineering degree",
                "score": 0.10
            },
            {
                "sentence1": "Knitting patterns",
                "sentence2": "Cybersecurity threats",
                "score": 0.03
            },
            {
                "sentence1": "Mediterranean diet recipes",
                "sentence2": "Smartphone app development",
                "score": 0.07
            },
            {
                "sentence1": "Yoga poses for flexibility",
                "sentence2": "Legal contract templates",
                "score": 0.05
            },
            {
                "sentence1": "Movie reviews 2024",
                "sentence2": "Database optimization techniques",
                "score": 0.09
            },
            
            # Topic mismatch
            {
                "sentence1": "Wedding planning checklist",
                "sentence2": "Machine learning deployment",
                "score": 0.04
            },
            {
                "sentence1": "Child development stages",
                "sentence2": "Network security protocols",
                "score": 0.06
            },
            {
                "sentence1": "Photography lighting techniques",
                "sentence2": "Tax filing requirements",
                "score": 0.08
            },
            {
                "sentence1": "Fashion trends 2024",
                "sentence2": "Docker container orchestration",
                "score": 0.02
            },
            {
                "sentence1": "Scuba diving certification",
                "sentence2": "Financial portfolio management",
                "score": 0.07
            },
        ]
        
        logger.info(f"Generated {len(pairs)} dissimilar pairs")
        return pairs
    
    def generate_question_answer_pairs(self) -> List[Dict]:
        """
        Generate question-answer pairs for retrieval training.
        Score: 0.80-0.95
        """
        pairs = [
            {
                "sentence1": "What is Python?",
                "sentence2": "Python is a high-level programming language known for its simplicity and versatility.",
                "score": 0.88
            },
            {
                "sentence1": "How does HTTP work?",
                "sentence2": "HTTP is a protocol that enables communication between web browsers and servers.",
                "score": 0.85
            },
            {
                "sentence1": "What is artificial intelligence?",
                "sentence2": "AI is the simulation of human intelligence by machines and computer systems.",
                "score": 0.90
            },
            {
                "sentence1": "Define cloud computing",
                "sentence2": "Cloud computing delivers computing services over the internet including storage and processing.",
                "score": 0.87
            },
            {
                "sentence1": "What is a database?",
                "sentence2": "A database is an organized collection of structured information stored electronically.",
                "score": 0.89
            },
            {
                "sentence1": "Explain REST API",
                "sentence2": "REST API is an architectural style for building web services using HTTP requests.",
                "score": 0.84
            },
            {
                "sentence1": "What is version control?",
                "sentence2": "Version control is a system that tracks changes to files over time.",
                "score": 0.86
            },
            {
                "sentence1": "Define responsive design",
                "sentence2": "Responsive design ensures websites work well on all devices and screen sizes.",
                "score": 0.88
            },
            {
                "sentence1": "What is encryption?",
                "sentence2": "Encryption is the process of encoding information to prevent unauthorized access.",
                "score": 0.91
            },
            {
                "sentence1": "Explain agile methodology",
                "sentence2": "Agile is an iterative approach to project management focused on flexibility.",
                "score": 0.83
            },
        ]
        
        logger.info(f"Generated {len(pairs)} question-answer pairs")
        return pairs
    
    def generate_domain_specific_pairs(self) -> List[Dict]:
        """
        Generate domain-specific sentence pairs.
        Score: Various
        """
        pairs = [
            # Programming
            {
                "sentence1": "Python list comprehension",
                "sentence2": "Creating lists in Python efficiently",
                "score": 0.86
            },
            {
                "sentence1": "Git merge conflicts",
                "sentence2": "Resolving version control conflicts",
                "score": 0.84
            },
            {
                "sentence1": "React component lifecycle",
                "sentence2": "Understanding React hooks",
                "score": 0.72
            },
            
            # Healthcare
            {
                "sentence1": "Blood pressure medication",
                "sentence2": "Treating hypertension",
                "score": 0.78
            },
            {
                "sentence1": "Physical therapy exercises",
                "sentence2": "Rehabilitation program",
                "score": 0.80
            },
            
            # Finance
            {
                "sentence1": "Investment portfolio diversification",
                "sentence2": "Managing financial risk",
                "score": 0.75
            },
            {
                "sentence1": "Mortgage interest rates",
                "sentence2": "Home loan options",
                "score": 0.82
            },
            
            # Education
            {
                "sentence1": "Online course platforms",
                "sentence2": "E-learning systems",
                "score": 0.88
            },
            {
                "sentence1": "Study techniques for exams",
                "sentence2": "Test preparation strategies",
                "score": 0.85
            },
            
            # E-commerce
            {
                "sentence1": "Product recommendation system",
                "sentence2": "Personalized shopping suggestions",
                "score": 0.83
            },
            {
                "sentence1": "Shopping cart abandonment",
                "sentence2": "Incomplete purchase behavior",
                "score": 0.86
            },
        ]
        
        logger.info(f"Generated {len(pairs)} domain-specific pairs")
        return pairs
    
    def format_for_training(self, pairs: List[Dict]) -> List[Dict]:
        """
        Format sentence pairs for training.
        
        Args:
            pairs: List of sentence pair dictionaries
            
        Returns:
            Formatted training examples
        """
        formatted = []
        
        for pair in pairs:
            formatted.append({
                "sentence1": pair["sentence1"],
                "sentence2": pair["sentence2"],
                "score": pair["score"]
            })
        
        return formatted
    
    def create_contrastive_examples(self, pairs: List[Dict]) -> List[Dict]:
        """
        Create contrastive examples (anchor, positive, negative).
        
        Args:
            pairs: Sentence pairs with scores
            
        Returns:
            Triplet examples
        """
        contrastive = []
        
        high_sim = [p for p in pairs if p["score"] >= 0.80]
        low_sim = [p for p in pairs if p["score"] <= 0.30]
        
        for positive_pair in high_sim[:20]:  # Take first 20
            # Select random negative
            if low_sim:
                negative_pair = random.choice(low_sim)
                contrastive.append({
                    "anchor": positive_pair["sentence1"],
                    "positive": positive_pair["sentence2"],
                    "negative": negative_pair["sentence2"]
                })
        
        logger.info(f"Created {len(contrastive)} contrastive examples")
        return contrastive
    
    def save_data(self, data: List[Dict], filename: str, format: str = "json"):
        """Save training data to file."""
        filepath = self.output_dir / filename
        
        if format == "json":
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        elif format == "jsonl":
            with open(filepath, 'w', encoding='utf-8') as f:
                for item in data:
                    f.write(json.dumps(item, ensure_ascii=False) + '\n')
        
        logger.info(f"Saved {len(data)} examples to {filepath}")
    
    def generate_full_dataset(self, format: str = "json") -> str:
        """
        Generate complete embeddings training dataset.
        
        Args:
            format: Output format ('json' or 'jsonl')
            
        Returns:
            Output directory path
        """
        logger.info("Generating embeddings training dataset...")
        
        # Collect all pairs
        all_pairs = []
        
        paraphrase_pairs = self.generate_paraphrase_pairs()
        all_pairs.extend(paraphrase_pairs)
        
        similar_pairs = self.generate_similar_pairs()
        all_pairs.extend(similar_pairs)
        
        dissimilar_pairs = self.generate_dissimilar_pairs()
        all_pairs.extend(dissimilar_pairs)
        
        qa_pairs = self.generate_question_answer_pairs()
        all_pairs.extend(qa_pairs)
        
        domain_pairs = self.generate_domain_specific_pairs()
        all_pairs.extend(domain_pairs)
        
        # Shuffle
        random.shuffle(all_pairs)
        
        # Split train/validation
        split_idx = int(len(all_pairs) * 0.9)
        train_pairs = all_pairs[:split_idx]
        val_pairs = all_pairs[split_idx:]
        
        logger.info(f"Train: {len(train_pairs)} pairs")
        logger.info(f"Validation: {len(val_pairs)} pairs")
        
        # Format data
        train_data = self.format_for_training(train_pairs)
        val_data = self.format_for_training(val_pairs)
        
        # Save sentence pair format
        self.save_data(train_data, f"train_pairs.{format}", format)
        self.save_data(val_data, f"validation_pairs.{format}", format)
        
        # Create contrastive examples
        contrastive_data = self.create_contrastive_examples(all_pairs)
        self.save_data(contrastive_data, f"contrastive_triplets.{format}", format)
        
        # Generate statistics
        stats = {
            "total_pairs": len(all_pairs),
            "train_size": len(train_pairs),
            "validation_size": len(val_pairs),
            "contrastive_triplets": len(contrastive_data),
            "paraphrase_pairs": len(paraphrase_pairs),
            "similar_pairs": len(similar_pairs),
            "dissimilar_pairs": len(dissimilar_pairs),
            "qa_pairs": len(qa_pairs),
            "domain_pairs": len(domain_pairs),
            "score_distribution": {
                "high (0.8-1.0)": len([p for p in all_pairs if p["score"] >= 0.8]),
                "medium (0.5-0.8)": len([p for p in all_pairs if 0.5 <= p["score"] < 0.8]),
                "low (0.0-0.5)": len([p for p in all_pairs if p["score"] < 0.5])
            },
            "generated_at": datetime.now().isoformat(),
            "format": format
        }
        
        self.save_data(stats, "embeddings_dataset_stats.json", "json")
        
        logger.info("="*60)
        logger.info("βœ… Embeddings dataset generation complete!")
        logger.info(f"Total pairs: {len(all_pairs)}")
        logger.info(f"Output directory: {self.output_dir}")
        logger.info("="*60)
        
        return str(self.output_dir)


def main(argv: Optional[List[str]] = None):
    """
    Command-line entry point: generate the dataset and print a summary.

    Args:
        argv: Command-line arguments without the program name. None (the
            default) makes argparse read ``sys.argv``, so running the script
            works as before; passing a list lets other code drive it.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate training data for Helion-V1-Embeddings"
    )
    parser.add_argument(
        "--output-dir",
        default="./embeddings_training_data",
        help="Output directory for training data"
    )
    parser.add_argument(
        "--format",
        choices=["json", "jsonl"],
        default="json",
        help="Output format"
    )

    args = parser.parse_args(argv)

    # Generate dataset
    generator = EmbeddingsDataGenerator(output_dir=args.output_dir)
    output_path = generator.generate_full_dataset(format=args.format)

    # Summary for the user. The emoji and bullets below were stored as
    # mis-decoded UTF-8 (mojibake) and have been restored.
    print("\n" + "="*60)
    print("🎯 Embeddings Training Data Ready!")
    print("="*60)
    print(f"📁 Location: {output_path}")
    print(f"📊 Format: {args.format}")
    print("\n📄 Files created:")
    print(f"  • train_pairs.{args.format} - Training sentence pairs")
    print(f"  • validation_pairs.{args.format} - Validation pairs")
    print(f"  • contrastive_triplets.{args.format} - Triplet examples")
    print("  • embeddings_dataset_stats.json - Dataset statistics")
    print("\n💡 Training data includes:")
    print("  • Paraphrase pairs (high similarity)")
    print("  • Similar concept pairs (medium similarity)")
    print("  • Dissimilar pairs (low similarity)")
    print("  • Question-answer pairs")
    print("  • Domain-specific examples")
    print("\n🚀 Next step:")
    print(f"   python train_embeddings.py --data-file {output_path}/train_pairs.{args.format}")
    print("="*60)


if __name__ == "__main__":
    main()