""" |
|
|
Helion-V1-Embeddings Evaluation Script |
|
|
Evaluate embedding model quality on standard benchmarks |
|
|
""" |
|
|
|
|
|
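# Example invocation (illustrative; the script filename below is a placeholder
# for whatever this file is saved as):
#
#   python evaluate_embeddings.py \
#       --model DeepXR/Helion-V1-embeddings \
#       --output embeddings_eval_results.json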
import json
import logging

import numpy as np

from dataclasses import dataclass, asdict
from datetime import datetime, timezone

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass
class EvaluationMetrics:
    """Container for evaluation metrics."""

    sts_correlation: float = 0.0
    retrieval_accuracy: float = 0.0
    clustering_score: float = 0.0
    speed_sentences_per_sec: float = 0.0
    model_size_mb: float = 0.0

    def to_dict(self):
        return asdict(self)

class EmbeddingsEvaluator:
    """Evaluate an embeddings model on the built-in benchmark suite."""

    def __init__(self, model_name: str = "DeepXR/Helion-V1-embeddings"):
        # Local import: sentence-transformers is only required once an
        # evaluator is actually instantiated.
        from sentence_transformers import SentenceTransformer

        logger.info(f"Loading model: {model_name}")
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name
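    # Note: SentenceTransformer also accepts a `device` argument (e.g.
    # SentenceTransformer(model_name, device="cuda")). It is not used above,
    # but device choice directly affects the throughput reported by
    # evaluate_speed().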
    def evaluate_sts(self) -> float:
        """
        Evaluate on a small built-in set of STS-style sentence pairs.

        Returns:
            Spearman correlation between cosine similarities and the
            reference similarity scores
        """
        # Hand-written (sentence1, sentence2, reference similarity) pairs.
        test_pairs = [
            ("A man is playing a guitar", "A person is playing music", 0.7),
            ("A dog is running in a field", "A cat is sleeping", 0.2),
            ("The weather is nice today", "It's a beautiful day", 0.9),
            ("Programming in Python", "Coding with Python language", 0.95),
            ("Machine learning model", "Deep neural network", 0.6),
        ]

        from scipy.stats import spearmanr

        predicted_scores = []
        actual_scores = []

        for sent1, sent2, actual in test_pairs:
            emb1 = self.model.encode(sent1)
            emb2 = self.model.encode(sent2)

            # Cosine similarity between the two sentence embeddings.
            similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

            predicted_scores.append(similarity)
            actual_scores.append(actual)

        correlation, _ = spearmanr(predicted_scores, actual_scores)
        logger.info(f"STS Correlation: {correlation:.4f}")

        return float(correlation)
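    # The pair set above is only a smoke test. As a rough sketch (assuming the
    # optional `datasets` package is installed), the STS-B validation split
    # could be substituted for it:
    #
    #     from datasets import load_dataset
    #     stsb = load_dataset("glue", "stsb", split="validation")
    #     test_pairs = [(r["sentence1"], r["sentence2"], r["label"] / 5.0) for r in stsb]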
    def evaluate_retrieval(self) -> float:
        """
        Evaluate retrieval accuracy on a small built-in query/document set.

        Returns:
            Fraction of relevant documents ranked in the top positions
        """
        queries_and_docs = [
            {
                "query": "How to learn Python programming?",
                "relevant": ["Python tutorial for beginners", "Learn Python step by step"],
                "irrelevant": ["Java programming guide", "Database design tutorial"],
            },
            {
                "query": "Best restaurants in Paris",
                "relevant": ["Top dining spots in Paris", "Where to eat in Paris"],
                "irrelevant": ["London travel guide", "New York attractions"],
            },
            {
                "query": "Machine learning basics",
                "relevant": ["Introduction to ML", "ML fundamentals explained"],
                "irrelevant": ["Cooking recipes", "Gardening tips"],
            },
        ]

        correct = 0
        total = 0

        for item in queries_and_docs:
            query = item["query"]
            # Relevant documents come first, so indices < len(relevant) are relevant.
            all_docs = item["relevant"] + item["irrelevant"]

            query_emb = self.model.encode(query)
            doc_embs = self.model.encode(all_docs)

            # Cosine similarity between the query and each document.
            similarities = [
                np.dot(query_emb, doc_emb) / (np.linalg.norm(query_emb) * np.linalg.norm(doc_emb))
                for doc_emb in doc_embs
            ]

            # Take the top-k documents by similarity, where k is the number of
            # relevant documents for this query (precision@k with k = |relevant|).
            num_relevant = len(item["relevant"])
            top_indices = np.argsort(similarities)[-num_relevant:]

            correct += sum(1 for idx in top_indices if idx < num_relevant)
            total += num_relevant

        accuracy = correct / total
        logger.info(f"Retrieval Accuracy: {accuracy:.4f}")

        return accuracy
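    # The manual cosine-similarity loop above could also be written with the
    # helper bundled with sentence-transformers; a minimal sketch:
    #
    #     from sentence_transformers import util
    #     scores = util.cos_sim(query_emb, doc_embs)[0]   # tensor of shape (len(all_docs),)
    #     top_indices = scores.argsort(descending=True)[:num_relevant]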
    def evaluate_speed(self, num_sentences: int = 1000) -> float:
        """
        Measure encoding speed.

        Args:
            num_sentences: Number of sentences to encode

        Returns:
            Sentences per second
        """
        import time

        test_sentences = [
            f"This is test sentence number {i} for speed evaluation."
            for i in range(num_sentences)
        ]

        # Warm-up pass so one-off initialization costs are not timed.
        _ = self.model.encode(test_sentences[:10])

        start_time = time.perf_counter()
        _ = self.model.encode(test_sentences, batch_size=32)
        elapsed = time.perf_counter() - start_time

        speed = num_sentences / elapsed
        logger.info(f"Speed: {speed:.2f} sentences/sec")

        return speed
    def evaluate_clustering(self) -> float:
        """
        Evaluate clustering quality on a small built-in labeled document set.

        Returns:
            Silhouette score of the embeddings under the topic labels
        """
        documents = {
            "tech": [
                "Machine learning algorithms",
                "Python programming tutorial",
                "Data science basics",
            ],
            "food": [
                "Italian pasta recipes",
                "How to bake bread",
                "Cooking techniques",
            ],
            "travel": [
                "Best places to visit in Europe",
                "Travel tips for beginners",
                "Budget travel guide",
            ],
        }

        all_docs = []
        labels = []

        for category, docs in documents.items():
            all_docs.extend(docs)
            labels.extend([category] * len(docs))

        embeddings = self.model.encode(all_docs)

        from sklearn.metrics import silhouette_score
        from sklearn.preprocessing import LabelEncoder

        # Silhouette score measures how well the topic labels separate in
        # embedding space (range -1 to 1, higher is better).
        le = LabelEncoder()
        numeric_labels = le.fit_transform(labels)

        score = silhouette_score(embeddings, numeric_labels)
        logger.info(f"Clustering Score: {score:.4f}")

        return float(score)
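    # The silhouette score judges label separation without running a clusterer.
    # A complementary check (a sketch, assuming scikit-learn as already used for
    # silhouette_score) is to cluster first and compare against the labels:
    #
    #     from sklearn.cluster import KMeans
    #     from sklearn.metrics import v_measure_score
    #     predicted = KMeans(n_clusters=len(documents), n_init=10).fit_predict(embeddings)
    #     v_measure_score(numeric_labels, predicted)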
    def get_model_size(self) -> float:
        """
        Estimate model size in MB from the parameter count.

        Returns:
            Estimated model size in megabytes, assuming fp32 weights
        """
        num_params = sum(p.numel() for p in self.model.parameters())

        # 4 bytes per parameter (fp32).
        size_mb = (num_params * 4) / (1024 * 1024)

        logger.info(f"Model Size: {size_mb:.2f} MB")

        return size_mb
    def run_full_evaluation(self, output_file: str = "embeddings_eval_results.json") -> EvaluationMetrics:
        """
        Run the complete evaluation suite and save the results as JSON.

        Args:
            output_file: Output file for results

        Returns:
            EvaluationMetrics object
        """
        logger.info("=" * 60)
        logger.info("Starting Full Evaluation")
        logger.info("=" * 60)

        metrics = EvaluationMetrics()

        # Each stage is wrapped individually so one failure does not abort the rest.
        try:
            metrics.sts_correlation = self.evaluate_sts()
        except Exception as e:
            logger.error(f"STS evaluation failed: {e}")

        try:
            metrics.retrieval_accuracy = self.evaluate_retrieval()
        except Exception as e:
            logger.error(f"Retrieval evaluation failed: {e}")

        try:
            metrics.clustering_score = self.evaluate_clustering()
        except Exception as e:
            logger.error(f"Clustering evaluation failed: {e}")

        try:
            metrics.speed_sentences_per_sec = self.evaluate_speed()
        except Exception as e:
            logger.error(f"Speed evaluation failed: {e}")

        try:
            metrics.model_size_mb = self.get_model_size()
        except Exception as e:
            logger.error(f"Size calculation failed: {e}")

        results = {
            "model": self.model_name,
            "metrics": metrics.to_dict(),
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }

        with open(output_file, "w") as f:
            json.dump(results, f, indent=2)

        logger.info("=" * 60)
        logger.info("Evaluation Complete")
        logger.info("=" * 60)
        logger.info(f"Results saved to: {output_file}")

        return metrics

def main():
    """Main evaluation function."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Evaluate Helion-V1-Embeddings"
    )
    parser.add_argument(
        "--model",
        default="DeepXR/Helion-V1-embeddings",
        help="Model to evaluate"
    )
    parser.add_argument(
        "--output",
        default="embeddings_eval_results.json",
        help="Output file for results"
    )

    args = parser.parse_args()

    evaluator = EmbeddingsEvaluator(args.model)
    metrics = evaluator.run_full_evaluation(args.output)

    print("\n" + "=" * 60)
    print("EVALUATION RESULTS")
    print("=" * 60)
    print(f"STS Correlation:    {metrics.sts_correlation:.4f}")
    print(f"Retrieval Accuracy: {metrics.retrieval_accuracy:.4f}")
    print(f"Clustering Score:   {metrics.clustering_score:.4f}")
    print(f"Speed:              {metrics.speed_sentences_per_sec:.0f} sent/sec")
    print(f"Model Size:         {metrics.model_size_mb:.2f} MB")
    print("=" * 60)


if __name__ == "__main__":
    main()