"""
Helion-V1-Embeddings Inference Script

Simple interface for generating embeddings and running similarity search.
"""

import logging
from typing import List, Union

import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class HelionEmbeddings:
    """Simple interface for the Helion-V1-Embeddings model."""

    def __init__(self, model_name: str = "DeepXR/Helion-V1-embeddings"):
        """
        Initialize the embeddings model.

        Args:
            model_name: Model name or path
        """
        try:
            from sentence_transformers import SentenceTransformer

            logger.info(f"Loading model: {model_name}")
            self.model = SentenceTransformer(model_name)
            self.dimension = self.model.get_sentence_embedding_dimension()
            logger.info(f"Model loaded. Embedding dimension: {self.dimension}")
        except ImportError:
            logger.error(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
            raise
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            raise

    def encode(
        self,
        texts: Union[str, List[str]],
        batch_size: int = 32,
        show_progress: bool = False,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Generate embeddings for text(s).

        Args:
            texts: Single text or list of texts
            batch_size: Batch size for encoding
            show_progress: Show a progress bar
            normalize: L2-normalize embeddings

        Returns:
            Numpy array of embeddings
        """
        return self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            normalize_embeddings=normalize,
        )

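    # Note on shapes: SentenceTransformer.encode returns a 1-D array for a
    # single string and a 2-D (n_texts, dimension) array for a list, e.g.:
    #   embedder.encode("hello").shape       -> (dimension,)
    #   embedder.encode(["a", "b"]).shape    -> (2, dimension)
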
    def similarity(
        self,
        text1: Union[str, List[str]],
        text2: Union[str, List[str]],
    ) -> Union[float, np.ndarray]:
        """
        Calculate cosine similarity between texts.

        Args:
            text1: First text or list of texts
            text2: Second text or list of texts

        Returns:
            Similarity score(s)
        """
        from sentence_transformers import util

        emb1 = self.encode(text1)
        emb2 = self.encode(text2)

        similarity = util.cos_sim(emb1, emb2)

        if isinstance(text1, str) and isinstance(text2, str):
            return float(similarity[0][0])

        return similarity.numpy()

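    # For list inputs, util.cos_sim returns the full pairwise matrix, so the
    # result has shape (len(text1), len(text2)); entry [i][j] is the cosine
    # similarity between text1[i] and text2[j].
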
    def search(
        self,
        query: str,
        documents: List[str],
        top_k: int = 5,
    ) -> List[tuple]:
        """
        Search for the documents most similar to a query.

        Args:
            query: Search query
            documents: List of documents to search
            top_k: Number of top results to return

        Returns:
            List of (document, score, index) tuples, best match first
        """
        from sentence_transformers import util

        query_emb = self.encode(query)
        doc_embs = self.encode(documents)

        similarities = util.cos_sim(query_emb, doc_embs)[0]

        # Negate so argsort yields descending similarity order.
        top_results = np.argsort(-similarities.numpy())[:top_k]

        results = []
        for idx in top_results:
            results.append((
                documents[idx],
                float(similarities[idx]),
                int(idx),
            ))

        return results

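    # This is an exhaustive O(len(documents)) scan, which is fine for small
    # collections. For large corpora, an approximate-nearest-neighbor index
    # (e.g. FAISS or hnswlib) over precomputed embeddings would be the usual
    # replacement for the brute-force cos_sim call above.
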
    def cluster(
        self,
        texts: List[str],
        min_cluster_size: int = 2,
        threshold: float = 0.75,
    ) -> List[List[int]]:
        """
        Cluster texts by similarity using community detection.

        Note: community detection groups texts by a similarity threshold
        rather than targeting a fixed number of clusters.

        Args:
            texts: List of texts to cluster
            min_cluster_size: Minimum cluster size
            threshold: Cosine-similarity threshold for cluster membership

        Returns:
            List of clusters (each cluster is a list of indices into texts)
        """
        from sentence_transformers import util

        embeddings = self.encode(texts)

        clusters = util.community_detection(
            embeddings,
            min_community_size=min_cluster_size,
            threshold=threshold,
        )

        return clusters

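    # Mapping cluster indices back to the original texts (illustrative):
    #   clusters = embedder.cluster(texts)
    #   grouped = [[texts[i] for i in c] for c in clusters]
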
    def save_embeddings(
        self,
        texts: List[str],
        output_file: str,
        format: str = "npy",
    ):
        """
        Generate embeddings and save them to a file.

        Args:
            texts: Texts to embed
            output_file: Output file path
            format: Output format ('npy', 'npz', or 'json')
        """
        embeddings = self.encode(texts, show_progress=True)

        if format == "npy":
            np.save(output_file, embeddings)
        elif format == "npz":
            np.savez_compressed(output_file, embeddings=embeddings, texts=texts)
        elif format == "json":
            import json

            data = {
                "embeddings": embeddings.tolist(),
                "texts": texts,
                "dimension": self.dimension,
            }
            with open(output_file, "w") as f:
                json.dump(data, f)
        else:
            raise ValueError(f"Unsupported format: {format}")

        logger.info(f"Saved {len(texts)} embeddings to {output_file}")

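    # Loading saved embeddings back (illustrative; file names are examples):
    #   embeddings = np.load("out.npy")
    #   archive = np.load("out.npz")   # archive["embeddings"], archive["texts"]
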

def demo_usage():
    """Demonstrate usage examples."""

    print("=" * 60)
    print("Helion-V1-Embeddings Demo")
    print("=" * 60)

    embedder = HelionEmbeddings("DeepXR/Helion-V1-embeddings")

    print("\n1. Basic Encoding:")
    text = "Hello, how are you?"
    embedding = embedder.encode(text)
    print(f"Text: {text}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"First 5 values: {embedding[:5]}")

    print("\n2. Similarity Calculation:")
    text1 = "How do I reset my password?"
    text2 = "Password reset instructions"
    similarity = embedder.similarity(text1, text2)
    print(f"Text 1: {text1}")
    print(f"Text 2: {text2}")
    print(f"Similarity: {similarity:.4f}")

    print("\n3. Semantic Search:")
    query = "machine learning tutorial"
    documents = [
        "Learn machine learning basics",
        "Cooking recipes for beginners",
        "Introduction to neural networks",
        "Travel guide to Europe",
        "Python programming course",
    ]

    results = embedder.search(query, documents, top_k=3)
    print(f"Query: {query}")
    print("\nTop 3 Results:")
    for i, (doc, score, idx) in enumerate(results, 1):
        print(f"{i}. [{score:.4f}] {doc}")

    print("\n" + "=" * 60)


def main():
    """Main CLI interface."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Helion-V1-Embeddings Inference"
    )
    parser.add_argument(
        "--model",
        default="DeepXR/Helion-V1-embeddings",
        help="Model name or path",
    )

    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    encode_parser = subparsers.add_parser("encode", help="Encode text(s)")
    encode_parser.add_argument("text", nargs="+", help="Text(s) to encode")
    encode_parser.add_argument("--output", help="Save embeddings to file")

    sim_parser = subparsers.add_parser("similarity", help="Calculate similarity")
    sim_parser.add_argument("text1", help="First text")
    sim_parser.add_argument("text2", help="Second text")

    search_parser = subparsers.add_parser("search", help="Search documents")
    search_parser.add_argument("query", help="Search query")
    search_parser.add_argument("--documents", nargs="+", required=True)
    search_parser.add_argument("--top-k", type=int, default=5)

    subparsers.add_parser("demo", help="Run demo")

    args = parser.parse_args()

    if args.command is None:
        # No subcommand given: print help without loading the model.
        parser.print_help()
        return

    if args.command == "demo":
        demo_usage()
        return

    embedder = HelionEmbeddings(args.model)

    if args.command == "encode":
        embeddings = embedder.encode(args.text, show_progress=True)
        print(f"Generated {len(embeddings)} embeddings")
        print(f"Shape: {embeddings.shape}")

        if args.output:
            embedder.save_embeddings(args.text, args.output)

    elif args.command == "similarity":
        similarity = embedder.similarity(args.text1, args.text2)
        print(f"Text 1: {args.text1}")
        print(f"Text 2: {args.text2}")
        print(f"Similarity: {similarity:.4f}")

    elif args.command == "search":
        results = embedder.search(
            args.query,
            args.documents,
            top_k=args.top_k,
        )

        print(f"Query: {args.query}")
        print(f"\nTop {args.top_k} results:")
        for i, (doc, score, idx) in enumerate(results, 1):
            print(f"{i}. [{score:.4f}] {doc}")
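
# Example invocations (script name and texts are illustrative):
#   python helion_embeddings.py demo
#   python helion_embeddings.py similarity "reset my password" "password reset help"
#   python helion_embeddings.py search "ml tutorial" --documents "intro to ML" "pasta recipe" --top-k 1
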

if __name__ == "__main__":
    main()