"""
Helion-V1-Embeddings Inference Script
Simple interface for generating embeddings and similarity search
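
Example CLI usage (subcommands defined in main() below; file names are illustrative):
    python inference_embeddings.py demo
    python inference_embeddings.py encode "some text" --output embs.npy
    python inference_embeddings.py similarity "first text" "second text"
    python inference_embeddings.py search "query" --documents "doc one" "doc two" --top-k 2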
"""
import numpy as np
import logging
from typing import List, Union
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class HelionEmbeddings:
"""Simple interface for Helion-V1-Embeddings model."""
def __init__(self, model_name: str = "DeepXR/Helion-V1-embeddings"):
"""
Initialize embeddings model.
Args:
model_name: Model name or path
"""
try:
from sentence_transformers import SentenceTransformer
logger.info(f"Loading model: {model_name}")
self.model = SentenceTransformer(model_name)
self.dimension = self.model.get_sentence_embedding_dimension()
logger.info(f"Model loaded. Embedding dimension: {self.dimension}")
except ImportError:
logger.error("sentence-transformers not installed. Install with: pip install sentence-transformers")
raise
except Exception as e:
logger.error(f"Failed to load model: {e}")
raise
def encode(
self,
texts: Union[str, List[str]],
batch_size: int = 32,
show_progress: bool = False,
normalize: bool = True
) -> np.ndarray:
"""
Generate embeddings for text(s).
Args:
texts: Single text or list of texts
batch_size: Batch size for encoding
show_progress: Show progress bar
normalize: L2 normalize embeddings
        Returns:
            Numpy array of embeddings: a 1-D array for a single string,
            or a 2-D array of shape (len(texts), dimension) for a list.
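
        Example (sketch; assumes an initialized `embedder`):
            >>> emb = embedder.encode(["hello", "world"])
            >>> emb.shape  # -> (2, embedder.dimension)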
"""
embeddings = self.model.encode(
texts,
batch_size=batch_size,
show_progress_bar=show_progress,
normalize_embeddings=normalize
)
return embeddings
def similarity(
self,
text1: Union[str, List[str]],
text2: Union[str, List[str]]
) -> Union[float, np.ndarray]:
"""
Calculate cosine similarity between texts.
Args:
text1: First text or list of texts
text2: Second text or list of texts
        Returns:
            A single float if both inputs are strings; otherwise a numpy
            matrix of pairwise cosine similarities (a string input is
            treated as a single-element list).
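
        Example (sketch; assumes an initialized `embedder`):
            >>> score = embedder.similarity("hello", "hi")  # float in [-1, 1]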
"""
from sentence_transformers import util
emb1 = self.encode(text1)
emb2 = self.encode(text2)
similarity = util.cos_sim(emb1, emb2)
# Return single value if both inputs are single strings
if isinstance(text1, str) and isinstance(text2, str):
return float(similarity[0][0])
return similarity.numpy()
def search(
self,
query: str,
documents: List[str],
top_k: int = 5
) -> List[tuple]:
"""
Search for most similar documents to query.
Args:
query: Search query
documents: List of documents to search
top_k: Number of top results to return
Returns:
List of (document, score, index) tuples
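
        Example (sketch; the score value is illustrative):
            >>> embedder.search("greeting", ["hello there", "tax form"], top_k=1)
            [('hello there', 0.58, 0)]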
"""
from sentence_transformers import util
query_emb = self.encode(query)
doc_embs = self.encode(documents)
# Calculate similarities
similarities = util.cos_sim(query_emb, doc_embs)[0]
# Get top-k results
top_results = np.argsort(-similarities.numpy())[:top_k]
results = []
for idx in top_results:
results.append((
documents[idx],
float(similarities[idx]),
int(idx)
))
return results
    def cluster(
        self,
        texts: List[str],
        min_cluster_size: int = 2,
        threshold: float = 0.75
    ) -> List[List[int]]:
        """
        Cluster texts by similarity using community detection.

        The number of clusters is not fixed in advance: it emerges from
        the similarity threshold and the minimum community size.

        Args:
            texts: List of texts to cluster
            min_cluster_size: Minimum number of texts per cluster
            threshold: Cosine similarity threshold for cluster membership
Returns:
List of clusters (each cluster is a list of indices)
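
        Example (sketch; the grouping depends on the model):
            >>> clusters = embedder.cluster(["hi", "hello", "goodbye"])
            >>> # e.g. [[0, 1]] -- indices of the texts in each community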
"""
        import torch
        from sentence_transformers import util

        # community_detection expects a torch tensor; encode() returns numpy
        embeddings = torch.from_numpy(np.asarray(self.encode(texts)))
        clusters = util.community_detection(
            embeddings,
            min_community_size=min_cluster_size,
            threshold=threshold
        )
        return clusters
def save_embeddings(
self,
texts: List[str],
output_file: str,
format: str = "npy"
):
"""
Generate and save embeddings to file.
Args:
texts: Texts to embed
output_file: Output file path
format: Format ('npy', 'npz', or 'json')
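
        Example (sketch; file name is illustrative):
            >>> embedder.save_embeddings(["a", "b"], "embeddings.npz", format="npz")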
"""
embeddings = self.encode(texts, show_progress=True)
if format == "npy":
np.save(output_file, embeddings)
elif format == "npz":
np.savez_compressed(output_file, embeddings=embeddings, texts=texts)
elif format == "json":
import json
data = {
"embeddings": embeddings.tolist(),
"texts": texts,
"dimension": self.dimension
}
with open(output_file, 'w') as f:
                json.dump(data, f)
        else:
            raise ValueError(
                f"Unsupported format: {format!r} (expected 'npy', 'npz', or 'json')"
            )
        logger.info(f"Saved {len(texts)} embeddings to {output_file}")
def demo_usage():
"""Demonstrate usage examples."""
print("="*60)
print("Helion-V1-Embeddings Demo")
print("="*60)
# Initialize
embedder = HelionEmbeddings("DeepXR/Helion-V1-embeddings")
# Example 1: Basic encoding
print("\n1. Basic Encoding:")
text = "Hello, how are you?"
embedding = embedder.encode(text)
print(f"Text: {text}")
print(f"Embedding shape: {embedding.shape}")
print(f"First 5 values: {embedding[:5]}")
# Example 2: Similarity
print("\n2. Similarity Calculation:")
text1 = "How do I reset my password?"
text2 = "Password reset instructions"
similarity = embedder.similarity(text1, text2)
print(f"Text 1: {text1}")
print(f"Text 2: {text2}")
print(f"Similarity: {similarity:.4f}")
# Example 3: Search
print("\n3. Semantic Search:")
query = "machine learning tutorial"
documents = [
"Learn machine learning basics",
"Cooking recipes for beginners",
"Introduction to neural networks",
"Travel guide to Europe",
"Python programming course"
]
results = embedder.search(query, documents, top_k=3)
print(f"Query: {query}")
print("\nTop 3 Results:")
    for i, (doc, score, _) in enumerate(results, 1):
print(f"{i}. [{score:.4f}] {doc}")
print("\n" + "="*60)
def main():
"""Main CLI interface."""
import argparse
parser = argparse.ArgumentParser(
description="Helion-V1-Embeddings Inference"
)
parser.add_argument(
"--model",
default="DeepXR/Helion-V1-embeddings",
help="Model name or path"
)
subparsers = parser.add_subparsers(dest="command", help="Command to run")
# Encode command
encode_parser = subparsers.add_parser("encode", help="Encode text(s)")
encode_parser.add_argument("text", nargs="+", help="Text(s) to encode")
encode_parser.add_argument("--output", help="Save embeddings to file")
# Similarity command
sim_parser = subparsers.add_parser("similarity", help="Calculate similarity")
sim_parser.add_argument("text1", help="First text")
sim_parser.add_argument("text2", help="Second text")
# Search command
search_parser = subparsers.add_parser("search", help="Search documents")
search_parser.add_argument("query", help="Search query")
search_parser.add_argument("--documents", nargs="+", required=True)
search_parser.add_argument("--top-k", type=int, default=5)
# Demo command
subparsers.add_parser("demo", help="Run demo")
args = parser.parse_args()
if args.command == "demo":
demo_usage()
return
# Initialize model
embedder = HelionEmbeddings(args.model)
if args.command == "encode":
embeddings = embedder.encode(args.text, show_progress=True)
print(f"Generated {len(embeddings)} embeddings")
print(f"Shape: {embeddings.shape}")
if args.output:
embedder.save_embeddings(args.text, args.output)
elif args.command == "similarity":
similarity = embedder.similarity(args.text1, args.text2)
print(f"Text 1: {args.text1}")
print(f"Text 2: {args.text2}")
print(f"Similarity: {similarity:.4f}")
elif args.command == "search":
results = embedder.search(
args.query,
args.documents,
top_k=args.top_k
)
print(f"Query: {args.query}")
print(f"\nTop {args.top_k} results:")
        for i, (doc, score, _) in enumerate(results, 1):
print(f"{i}. [{score:.4f}] {doc}")
else:
parser.print_help()
if __name__ == "__main__":
main()