Create inference.py

Browse files

Files changed (1) hide show

inference.py +322 -0

inference.py ADDED Viewed

	@@ -0,0 +1,322 @@

+"""
+Helion-V1.5-XL Inference Script
+Supports multiple inference modes and optimization techniques
+"""
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    BitsAndBytesConfig,
+    GenerationConfig
+)
+from typing import Optional, Dict, Any, List
+import argparse
+import json
+import time
class HelionInference:
    """Inference wrapper for Helion-V1.5-XL.

    The constructor loads the tokenizer and the causal language model once.
    Three entry points are then available: ``generate`` for a single prompt,
    ``chat`` for chat-template formatted messages, and ``batch_generate`` for
    several prompts in one call.
    """

    def __init__(
        self,
        model_name: str = "DeepXR/Helion-V1.5-XL",
        load_in_4bit: bool = False,
        load_in_8bit: bool = False,
        device_map: str = "auto",
        torch_dtype: str = "bfloat16",
    ):
        """
        Initialize the model and tokenizer
        Args:
            model_name: HuggingFace model identifier
            load_in_4bit: Enable 4-bit (NF4, double-quantized) loading. Takes
                precedence over ``load_in_8bit`` when both are set.
            load_in_8bit: Enable 8-bit quantization
            device_map: Device mapping strategy
            torch_dtype: One of "bfloat16", "float16", "float32". Any other
                value falls back to bfloat16 with a printed warning.
        """
        self.model_name = model_name
        print(f"Loading model: {model_name}")

        # Translate the dtype name into the torch dtype object.
        dtype_map = {
            "bfloat16": torch.bfloat16,
            "float16": torch.float16,
            "float32": torch.float32,
        }
        if torch_dtype not in dtype_map:
            print(
                f"Warning: unknown torch_dtype {torch_dtype!r}; "
                "falling back to bfloat16"
            )
        resolved_dtype = dtype_map.get(torch_dtype, torch.bfloat16)

        # Quantization is optional; 4-bit wins if both flags are given.
        quantization_config = None
        if load_in_4bit:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=resolved_dtype,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        elif load_in_8bit:
            quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        # NOTE(security): trust_remote_code=True runs Python code shipped in
        # the model repository. Only point model_name at sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
        )
        # Padding (used by batch_generate) needs a pad token; reuse EOS when
        # the tokenizer does not define one.
        if (
            self.tokenizer.pad_token_id is None
            and self.tokenizer.eos_token is not None
        ):
            self.tokenizer.pad_token = self.tokenizer.eos_token

        model_kwargs = {
            "device_map": device_map,
            "trust_remote_code": True,
        }
        # The dtype is only passed explicitly when no quantization is used.
        if quantization_config is not None:
            model_kwargs["quantization_config"] = quantization_config
        else:
            model_kwargs["torch_dtype"] = resolved_dtype
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            **model_kwargs
        )
        self.model.eval()
        print("Model loaded successfully!")

    def _decode_completions(self, outputs, prompt_len: int) -> List[str]:
        """Decode each output row, skipping its first ``prompt_len`` tokens.

        Slicing token ids is used instead of slicing the decoded string
        because decoded text need not reproduce the prompt character for
        character (special tokens are skipped during decoding).
        """
        return [
            self.tokenizer.decode(
                sequence[prompt_len:], skip_special_tokens=True
            ).strip()
            for sequence in outputs
        ]

    def generate(
        self,
        prompt: str,
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        top_k: int = 50,
        repetition_penalty: float = 1.1,
        do_sample: bool = True,
        num_return_sequences: int = 1,
        add_special_tokens: bool = True,
        **kwargs
    ) -> List[str]:
        """
        Generate text from a prompt
        Args:
            prompt: Input text prompt
            max_new_tokens: Maximum number of tokens to generate
            temperature: Sampling temperature (0.0 to 2.0). A value <= 0
                switches to greedy decoding instead of sampling.
            top_p: Nucleus sampling threshold
            top_k: Top-k sampling threshold
            repetition_penalty: Penalty for repetition
            do_sample: Whether to use sampling
            num_return_sequences: Number of sequences to generate
            add_special_tokens: Forwarded to the tokenizer. Set to False when
                ``prompt`` already contains its special tokens (for example
                text produced by a chat template).
            **kwargs: Extra ``GenerationConfig`` fields; these override the
                values built from the named arguments.
        Returns:
            List of generated text strings (prompt excluded)
        """
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=add_special_tokens,
        ).to(self.model.device)
        prompt_len = inputs["input_ids"].shape[1]

        # Sampling requires a strictly positive temperature; treat 0 as a
        # request for deterministic (greedy) decoding.
        if do_sample and temperature <= 0:
            do_sample = False

        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        config_kwargs = {
            "max_new_tokens": max_new_tokens,
            "repetition_penalty": repetition_penalty,
            "do_sample": do_sample,
            "num_return_sequences": num_return_sequences,
            "pad_token_id": pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling-only knobs are omitted for greedy decoding.
            config_kwargs.update(
                temperature=temperature, top_p=top_p, top_k=top_k
            )
        config_kwargs.update(kwargs)
        generation_config = GenerationConfig(**config_kwargs)

        start_time = time.perf_counter()
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                generation_config=generation_config
            )
        generation_time = time.perf_counter() - start_time

        responses = self._decode_completions(outputs, prompt_len)

        # Throughput counts generated positions only, not the prompt. Rows
        # that stopped early are padded, so this is an upper bound.
        new_tokens = sum(len(sequence) - prompt_len for sequence in outputs)
        if generation_time > 0:
            tokens_per_sec = new_tokens / generation_time
        else:
            tokens_per_sec = float("inf")
        print("\nGeneration Stats:")
        print(f"  Time: {generation_time:.2f}s")
        print(f"  Tokens/sec: {tokens_per_sec:.2f}")
        return responses

    def chat(
        self,
        messages: List[Dict[str, str]],
        max_new_tokens: int = 512,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        """
        Generate response in chat format
        Args:
            messages: List of message dicts with 'role' and 'content'
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature
            **kwargs: Forwarded to ``generate``
        Returns:
            Generated response string
        """
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # The rendered template already carries its special tokens, so the
        # tokenizer must not add them a second time.
        responses = self.generate(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            add_special_tokens=False,
            **kwargs
        )
        return responses[0]

    def batch_generate(
        self,
        prompts: List[str],
        max_new_tokens: int = 512,
        **kwargs
    ) -> List[str]:
        """
        Generate responses for multiple prompts in batch
        Args:
            prompts: List of input prompts
            max_new_tokens: Maximum tokens per generation
            **kwargs: Forwarded to ``model.generate``
        Returns:
            List of generated responses, one per output row (prompt
            excluded). An empty ``prompts`` list yields an empty list.
        """
        if not prompts:
            return []

        tokenizer = self.tokenizer
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Left-pad so every prompt ends at the same position and generation
        # continues from real tokens, not from padding. The tokenizer's own
        # setting is restored afterwards.
        previous_side = tokenizer.padding_side
        tokenizer.padding_side = "left"
        try:
            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            )
        finally:
            tokenizer.padding_side = previous_side
        inputs = inputs.to(self.model.device)
        prompt_len = inputs["input_ids"].shape[1]

        kwargs.setdefault("pad_token_id", tokenizer.pad_token_id)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                **kwargs
            )
        return self._decode_completions(outputs, prompt_len)
def main():
    """Command-line entry point.

    Parses the CLI options, builds a ``HelionInference`` instance, produces
    one response (through ``chat`` when ``--chat-mode`` is set, otherwise
    through ``generate``) and prints the prompt and the response between
    80-character rules.
    """
    cli = argparse.ArgumentParser(description="Helion-V1.5-XL Inference")

    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        ("--model", {
            "type": str,
            "default": "DeepXR/Helion-V1.5-XL",
            "help": "Model name or path",
        }),
        ("--prompt", {
            "type": str,
            "required": True,
            "help": "Input prompt",
        }),
        ("--max-tokens", {
            "type": int,
            "default": 512,
            "help": "Maximum tokens to generate",
        }),
        ("--temperature", {
            "type": float,
            "default": 0.7,
            "help": "Sampling temperature",
        }),
        ("--top-p", {
            "type": float,
            "default": 0.9,
            "help": "Nucleus sampling threshold",
        }),
        ("--load-in-4bit", {
            "action": "store_true",
            "help": "Load model in 4-bit quantization",
        }),
        ("--load-in-8bit", {
            "action": "store_true",
            "help": "Load model in 8-bit quantization",
        }),
        ("--chat-mode", {
            "action": "store_true",
            "help": "Use chat format",
        }),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    args = cli.parse_args()

    # Load the model once with the requested quantization flags.
    engine = HelionInference(
        model_name=args.model,
        load_in_4bit=args.load_in_4bit,
        load_in_8bit=args.load_in_8bit
    )

    # Both generation paths receive the same sampling options.
    sampling = {
        "max_new_tokens": args.max_tokens,
        "temperature": args.temperature,
        "top_p": args.top_p,
    }
    if args.chat_mode:
        conversation = [{"role": "user", "content": args.prompt}]
        response = engine.chat(conversation, **sampling)
    else:
        response = engine.generate(args.prompt, **sampling)[0]

    rule = "=" * 80
    report = (
        "\n" + rule,
        "PROMPT:",
        rule,
        args.prompt,
        "\n" + rule,
        "RESPONSE:",
        rule,
        response,
        rule,
    )
    for line in report:
        print(line)


# Run the CLI only when the file is executed as a script.
if __name__ == "__main__":
    main()