"""
Helion-V1 Safety and Safeguard System
Implements content filtering and safety checks
"""

import re
from typing import List, Tuple, Optional
from enum import Enum


class SafetyLevel(Enum):
    """Safety levels for content classification."""
    SAFE = "safe"
    CAUTION = "caution"
    UNSAFE = "unsafe"


class SafetyCategory(Enum):
    """Categories of unsafe content."""
    VIOLENCE = "violence"
    HATE_SPEECH = "hate_speech"
    ILLEGAL_ACTIVITY = "illegal_activity"
    SELF_HARM = "self_harm"
    SEXUAL_CONTENT = "sexual_content"
    PRIVACY_VIOLATION = "privacy_violation"
    MISINFORMATION = "misinformation"
    HARASSMENT = "harassment"


class HelionSafeguards:
    """
    Safety system for Helion-V1 to prevent harmful outputs.
    """
    
    def __init__(self):
        # Unsafe content patterns
        self.unsafe_patterns = {
            SafetyCategory.VIOLENCE: [
                r'\b(kill|murder|attack|assault|weapon|bomb|explosive)\b',
                r'\b(violent|torture|abuse|harm someone)\b'
            ],
            SafetyCategory.HATE_SPEECH: [
                r'\b(racist|sexist|bigot|discriminate)\b',
                r'\b(hate|slur)\s+(speech|against)\b'
            ],
            SafetyCategory.ILLEGAL_ACTIVITY: [
                r'\b(illegal|hack|exploit|steal|fraud|scam)\b',
                r'\b(drug deal|money launder|counterfeit)\b'
            ],
            SafetyCategory.SELF_HARM: [
                r'\b(suicide|self.?harm|end.*life|kill.*myself)\b',
                r'\b(cutting|overdose)\b'
            ],
            SafetyCategory.SEXUAL_CONTENT: [
                r'\b(explicit|pornographic|sexual content)\b'
            ],
            SafetyCategory.PRIVACY_VIOLATION: [
                r'\b(dox|personal information|private data)\b',
                r'\b(steal.*identity|impersonate)\b'
            ]
        }
        
        # Caution patterns (may need review but not automatically unsafe)
        self.caution_patterns = [
            r'\b(how to make|create|build)\s+(weapon|bomb|drug)\b',
            r'\b(bypass|circumvent)\s+(security|protection)\b'
        ]
        
        # Safe educational contexts
        self.educational_contexts = [
            r'\b(learn|understand|study|research|education)\b',
            r'\b(history|historical|academic)\b',
            r'\b(fiction|story|novel|game)\b'
        ]
    
    def check_content_safety(
        self, 
        text: str, 
        context: Optional[str] = None
    ) -> Tuple[SafetyLevel, Optional[SafetyCategory], str]:
        """
        Check if content is safe.
        
        Args:
            text: Text to check
            context: Optional context for better evaluation
            
        Returns:
            Tuple of (safety_level, category, explanation)
        """
        text_lower = text.lower()
        
        # Check for unsafe patterns
        for category, patterns in self.unsafe_patterns.items():
            for pattern in patterns:
                if re.search(pattern, text_lower, re.IGNORECASE):
                    # Check if educational context
                    if context and any(re.search(p, context, re.IGNORECASE) 
                                     for p in self.educational_contexts):
                        return (
                            SafetyLevel.CAUTION,
                            category,
                            f"Content may be sensitive but appears educational"
                        )
                    
                    return (
                        SafetyLevel.UNSAFE,
                        category,
                        f"Content matches unsafe pattern for {category.value}"
                    )
        
        # Check for caution patterns
        for pattern in self.caution_patterns:
            if re.search(pattern, text_lower, re.IGNORECASE):
                return (
                    SafetyLevel.CAUTION,
                    None,
                    "Content requires careful handling"
                )
        
        return (SafetyLevel.SAFE, None, "Content appears safe")
    
    def get_refusal_message(self, category: Optional[SafetyCategory] = None) -> str:
        """
        Generate appropriate refusal message based on category.
        
        Args:
            category: Category of unsafe content
            
        Returns:
            Refusal message
        """
        base_message = "I apologize, but I can't assist with that request. "
        
        if category == SafetyCategory.VIOLENCE:
            return base_message + "I'm designed to promote safety and cannot provide information that could lead to harm."
        elif category == SafetyCategory.ILLEGAL_ACTIVITY:
            return base_message + "I cannot help with illegal activities or provide information that could be used for harmful purposes."
        elif category == SafetyCategory.SELF_HARM:
            return (base_message + 
                   "If you're struggling, please reach out to a mental health professional or crisis helpline. "
                   "You're not alone, and help is available.")
        elif category == SafetyCategory.HATE_SPEECH:
            return base_message + "I'm committed to being respectful and inclusive of all people."
        else:
            return base_message + "Is there something else I can help you with?"
    
    def filter_response(self, response: str, user_input: str) -> Tuple[bool, str]:
        """
        Filter and validate model response before returning to user.
        
        Args:
            response: Generated response
            user_input: Original user input
            
        Returns:
            Tuple of (is_safe, filtered_response)
        """
        safety_level, category, _ = self.check_content_safety(
            response, 
            context=user_input
        )
        
        if safety_level == SafetyLevel.UNSAFE:
            return False, self.get_refusal_message(category)
        elif safety_level == SafetyLevel.CAUTION:
            # Add disclaimer for caution content
            disclaimer = "\n\n⚠️ Note: This information is provided for educational purposes only."
            return True, response + disclaimer
        
        return True, response
    
    def check_injection_attempt(self, text: str) -> bool:
        """
        Check for potential prompt injection attempts.
        
        Args:
            text: Input text to check
            
        Returns:
            True if injection detected, False otherwise
        """
        injection_patterns = [
            r'ignore\s+(previous|above|all)\s+instructions',
            r'disregard\s+.*\s+instructions',
            r'you\s+are\s+now',
            r'new\s+instructions',
            r'system\s*:\s*ignore',
            r'<\|.*\|>',  # Special tokens
        ]
        
        text_lower = text.lower()
        for pattern in injection_patterns:
            if re.search(pattern, text_lower, re.IGNORECASE):
                return True
        
        return False


# Utility functions for easy integration
def is_safe_content(text: str) -> bool:
    """Quick check if content is safe."""
    safeguards = HelionSafeguards()
    level, _, _ = safeguards.check_content_safety(text)
    return level != SafetyLevel.UNSAFE


def get_safety_report(text: str) -> dict:
    """Get detailed safety report for content."""
    safeguards = HelionSafeguards()
    level, category, explanation = safeguards.check_content_safety(text)
    
    return {
        "safe": level == SafetyLevel.SAFE,
        "level": level.value,
        "category": category.value if category else None,
        "explanation": explanation
    }


if __name__ == "__main__":
    # Example usage
    safeguards = HelionSafeguards()
    
    test_cases = [
        "How do I bake a cake?",
        "Tell me how to harm someone",
        "What are the historical uses of weapons in warfare?",
        "I'm feeling suicidal"
    ]
    
    print("Testing Helion Safeguards System\n" + "="*50)
    for test in test_cases:
        level, category, explanation = safeguards.check_content_safety(test)
        print(f"\nInput: {test}")
        print(f"Level: {level.value}")
        print(f"Category: {category.value if category else 'None'}")
        print(f"Explanation: {explanation}")
        
        if level == SafetyLevel.UNSAFE:
            print(f"Refusal: {safeguards.get_refusal_message(category)}")