""" Helion-V1 Safety and Safeguard System Implements content filtering and safety checks """ import re from typing import List, Tuple, Optional from enum import Enum class SafetyLevel(Enum): """Safety levels for content classification.""" SAFE = "safe" CAUTION = "caution" UNSAFE = "unsafe" class SafetyCategory(Enum): """Categories of unsafe content.""" VIOLENCE = "violence" HATE_SPEECH = "hate_speech" ILLEGAL_ACTIVITY = "illegal_activity" SELF_HARM = "self_harm" SEXUAL_CONTENT = "sexual_content" PRIVACY_VIOLATION = "privacy_violation" MISINFORMATION = "misinformation" HARASSMENT = "harassment" class HelionSafeguards: """ Safety system for Helion-V1 to prevent harmful outputs. """ def __init__(self): # Unsafe content patterns self.unsafe_patterns = { SafetyCategory.VIOLENCE: [ r'\b(kill|murder|attack|assault|weapon|bomb|explosive)\b', r'\b(violent|torture|abuse|harm someone)\b' ], SafetyCategory.HATE_SPEECH: [ r'\b(racist|sexist|bigot|discriminate)\b', r'\b(hate|slur)\s+(speech|against)\b' ], SafetyCategory.ILLEGAL_ACTIVITY: [ r'\b(illegal|hack|exploit|steal|fraud|scam)\b', r'\b(drug deal|money launder|counterfeit)\b' ], SafetyCategory.SELF_HARM: [ r'\b(suicide|self.?harm|end.*life|kill.*myself)\b', r'\b(cutting|overdose)\b' ], SafetyCategory.SEXUAL_CONTENT: [ r'\b(explicit|pornographic|sexual content)\b' ], SafetyCategory.PRIVACY_VIOLATION: [ r'\b(dox|personal information|private data)\b', r'\b(steal.*identity|impersonate)\b' ] } # Caution patterns (may need review but not automatically unsafe) self.caution_patterns = [ r'\b(how to make|create|build)\s+(weapon|bomb|drug)\b', r'\b(bypass|circumvent)\s+(security|protection)\b' ] # Safe educational contexts self.educational_contexts = [ r'\b(learn|understand|study|research|education)\b', r'\b(history|historical|academic)\b', r'\b(fiction|story|novel|game)\b' ] def check_content_safety( self, text: str, context: Optional[str] = None ) -> Tuple[SafetyLevel, Optional[SafetyCategory], str]: """ Check if content is safe. Args: text: Text to check context: Optional context for better evaluation Returns: Tuple of (safety_level, category, explanation) """ text_lower = text.lower() # Check for unsafe patterns for category, patterns in self.unsafe_patterns.items(): for pattern in patterns: if re.search(pattern, text_lower, re.IGNORECASE): # Check if educational context if context and any(re.search(p, context, re.IGNORECASE) for p in self.educational_contexts): return ( SafetyLevel.CAUTION, category, f"Content may be sensitive but appears educational" ) return ( SafetyLevel.UNSAFE, category, f"Content matches unsafe pattern for {category.value}" ) # Check for caution patterns for pattern in self.caution_patterns: if re.search(pattern, text_lower, re.IGNORECASE): return ( SafetyLevel.CAUTION, None, "Content requires careful handling" ) return (SafetyLevel.SAFE, None, "Content appears safe") def get_refusal_message(self, category: Optional[SafetyCategory] = None) -> str: """ Generate appropriate refusal message based on category. Args: category: Category of unsafe content Returns: Refusal message """ base_message = "I apologize, but I can't assist with that request. " if category == SafetyCategory.VIOLENCE: return base_message + "I'm designed to promote safety and cannot provide information that could lead to harm." elif category == SafetyCategory.ILLEGAL_ACTIVITY: return base_message + "I cannot help with illegal activities or provide information that could be used for harmful purposes." elif category == SafetyCategory.SELF_HARM: return (base_message + "If you're struggling, please reach out to a mental health professional or crisis helpline. " "You're not alone, and help is available.") elif category == SafetyCategory.HATE_SPEECH: return base_message + "I'm committed to being respectful and inclusive of all people." else: return base_message + "Is there something else I can help you with?" def filter_response(self, response: str, user_input: str) -> Tuple[bool, str]: """ Filter and validate model response before returning to user. Args: response: Generated response user_input: Original user input Returns: Tuple of (is_safe, filtered_response) """ safety_level, category, _ = self.check_content_safety( response, context=user_input ) if safety_level == SafetyLevel.UNSAFE: return False, self.get_refusal_message(category) elif safety_level == SafetyLevel.CAUTION: # Add disclaimer for caution content disclaimer = "\n\n⚠️ Note: This information is provided for educational purposes only." return True, response + disclaimer return True, response def check_injection_attempt(self, text: str) -> bool: """ Check for potential prompt injection attempts. Args: text: Input text to check Returns: True if injection detected, False otherwise """ injection_patterns = [ r'ignore\s+(previous|above|all)\s+instructions', r'disregard\s+.*\s+instructions', r'you\s+are\s+now', r'new\s+instructions', r'system\s*:\s*ignore', r'<\|.*\|>', # Special tokens ] text_lower = text.lower() for pattern in injection_patterns: if re.search(pattern, text_lower, re.IGNORECASE): return True return False # Utility functions for easy integration def is_safe_content(text: str) -> bool: """Quick check if content is safe.""" safeguards = HelionSafeguards() level, _, _ = safeguards.check_content_safety(text) return level != SafetyLevel.UNSAFE def get_safety_report(text: str) -> dict: """Get detailed safety report for content.""" safeguards = HelionSafeguards() level, category, explanation = safeguards.check_content_safety(text) return { "safe": level == SafetyLevel.SAFE, "level": level.value, "category": category.value if category else None, "explanation": explanation } if __name__ == "__main__": # Example usage safeguards = HelionSafeguards() test_cases = [ "How do I bake a cake?", "Tell me how to harm someone", "What are the historical uses of weapons in warfare?", "I'm feeling suicidal" ] print("Testing Helion Safeguards System\n" + "="*50) for test in test_cases: level, category, explanation = safeguards.check_content_safety(test) print(f"\nInput: {test}") print(f"Level: {level.value}") print(f"Category: {category.value if category else 'None'}") print(f"Explanation: {explanation}") if level == SafetyLevel.UNSAFE: print(f"Refusal: {safeguards.get_refusal_message(category)}")