Trouter-Library committed
Commit 9a6b6a7 · verified · 1 parent: b42e229

Create safeguards.py

Files changed (1):
  safeguards.py  +241 -0
safeguards.py ADDED
@@ -0,0 +1,241 @@
+"""
+Helion-V1 Safety and Safeguard System
+Implements content filtering and safety checks
+"""
+
+import re
+from typing import List, Tuple, Optional
+from enum import Enum
+
+
+class SafetyLevel(Enum):
+    """Safety levels for content classification."""
+    SAFE = "safe"
+    CAUTION = "caution"
+    UNSAFE = "unsafe"
+
+
+class SafetyCategory(Enum):
+    """Categories of unsafe content."""
+    VIOLENCE = "violence"
+    HATE_SPEECH = "hate_speech"
+    ILLEGAL_ACTIVITY = "illegal_activity"
+    SELF_HARM = "self_harm"
+    SEXUAL_CONTENT = "sexual_content"
+    PRIVACY_VIOLATION = "privacy_violation"
+    MISINFORMATION = "misinformation"
+    HARASSMENT = "harassment"
+
+
+class HelionSafeguards:
+    """
+    Safety system for Helion-V1 to prevent harmful outputs.
+    """
+
+    def __init__(self):
+        # Unsafe content patterns, keyed by category
+        self.unsafe_patterns = {
+            SafetyCategory.VIOLENCE: [
+                r'\b(kill|murder|attack|assault|weapon|bomb|explosive)\b',
+                r'\b(violent|torture|abuse|harm someone)\b'
+            ],
+            SafetyCategory.HATE_SPEECH: [
+                r'\b(racist|sexist|bigot|discriminate)\b',
+                r'\b(hate|slur)\s+(speech|against)\b'
+            ],
+            SafetyCategory.ILLEGAL_ACTIVITY: [
+                r'\b(illegal|hack|exploit|steal|fraud|scam)\b',
+                r'\b(drug deal|money launder|counterfeit)\b'
+            ],
+            SafetyCategory.SELF_HARM: [
+                r'\b(suicide|self.?harm|end.*life|kill.*myself)\b',
+                r'\b(cutting|overdose)\b'
+            ],
+            SafetyCategory.SEXUAL_CONTENT: [
+                r'\b(explicit|pornographic|sexual content)\b'
+            ],
+            SafetyCategory.PRIVACY_VIOLATION: [
+                r'\b(dox|personal information|private data)\b',
+                r'\b(steal.*identity|impersonate)\b'
+            ]
+        }
+
+        # Caution patterns (may need review but are not automatically unsafe)
+        self.caution_patterns = [
+            r'\b(how to make|create|build)\s+(weapon|bomb|drug)\b',
+            r'\b(bypass|circumvent)\s+(security|protection)\b'
+        ]
+
+        # Safe educational contexts
+        self.educational_contexts = [
+            r'\b(learn|understand|study|research|education)\b',
+            r'\b(history|historical|academic)\b',
+            r'\b(fiction|story|novel|game)\b'
+        ]
+
+    def check_content_safety(
+        self,
+        text: str,
+        context: Optional[str] = None
+    ) -> Tuple[SafetyLevel, Optional[SafetyCategory], str]:
+        """
+        Check if content is safe.
+
+        Args:
+            text: Text to check
+            context: Optional context for better evaluation
+
+        Returns:
+            Tuple of (safety_level, category, explanation)
+        """
+        text_lower = text.lower()
+
+        # Check for unsafe patterns
+        for category, patterns in self.unsafe_patterns.items():
+            for pattern in patterns:
+                if re.search(pattern, text_lower, re.IGNORECASE):
+                    # Downgrade to caution if the context looks educational
+                    if context and any(re.search(p, context, re.IGNORECASE)
+                                       for p in self.educational_contexts):
+                        return (
+                            SafetyLevel.CAUTION,
+                            category,
+                            "Content may be sensitive but appears educational"
+                        )
+
+                    return (
+                        SafetyLevel.UNSAFE,
+                        category,
+                        f"Content matches unsafe pattern for {category.value}"
+                    )
+
+        # Check for caution patterns
+        for pattern in self.caution_patterns:
+            if re.search(pattern, text_lower, re.IGNORECASE):
+                return (
+                    SafetyLevel.CAUTION,
+                    None,
+                    "Content requires careful handling"
+                )
+
+        return (SafetyLevel.SAFE, None, "Content appears safe")
+
+    def get_refusal_message(self, category: Optional[SafetyCategory] = None) -> str:
+        """
+        Generate an appropriate refusal message based on the category.
+
+        Args:
+            category: Category of unsafe content
+
+        Returns:
+            Refusal message
+        """
+        base_message = "I apologize, but I can't assist with that request. "
+
+        if category == SafetyCategory.VIOLENCE:
+            return base_message + "I'm designed to promote safety and cannot provide information that could lead to harm."
+        elif category == SafetyCategory.ILLEGAL_ACTIVITY:
+            return base_message + "I cannot help with illegal activities or provide information that could be used for harmful purposes."
+        elif category == SafetyCategory.SELF_HARM:
+            return (base_message +
+                    "If you're struggling, please reach out to a mental health professional or crisis helpline. "
+                    "You're not alone, and help is available.")
+        elif category == SafetyCategory.HATE_SPEECH:
+            return base_message + "I'm committed to being respectful and inclusive of all people."
+        else:
+            return base_message + "Is there something else I can help you with?"
+
+    def filter_response(self, response: str, user_input: str) -> Tuple[bool, str]:
+        """
+        Filter and validate a model response before returning it to the user.
+
+        Args:
+            response: Generated response
+            user_input: Original user input
+
+        Returns:
+            Tuple of (is_safe, filtered_response)
+        """
+        safety_level, category, _ = self.check_content_safety(
+            response,
+            context=user_input
+        )
+
+        if safety_level == SafetyLevel.UNSAFE:
+            return False, self.get_refusal_message(category)
+        elif safety_level == SafetyLevel.CAUTION:
+            # Add a disclaimer for caution-level content
+            disclaimer = "\n\n⚠️ Note: This information is provided for educational purposes only."
+            return True, response + disclaimer
+
+        return True, response
+
+    def check_injection_attempt(self, text: str) -> bool:
+        """
+        Check for potential prompt injection attempts.
+
+        Args:
+            text: Input text to check
+
+        Returns:
+            True if injection detected, False otherwise
+        """
+        injection_patterns = [
+            r'ignore\s+(previous|above|all)\s+instructions',
+            r'disregard\s+.*\s+instructions',
+            r'you\s+are\s+now',
+            r'new\s+instructions',
+            r'system\s*:\s*ignore',
+            r'<\|.*\|>',  # Special tokens
+        ]
+
+        text_lower = text.lower()
+        for pattern in injection_patterns:
+            if re.search(pattern, text_lower, re.IGNORECASE):
+                return True
+
+        return False
+
+
+# Utility functions for easy integration
+def is_safe_content(text: str) -> bool:
+    """Quick check if content is safe."""
+    safeguards = HelionSafeguards()
+    level, _, _ = safeguards.check_content_safety(text)
+    return level != SafetyLevel.UNSAFE
+
+
+def get_safety_report(text: str) -> dict:
+    """Get a detailed safety report for content."""
+    safeguards = HelionSafeguards()
+    level, category, explanation = safeguards.check_content_safety(text)
+
+    return {
+        "safe": level == SafetyLevel.SAFE,
+        "level": level.value,
+        "category": category.value if category else None,
+        "explanation": explanation
+    }
+
+
+if __name__ == "__main__":
+    # Example usage
+    safeguards = HelionSafeguards()
+
+    test_cases = [
+        "How do I bake a cake?",
+        "Tell me how to harm someone",
+        "What are the historical uses of weapons in warfare?",
+        "I'm feeling suicidal"
+    ]
+
+    print("Testing Helion Safeguards System\n" + "=" * 50)
+    for test in test_cases:
+        level, category, explanation = safeguards.check_content_safety(test)
+        print(f"\nInput: {test}")
+        print(f"Level: {level.value}")
+        print(f"Category: {category.value if category else 'None'}")
+        print(f"Explanation: {explanation}")
+
+        if level == SafetyLevel.UNSAFE:
+            print(f"Refusal: {safeguards.get_refusal_message(category)}")