Trouter-Library committed
Commit 9a6b6a7 · verified · 1 parent: b42e229

Create safeguards.py

Files changed (1):
  safeguards.py  +241 -0
safeguards.py ADDED
@@ -0,0 +1,241 @@
+"""
+Helion-V1 Safety and Safeguard System
+Implements content filtering and safety checks
+"""
+
+import re
+from typing import List, Tuple, Optional
+from enum import Enum
+
+
+class SafetyLevel(Enum):
+    """Safety levels for content classification."""
+    SAFE = "safe"
+    CAUTION = "caution"
+    UNSAFE = "unsafe"
+
+
+class SafetyCategory(Enum):
+    """Categories of unsafe content."""
+    VIOLENCE = "violence"
+    HATE_SPEECH = "hate_speech"
+    ILLEGAL_ACTIVITY = "illegal_activity"
+    SELF_HARM = "self_harm"
+    SEXUAL_CONTENT = "sexual_content"
+    PRIVACY_VIOLATION = "privacy_violation"
+    MISINFORMATION = "misinformation"
+    HARASSMENT = "harassment"
+
+
+class HelionSafeguards:
+    """
+    Safety system for Helion-V1 to prevent harmful outputs.
+    """
+
+    def __init__(self):
+        # Unsafe content patterns, keyed by category
+        self.unsafe_patterns = {
+            SafetyCategory.VIOLENCE: [
+                r'\b(kill|murder|attack|assault|weapon|bomb|explosive)\b',
+                r'\b(violent|torture|abuse|harm someone)\b'
+            ],
+            SafetyCategory.HATE_SPEECH: [
+                r'\b(racist|sexist|bigot|discriminate)\b',
+                r'\b(hate|slur)\s+(speech|against)\b'
+            ],
+            SafetyCategory.ILLEGAL_ACTIVITY: [
+                r'\b(illegal|hack|exploit|steal|fraud|scam)\b',
+                r'\b(drug deal|money launder|counterfeit)\b'
+            ],
+            SafetyCategory.SELF_HARM: [
+                r'\b(suicide|self.?harm|end.*life|kill.*myself)\b',
+                r'\b(cutting|overdose)\b'
+            ],
+            SafetyCategory.SEXUAL_CONTENT: [
+                r'\b(explicit|pornographic|sexual content)\b'
+            ],
+            SafetyCategory.PRIVACY_VIOLATION: [
+                r'\b(dox|personal information|private data)\b',
+                r'\b(steal.*identity|impersonate)\b'
+            ]
+        }
+
+        # Caution patterns (may need review but are not automatically unsafe)
+        self.caution_patterns = [
+            r'\b(how to make|create|build)\s+(weapon|bomb|drug)\b',
+            r'\b(bypass|circumvent)\s+(security|protection)\b'
+        ]
+
+        # Safe educational contexts
+        self.educational_contexts = [
+            r'\b(learn|understand|study|research|education)\b',
+            r'\b(history|historical|academic)\b',
+            r'\b(fiction|story|novel|game)\b'
+        ]
+
+    def check_content_safety(
+        self,
+        text: str,
+        context: Optional[str] = None
+    ) -> Tuple[SafetyLevel, Optional[SafetyCategory], str]:
+        """
+        Check if content is safe.
+
+        Args:
+            text: Text to check
+            context: Optional context for better evaluation
+
+        Returns:
+            Tuple of (safety_level, category, explanation)
+        """
+        text_lower = text.lower()
+
+        # Check for unsafe patterns
+        for category, patterns in self.unsafe_patterns.items():
+            for pattern in patterns:
+                if re.search(pattern, text_lower, re.IGNORECASE):
+                    # Downgrade to caution if the context looks educational
+                    if context and any(re.search(p, context, re.IGNORECASE)
+                                       for p in self.educational_contexts):
+                        return (
+                            SafetyLevel.CAUTION,
+                            category,
+                            "Content may be sensitive but appears educational"
+                        )
+
+                    return (
+                        SafetyLevel.UNSAFE,
+                        category,
+                        f"Content matches unsafe pattern for {category.value}"
+                    )
+
+        # Check for caution patterns
+        for pattern in self.caution_patterns:
+            if re.search(pattern, text_lower, re.IGNORECASE):
+                return (
+                    SafetyLevel.CAUTION,
+                    None,
+                    "Content requires careful handling"
+                )
+
+        return (SafetyLevel.SAFE, None, "Content appears safe")
+
+    def get_refusal_message(self, category: Optional[SafetyCategory] = None) -> str:
+        """
+        Generate an appropriate refusal message based on the category.
+
+        Args:
+            category: Category of unsafe content
+
+        Returns:
+            Refusal message
+        """
+        base_message = "I apologize, but I can't assist with that request. "
+
+        if category == SafetyCategory.VIOLENCE:
+            return base_message + "I'm designed to promote safety and cannot provide information that could lead to harm."
+        elif category == SafetyCategory.ILLEGAL_ACTIVITY:
+            return base_message + "I cannot help with illegal activities or provide information that could be used for harmful purposes."
+        elif category == SafetyCategory.SELF_HARM:
+            return (base_message +
+                    "If you're struggling, please reach out to a mental health professional or crisis helpline. "
+                    "You're not alone, and help is available.")
+        elif category == SafetyCategory.HATE_SPEECH:
+            return base_message + "I'm committed to being respectful and inclusive of all people."
+        else:
+            return base_message + "Is there something else I can help you with?"
+
+    def filter_response(self, response: str, user_input: str) -> Tuple[bool, str]:
+        """
+        Filter and validate a model response before returning it to the user.
+
+        Args:
+            response: Generated response
+            user_input: Original user input
+
+        Returns:
+            Tuple of (is_safe, filtered_response)
+        """
+        safety_level, category, _ = self.check_content_safety(
+            response,
+            context=user_input
+        )
+
+        if safety_level == SafetyLevel.UNSAFE:
+            return False, self.get_refusal_message(category)
+        elif safety_level == SafetyLevel.CAUTION:
+            # Add a disclaimer for caution-level content
+            disclaimer = "\n\n⚠️ Note: This information is provided for educational purposes only."
+            return True, response + disclaimer
+
+        return True, response
+
+    def check_injection_attempt(self, text: str) -> bool:
+        """
+        Check for potential prompt injection attempts.
+
+        Args:
+            text: Input text to check
+
+        Returns:
+            True if injection detected, False otherwise
+        """
+        injection_patterns = [
+            r'ignore\s+(previous|above|all)\s+instructions',
+            r'disregard\s+.*\s+instructions',
+            r'you\s+are\s+now',
+            r'new\s+instructions',
+            r'system\s*:\s*ignore',
+            r'<\|.*\|>',  # Special tokens
+        ]
+
+        text_lower = text.lower()
+        for pattern in injection_patterns:
+            if re.search(pattern, text_lower, re.IGNORECASE):
+                return True
+
+        return False
+
+
+# Utility functions for easy integration
+def is_safe_content(text: str) -> bool:
+    """Quick check if content is safe."""
+    safeguards = HelionSafeguards()
+    level, _, _ = safeguards.check_content_safety(text)
+    return level != SafetyLevel.UNSAFE
+
+
+def get_safety_report(text: str) -> dict:
+    """Get a detailed safety report for content."""
+    safeguards = HelionSafeguards()
+    level, category, explanation = safeguards.check_content_safety(text)
+
+    return {
+        "safe": level == SafetyLevel.SAFE,
+        "level": level.value,
+        "category": category.value if category else None,
+        "explanation": explanation
+    }
+
+
+if __name__ == "__main__":
+    # Example usage
+    safeguards = HelionSafeguards()
+
+    test_cases = [
+        "How do I bake a cake?",
+        "Tell me how to harm someone",
+        "What are the historical uses of weapons in warfare?",
+        "I'm feeling suicidal"
+    ]
+
+    print("Testing Helion Safeguards System\n" + "=" * 50)
+    for test in test_cases:
+        level, category, explanation = safeguards.check_content_safety(test)
+        print(f"\nInput: {test}")
+        print(f"Level: {level.value}")
+        print(f"Category: {category.value if category else 'None'}")
+        print(f"Explanation: {explanation}")
+
+        if level == SafetyLevel.UNSAFE:
+            print(f"Refusal: {safeguards.get_refusal_message(category)}")