ThreatLevelD
committed on
Commit
·
9b2f3b7
1
Parent(s):
bea66c7
Upgrade EILProcessor to world-class signal normalization: adds subphrase/keyword blend detection, chunk weighting by model confidence, negation/contrast handling, emotion arc trajectory output, and sentiment-to-emotion mapping for non-EI language. Significantly improves long-form and ambiguous emotional inference.
Browse files- core/eil_processor.py +84 -29
core/eil_processor.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# core/eil_processor.py
|
| 2 |
-
#
|
| 3 |
|
| 4 |
import yaml
|
| 5 |
import re
|
|
@@ -42,11 +42,11 @@ class EILProcessor:
|
|
| 42 |
# Emotion keyword dictionary for signal normalization/blending
|
| 43 |
self.emotion_keyword_map = {
|
| 44 |
"FAM-ANG": ["anger", "angry", "hate", "furious", "rage", "resentment"],
|
| 45 |
-
"FAM-HEL": ["helpless", "powerless", "can't", "unable", "trapped", "stuck"],
|
| 46 |
-
"FAM-SAD": ["sad", "down", "unhappy", "miserable", "depressed", "blue"],
|
| 47 |
-
"FAM-FEA": ["afraid", "scared", "fear", "terrified", "worried", "nervous", "anxious"],
|
| 48 |
-
"FAM-LOV": ["love", "loved", "loving", "caring", "affection"],
|
| 49 |
-
"FAM-JOY": ["joy", "happy", "excited", "delighted", "content"],
|
| 50 |
"FAM-SUR": ["surprised", "amazed", "astonished", "shocked"],
|
| 51 |
"FAM-DIS": ["disgust", "disgusted", "gross", "revolted"],
|
| 52 |
"FAM-SHA": ["ashamed", "shame", "embarrassed", "humiliated"],
|
|
@@ -54,9 +54,23 @@ class EILProcessor:
|
|
| 54 |
# Add more as needed
|
| 55 |
}
|
| 56 |
|
| 57 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
self.tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
|
| 59 |
self.model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def normalize_text(self, text):
|
| 62 |
normalization_map = {
|
|
@@ -79,24 +93,36 @@ class EILProcessor:
|
|
| 79 |
clause_markers = [',', ';', '.', 'but', 'because', 'so that', 'which', 'when', 'while']
|
| 80 |
token_count = len(text.split())
|
| 81 |
clause_hits = any(marker in text for marker in clause_markers)
|
| 82 |
-
|
| 83 |
-
return True
|
| 84 |
-
return False
|
| 85 |
|
| 86 |
def chunk_story(self, text):
|
| 87 |
-
# Also split on conjunctions and relative pronouns, not just punctuation
|
| 88 |
chunks = re.split(r'[.,;!?]|\b(?:and|but|because|so|although|though|while|when)\b', text, flags=re.IGNORECASE)
|
| 89 |
chunks = [chunk.strip() for chunk in chunks if chunk and chunk.strip()]
|
| 90 |
return chunks
|
| 91 |
|
| 92 |
-
def
|
| 93 |
blend = {}
|
| 94 |
for fam, keywords in self.emotion_keyword_map.items():
|
| 95 |
for kw in keywords:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
if kw in norm_text:
|
| 97 |
blend[fam] = blend.get(fam, 0) + 1.0
|
| 98 |
return blend
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
def infer_emotion(self, input_text):
|
| 101 |
norm_text = self.normalize_text(input_text)
|
| 102 |
|
|
@@ -113,7 +139,8 @@ class EILProcessor:
|
|
| 113 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 114 |
'arc': emotion_data['arc'],
|
| 115 |
'resonance': emotion_data['resonance'],
|
| 116 |
-
'blend': {emotion_data['primary_emotion_code']: 1.0}
|
|
|
|
| 117 |
}
|
| 118 |
return packet
|
| 119 |
|
|
@@ -126,35 +153,41 @@ class EILProcessor:
|
|
| 126 |
|
| 127 |
chunk_results = []
|
| 128 |
blend_accum = {}
|
|
|
|
| 129 |
|
| 130 |
for chunk in chunks:
|
| 131 |
sub_result = self.infer_emotion(chunk) # RECURSIVE CALL
|
| 132 |
chunk_results.append(sub_result)
|
| 133 |
-
# Accumulate blends
|
|
|
|
| 134 |
for fam, val in sub_result.get('blend', {}).items():
|
| 135 |
-
blend_accum[fam] = blend_accum.get(fam, 0) + val
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
# Normalize blend
|
| 138 |
if blend_accum:
|
| 139 |
total = sum(blend_accum.values())
|
| 140 |
for k in blend_accum:
|
| 141 |
blend_accum[k] /= total
|
| 142 |
-
|
| 143 |
dominant_family = max(blend_accum.items(), key=lambda x: x[1])[0]
|
| 144 |
else:
|
| 145 |
dominant_family = "FAM-NEU"
|
| 146 |
blend_accum = {"FAM-NEU": 1.0}
|
|
|
|
| 147 |
|
| 148 |
emotion_data = self.codex_informer.resolve_emotion_family(dominant_family)
|
| 149 |
packet = {
|
| 150 |
'phrases': [input_text] + [r['phrases'][0] for r in chunk_results],
|
| 151 |
-
'emotion_candidates': [{'phrase': r['phrases'][0], 'candidate_emotion': r
|
| 152 |
'metadata': {'source': 'EILProcessor (story mode)', 'input_type': input_type},
|
| 153 |
'emotion_family': emotion_data['emotion_family'],
|
| 154 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 155 |
'arc': emotion_data['arc'],
|
| 156 |
'resonance': emotion_data['resonance'],
|
| 157 |
-
'blend': blend_accum
|
|
|
|
| 158 |
}
|
| 159 |
return packet
|
| 160 |
|
|
@@ -171,7 +204,8 @@ class EILProcessor:
|
|
| 171 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 172 |
'arc': emotion_data['arc'],
|
| 173 |
'resonance': emotion_data['resonance'],
|
| 174 |
-
'blend': {emotion_data['primary_emotion_code']: 1.0}
|
|
|
|
| 175 |
}
|
| 176 |
return packet
|
| 177 |
|
|
@@ -189,14 +223,14 @@ class EILProcessor:
|
|
| 189 |
'primary_emotion_code': variant_code,
|
| 190 |
'arc': 'Pending',
|
| 191 |
'resonance': 'Pending',
|
| 192 |
-
'blend': {variant_code: 1.0}
|
|
|
|
| 193 |
}
|
| 194 |
return packet
|
| 195 |
|
| 196 |
-
# 5️⃣ Signal normalization -
|
| 197 |
-
blend = self.
|
| 198 |
if blend:
|
| 199 |
-
# Normalize
|
| 200 |
total = sum(blend.values())
|
| 201 |
for k in blend:
|
| 202 |
blend[k] /= total
|
|
@@ -211,12 +245,32 @@ class EILProcessor:
|
|
| 211 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 212 |
'arc': emotion_data['arc'],
|
| 213 |
'resonance': emotion_data['resonance'],
|
| 214 |
-
'blend': blend
|
|
|
|
| 215 |
}
|
| 216 |
return packet
|
| 217 |
|
| 218 |
-
# 6️⃣
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
tokens = self.tokenizer(norm_text, return_tensors='pt')
|
| 221 |
with torch.no_grad():
|
| 222 |
logits = self.model(**tokens).logits
|
|
@@ -243,15 +297,16 @@ class EILProcessor:
|
|
| 243 |
primary_emotion_code = model_to_codex_map.get(predicted_label.lower(), "FAM-NEU")
|
| 244 |
emotion_data = self.codex_informer.resolve_emotion_family(primary_emotion_code)
|
| 245 |
blend = {emotion_data['primary_emotion_code']: 1.0}
|
| 246 |
-
|
| 247 |
packet = {
|
| 248 |
'phrases': [input_text],
|
| 249 |
'emotion_candidates': [{'phrase': input_text, 'candidate_emotion': predicted_label}],
|
| 250 |
-
'metadata': {'source': 'EILProcessor (model)', 'input_type': input_type},
|
| 251 |
'emotion_family': emotion_data['emotion_family'],
|
| 252 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 253 |
'arc': emotion_data['arc'],
|
| 254 |
'resonance': emotion_data['resonance'],
|
| 255 |
-
'blend': blend
|
|
|
|
|
|
|
| 256 |
}
|
| 257 |
return packet
|
|
|
|
| 1 |
# core/eil_processor.py
|
| 2 |
+
# MEC EIL Processor – World-Class Signal Normalization Edition
|
| 3 |
|
| 4 |
import yaml
|
| 5 |
import re
|
|
|
|
| 42 |
# Emotion keyword dictionary for signal normalization/blending
|
| 43 |
self.emotion_keyword_map = {
|
| 44 |
"FAM-ANG": ["anger", "angry", "hate", "furious", "rage", "resentment"],
|
| 45 |
+
"FAM-HEL": ["helpless", "powerless", "can't", "unable", "trapped", "stuck", "overwhelmed", "overwhelm"],
|
| 46 |
+
"FAM-SAD": ["sad", "down", "unhappy", "miserable", "depressed", "blue", "empty"],
|
| 47 |
+
"FAM-FEA": ["afraid", "scared", "fear", "terrified", "worried", "nervous", "anxious", "can't sleep"],
|
| 48 |
+
"FAM-LOV": ["love", "loved", "loving", "caring", "affection", "proud"],
|
| 49 |
+
"FAM-JOY": ["joy", "happy", "excited", "delighted", "content", "proud"],
|
| 50 |
"FAM-SUR": ["surprised", "amazed", "astonished", "shocked"],
|
| 51 |
"FAM-DIS": ["disgust", "disgusted", "gross", "revolted"],
|
| 52 |
"FAM-SHA": ["ashamed", "shame", "embarrassed", "humiliated"],
|
|
|
|
| 54 |
# Add more as needed
|
| 55 |
}
|
| 56 |
|
| 57 |
+
# For sentiment-to-emotion mapping of ambiguous/indirect language
|
| 58 |
+
self.sentiment_cue_map = [
|
| 59 |
+
# (sentiment, regex or cue, mapped emotion)
|
| 60 |
+
("negative", r"can.?t sleep|insomnia|restless|wake up", "FAM-FEA"),
|
| 61 |
+
("negative", r"too much|overwhelmed|can.?t cope|can.?t deal", "FAM-HEL"),
|
| 62 |
+
("negative", r"nothing feels right|empty|pointless|no purpose", "FAM-SAD"),
|
| 63 |
+
("negative", r"don't care|apathy|numb", "FAM-LON"),
|
| 64 |
+
("positive", r"did it|proud|relieved", "FAM-JOY"),
|
| 65 |
+
("neutral", r"just tired|exhausted", "FAM-HEL"),
|
| 66 |
+
# ...add more for coverage
|
| 67 |
+
]
|
| 68 |
+
|
| 69 |
+
# Load emotion and sentiment models
|
| 70 |
self.tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
|
| 71 |
self.model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
|
| 72 |
+
self.sentiment_tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
|
| 73 |
+
self.sentiment_model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')
|
| 74 |
|
| 75 |
def normalize_text(self, text):
|
| 76 |
normalization_map = {
|
|
|
|
| 93 |
clause_markers = [',', ';', '.', 'but', 'because', 'so that', 'which', 'when', 'while']
|
| 94 |
token_count = len(text.split())
|
| 95 |
clause_hits = any(marker in text for marker in clause_markers)
|
| 96 |
+
return token_count > 12 or clause_hits
|
|
|
|
|
|
|
| 97 |
|
| 98 |
def chunk_story(self, text):
    """Break a story into clause-level chunks for per-chunk inference.

    Splits on sentence punctuation as well as common conjunctions and
    relative adverbs (case-insensitive), then discards empty or
    whitespace-only fragments.
    """
    boundary = re.compile(
        r'[.,;!?]|\b(?:and|but|because|so|although|though|while|when)\b',
        re.IGNORECASE,
    )
    pieces = []
    for fragment in boundary.split(text):
        if fragment and fragment.strip():
            pieces.append(fragment.strip())
    return pieces
|
| 102 |
|
| 103 |
+
def detect_emotion_blend_with_negation(self, norm_text):
    """Detect a weighted blend of emotion families from keyword hits.

    Scans *norm_text* for each keyword in ``self.emotion_keyword_map``
    and accumulates a unit score per family hit. A keyword is skipped
    entirely when a negated occurrence is found ("not sad",
    "never happy", "without love", ...).

    Fixes over the previous version:
      * keywords are passed through ``re.escape`` before being
        interpolated into the negation pattern, so phrases containing
        regex metacharacters cannot raise ``re.error`` or mis-match;
      * the negator alternation is anchored with a word boundary, so
        words like "cannot" no longer spuriously trigger the
        "not"/"no" negation check.

    Returns a dict mapping family code -> raw (unnormalized) score;
    caller is expected to normalize.
    """
    blend = {}
    # Word-boundary-anchored negators, applied immediately before the keyword.
    negators = r"\b(?:not|no longer|never|no|without)\s+"
    for fam, keywords in self.emotion_keyword_map.items():
        for kw in keywords:
            # Skip keywords that occur in negated form anywhere in the text.
            if re.search(negators + re.escape(kw), norm_text):
                continue
            # NOTE(review): substring match, so "blue" also hits "blueprint";
            # kept for backward compatibility with upstream callers.
            if kw in norm_text:
                blend[fam] = blend.get(fam, 0) + 1.0
    return blend
|
| 116 |
|
| 117 |
+
def get_sentiment(self, norm_text):
    """Classify the overall sentiment of normalized text.

    Runs the dedicated sentiment model and returns a
    ``(label, confidence)`` pair, where *label* is the lowercased
    class name from the model config and *confidence* is its softmax
    probability.
    """
    encoded = self.sentiment_tokenizer(norm_text, return_tensors='pt')
    with torch.no_grad():
        raw_scores = self.sentiment_model(**encoded).logits
    distribution = F.softmax(raw_scores, dim=-1).squeeze()
    confidence, label_idx = torch.max(distribution, dim=-1)
    label = self.sentiment_model.config.id2label[label_idx.item()]
    return label.lower(), confidence.item()
|
| 125 |
+
|
| 126 |
def infer_emotion(self, input_text):
|
| 127 |
norm_text = self.normalize_text(input_text)
|
| 128 |
|
|
|
|
| 139 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 140 |
'arc': emotion_data['arc'],
|
| 141 |
'resonance': emotion_data['resonance'],
|
| 142 |
+
'blend': {emotion_data['primary_emotion_code']: 1.0},
|
| 143 |
+
'trajectory': [emotion_data['primary_emotion_code']],
|
| 144 |
}
|
| 145 |
return packet
|
| 146 |
|
|
|
|
| 153 |
|
| 154 |
chunk_results = []
|
| 155 |
blend_accum = {}
|
| 156 |
+
trajectory = []
|
| 157 |
|
| 158 |
for chunk in chunks:
|
| 159 |
sub_result = self.infer_emotion(chunk) # RECURSIVE CALL
|
| 160 |
chunk_results.append(sub_result)
|
| 161 |
+
# Accumulate blends (weighted by confidence if available)
|
| 162 |
+
conf = sub_result.get('confidence', 1.0)
|
| 163 |
for fam, val in sub_result.get('blend', {}).items():
|
| 164 |
+
blend_accum[fam] = blend_accum.get(fam, 0) + val * conf
|
| 165 |
+
# Trajectory
|
| 166 |
+
if 'primary_emotion_code' in sub_result:
|
| 167 |
+
trajectory.append(sub_result['primary_emotion_code'])
|
| 168 |
|
| 169 |
# Normalize blend
|
| 170 |
if blend_accum:
|
| 171 |
total = sum(blend_accum.values())
|
| 172 |
for k in blend_accum:
|
| 173 |
blend_accum[k] /= total
|
|
|
|
| 174 |
dominant_family = max(blend_accum.items(), key=lambda x: x[1])[0]
|
| 175 |
else:
|
| 176 |
dominant_family = "FAM-NEU"
|
| 177 |
blend_accum = {"FAM-NEU": 1.0}
|
| 178 |
+
trajectory = ["FAM-NEU"]
|
| 179 |
|
| 180 |
emotion_data = self.codex_informer.resolve_emotion_family(dominant_family)
|
| 181 |
packet = {
|
| 182 |
'phrases': [input_text] + [r['phrases'][0] for r in chunk_results],
|
| 183 |
+
'emotion_candidates': [{'phrase': r['phrases'][0], 'candidate_emotion': r.get('primary_emotion_code', 'FAM-NEU')} for r in chunk_results],
|
| 184 |
'metadata': {'source': 'EILProcessor (story mode)', 'input_type': input_type},
|
| 185 |
'emotion_family': emotion_data['emotion_family'],
|
| 186 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 187 |
'arc': emotion_data['arc'],
|
| 188 |
'resonance': emotion_data['resonance'],
|
| 189 |
+
'blend': blend_accum,
|
| 190 |
+
'trajectory': trajectory,
|
| 191 |
}
|
| 192 |
return packet
|
| 193 |
|
|
|
|
| 204 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 205 |
'arc': emotion_data['arc'],
|
| 206 |
'resonance': emotion_data['resonance'],
|
| 207 |
+
'blend': {emotion_data['primary_emotion_code']: 1.0},
|
| 208 |
+
'trajectory': [emotion_data['primary_emotion_code']],
|
| 209 |
}
|
| 210 |
return packet
|
| 211 |
|
|
|
|
| 223 |
'primary_emotion_code': variant_code,
|
| 224 |
'arc': 'Pending',
|
| 225 |
'resonance': 'Pending',
|
| 226 |
+
'blend': {variant_code: 1.0},
|
| 227 |
+
'trajectory': [variant_code],
|
| 228 |
}
|
| 229 |
return packet
|
| 230 |
|
| 231 |
+
# 5️⃣ Signal normalization - blend detection & negation
|
| 232 |
+
blend = self.detect_emotion_blend_with_negation(norm_text)
|
| 233 |
if blend:
|
|
|
|
| 234 |
total = sum(blend.values())
|
| 235 |
for k in blend:
|
| 236 |
blend[k] /= total
|
|
|
|
| 245 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 246 |
'arc': emotion_data['arc'],
|
| 247 |
'resonance': emotion_data['resonance'],
|
| 248 |
+
'blend': blend,
|
| 249 |
+
'trajectory': [primary_code],
|
| 250 |
}
|
| 251 |
return packet
|
| 252 |
|
| 253 |
+
# 6️⃣ Sentiment-to-emotion mapping for non-EI language
|
| 254 |
+
sentiment, sentiment_conf = self.get_sentiment(norm_text)
|
| 255 |
+
print(f"[EILProcessor] Sentiment fallback: {sentiment} ({sentiment_conf:.2f})")
|
| 256 |
+
for sent, cue, fam in self.sentiment_cue_map:
|
| 257 |
+
if sent == sentiment and re.search(cue, norm_text):
|
| 258 |
+
emotion_data = self.codex_informer.resolve_emotion_family(fam)
|
| 259 |
+
packet = {
|
| 260 |
+
'phrases': [input_text],
|
| 261 |
+
'emotion_candidates': [{'phrase': input_text, 'candidate_emotion': fam}],
|
| 262 |
+
'metadata': {'source': 'EILProcessor (sentiment-to-emotion)', 'input_type': input_type},
|
| 263 |
+
'emotion_family': emotion_data['emotion_family'],
|
| 264 |
+
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 265 |
+
'arc': emotion_data['arc'],
|
| 266 |
+
'resonance': emotion_data['resonance'],
|
| 267 |
+
'blend': {fam: 1.0},
|
| 268 |
+
'trajectory': [fam],
|
| 269 |
+
}
|
| 270 |
+
return packet
|
| 271 |
+
|
| 272 |
+
# 7️⃣ Model fallback (last resort)
|
| 273 |
+
print(f"[EILProcessor] No crosswalk/alias/keyword/sentiment match — running model on: '{norm_text}'")
|
| 274 |
tokens = self.tokenizer(norm_text, return_tensors='pt')
|
| 275 |
with torch.no_grad():
|
| 276 |
logits = self.model(**tokens).logits
|
|
|
|
| 297 |
primary_emotion_code = model_to_codex_map.get(predicted_label.lower(), "FAM-NEU")
|
| 298 |
emotion_data = self.codex_informer.resolve_emotion_family(primary_emotion_code)
|
| 299 |
blend = {emotion_data['primary_emotion_code']: 1.0}
|
|
|
|
| 300 |
packet = {
|
| 301 |
'phrases': [input_text],
|
| 302 |
'emotion_candidates': [{'phrase': input_text, 'candidate_emotion': predicted_label}],
|
| 303 |
+
'metadata': {'source': 'EILProcessor (model)', 'input_type': input_type, 'confidence': confidence},
|
| 304 |
'emotion_family': emotion_data['emotion_family'],
|
| 305 |
'primary_emotion_code': emotion_data['primary_emotion_code'],
|
| 306 |
'arc': emotion_data['arc'],
|
| 307 |
'resonance': emotion_data['resonance'],
|
| 308 |
+
'blend': blend,
|
| 309 |
+
'trajectory': [emotion_data['primary_emotion_code']],
|
| 310 |
+
'confidence': confidence
|
| 311 |
}
|
| 312 |
return packet
|