Update enhanced_search_v2.py

enhanced_search_v2.py  CHANGED  (+71 -51)
@@ -1,21 +1,37 @@
-# enhanced_search_v2.py (Version with …
+# enhanced_search_v2.py (Final version with weighted ranking)
 ###################################################################################################
 #
-# IMPROVEMENTS
-# 1. HYBRID REORDERING: To handle cases like 'pneumococo', the AI reordering
-#    now uses a hybrid key. If the semantic score is < 85, the ordering considers
-#    the SUM of the scores (semantic + textual), prioritizing results with strong
-#    textual matches.
+# SUMMARY OF IMPLEMENTED IMPROVEMENTS (current session):
 #
-#
-#
-#
+# 1. WEIGHTED HYBRID RANKING (FINAL ADJUSTMENT):
+#    - The reordering logic was replaced by a weighted average, making the ranking more
+#      balanced and predictable.
+#    - FINAL FORMULA: `Final Score = (Semantic Score * 0.6) + (Text Score * 0.4)`
+#    - This guarantees that both the AI's understanding and the textual relevance
+#      contribute to the final position of each result.
 #
-#
-#
+# 2. BLOCK LOGIC (PROTECTED AND BROAD):
+#    - Results are split into a "Protected Block" (exact matches, phrase matches, etc.)
+#      and a "Broad Block" (relevance, logical matches, etc.).
+#    - The Protected Block always has priority in the ranking, guaranteeing that
+#      textually perfect results are never demoted.
 #
-#
-#
+# 3. TWO-LAYER BOOST SYSTEM:
+#    - TECHNICAL JARGON BOOST: raises the score of terms that are rare in the database
+#      and do NOT appear in the Portuguese dictionary, targeting medical jargon.
+#    - ULTRA-RARE WORD BOOST: raises the score of terms that are extremely rare in the
+#      database (e.g. <= 3 occurrences), regardless of the dictionary. Crucial for
+#      cases like "pneumococo".
+#
+# 4. BM25 AND LIMIT TUNING:
+#    - The BM25 model was retuned (parameter k1=1.2) to give more weight to term
+#      rarity (IDF).
+#    - The Broad Block was capped at 30 candidates to keep the AI reordering fast.
+#
+# 5. DETAILED LOGGING:
+#    - The search log now reports how many candidates each search layer produced,
+#      making debugging and analysis easier.
 #
 ###################################################################################################
 
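For intuition on the change: under the old key, a candidate only reached the top group with a semantic score >= 85, so a rare term like "pneumococo" with a near-perfect text match could still lose to a semantically confident but textually weak candidate. Below is a minimal sketch of how the new weighted key reorders such a pair; the scores are invented for illustration and are not from the commit.

    # Minimal sketch with invented scores; the key mirrors the diff below.
    def weighted_hybrid_sort_key(result):
        return result.get('semantic_score', 0) * 0.6 + result.get('text_score', 0) * 0.4

    candidates = [
        {'label': 'rare-term text hit',  'semantic_score': 62, 'text_score': 98},
        {'label': 'semantic-only match', 'semantic_score': 88, 'text_score': 40},
    ]
    for c in sorted(candidates, key=weighted_hybrid_sort_key, reverse=True):
        print(c['label'], round(weighted_hybrid_sort_key(c), 1))
    # rare-term text hit 76.4   (0.6*62 + 0.4*98)
    # semantic-only match 68.8  (0.6*88 + 0.4*40)

The strong textual match now wins without needing any hard confidence cutoff.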
@@ -186,41 +202,48 @@ def create_unified_document_text(result_dict):
     for i in range(1, 5): text_parts.add(result_dict.get(f'Sinonimo_{i}', ''))
     return ". ".join(sorted([part for part in text_parts if part and str(part).strip()]))
 
 def rerank_with_cross_encoder(query, results_list, model):
-    …
+    """
+    Reorders a list of results using a Cross-Encoder model and a weighted formula.
+    """
+    if not model or not results_list or not query:
+        return results_list, "Cross-Encoder não fornecido ou lista de candidatos vazia."
+
     sentence_pairs = [[query, create_unified_document_text(result)] for result in results_list]
-    if not sentence_pairs: …
+    if not sentence_pairs:
+        return results_list, "Não foram encontrados pares para reordenar."
 
     try:
+        # Compute the semantic scores
         raw_scores = model.predict(sentence_pairs, show_progress_bar=False)
         semantic_scores_normalized = torch.sigmoid(torch.tensor(raw_scores)).numpy() * 100
         for i, result in enumerate(results_list):
             result['semantic_score'] = round(semantic_scores_normalized[i])
 
-        # …
-        …
+        # --- IMPROVEMENT 1: WEIGHTED HYBRID RANKING ---
+        # The sort key is now a weighted average of the two scores, balancing
+        # semantic understanding against textual relevance.
+        def weighted_hybrid_sort_key(result):
             sem_score = result.get('semantic_score', 0)
             txt_score = result.get('text_score', 0)
-            …
-            if sem_score >= 85:
-                return (1, sem_score, txt_score)  # Group 1 (high confidence)
-            # If the AI has low/medium confidence, the sum with the text score decides.
-            else:
-                return (0, sem_score + txt_score, sem_score)  # Group 0 (mixed confidence)
+            # Formula: 60% semantic score + 40% text score
+            return (sem_score * 0.6) + (txt_score * 0.4)
 
-        reranked_results = sorted(results_list, key=…
-        log_message = f"Reordenação …
+        reranked_results = sorted(results_list, key=weighted_hybrid_sort_key, reverse=True)
+        log_message = f"Reordenação por score ponderado (60% semântico, 40% texto) em {len(reranked_results)} candidatos."
         return reranked_results, log_message
 
     except Exception as e:
         log_message = f"Erro no Cross-Encoder: {e}"; print(log_message)
+        # Fall back to the text score if the AI fails
         key_function = lambda x: (x.get('text_score', 0), x.get('is_rol_procedure', False))
         reranked_results = sorted(results_list, key=key_function, reverse=True)
         return reranked_results, log_message
 
+
 def _boost_technical_jargon_matches(results, query_words, doc_freq, portuguese_word_set, boost_factor=1.2, rarity_threshold_count=10):
+    """ Boosts results that contain technical jargon (rare AND absent from the dictionary)."""
     if not results or not query_words: return results, None
     technical_jargon_terms = {
         word for word in query_words
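The reranker expects a sentence-transformers CrossEncoder; its raw logits are squashed by torch.sigmoid onto a 0-100 scale before the weighted key is applied. Below is a minimal usage sketch, assuming the sentence-transformers package; the model id and the result-dict fields are illustrative assumptions, not taken from this repo.

    # Hypothetical usage; the model id and result fields are assumptions.
    from sentence_transformers import CrossEncoder

    model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")  # illustrative choice
    results = [
        {'Procedimento': 'Vacina pneumocócica conjugada', 'text_score': 96,
         'match_type': 'Busca Exata', 'row_index': 0},
        {'Procedimento': 'Consulta pediátrica', 'text_score': 41,
         'match_type': 'Busca por Relevância (BM25)', 'row_index': 1},
    ]
    reranked, log = rerank_with_cross_encoder("pneumococo", results, model)
    print(log)  # "Reordenação por score ponderado (60% semântico, 40% texto) em 2 candidatos."
    for r in reranked:
        print(r['Procedimento'], r['semantic_score'], r['text_score'])

Note that the ms-marco model above is English-only; a multilingual cross-encoder would suit this Portuguese corpus better, so treat the id purely as a placeholder.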
@@ -234,14 +257,12 @@ def _boost_technical_jargon_matches(results, query_words, doc_freq, portuguese_w
             boosted_score = min(result['text_score'] * boost_factor, 99)
             result.update({'text_score': round(boosted_score), 'score': round(boosted_score), 'match_type': result['match_type'] + " + Jargão Boost"})
             boosted_indices.append(result['row_index'])
-    return results, f"…
+    return results, f"Boost de Jargão: {list(technical_jargon_terms)} ({len(boosted_indices)} afetados)."
+
 
-# --- IMPROVEMENT 3A: BOOST FOR ULTRA-RARE WORDS ---
 def _boost_extremely_rare_words(results, query_words, doc_freq, boost_factor=1.3, rarity_threshold_count=3):
     """ Boosts results containing query words that are extremely rare in the database."""
     if not results or not query_words: return results, None
-    …
-    # Identify query words that appear in 3 or fewer documents.
     extremely_rare_words = {word for word in query_words if doc_freq.get(word, 0) <= rarity_threshold_count and len(word) > 3}
     if not extremely_rare_words: return results, None
 
@@ -251,7 +272,7 @@ def _boost_extremely_rare_words(results, query_words, doc_freq, boost_factor=1.3
             boosted_score = min(result['text_score'] * boost_factor, 99)
             result.update({'text_score': round(boosted_score), 'score': round(boosted_score), 'match_type': result['match_type'] + " + Ultra-Rare Boost"})
             boosted_indices.append(result['row_index'])
-    return results, f"…
+    return results, f"Boost Ultra-Raro: {list(extremely_rare_words)} ({len(boosted_indices)} afetados)."
 
 
 # --- INTERNAL LAYERED-SEARCH FUNCTION --- #
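Both boost helpers consume doc_freq (how many documents contain each word), and the jargon boost additionally consumes portuguese_word_set (a dictionary wordlist); neither is built in this commit. A plausible construction sketch follows, under the assumption that documents are pre-normalized and whitespace-tokenized; all names and data here are illustrative.

    # Assumed construction of the boost inputs; not shown in this commit.
    from collections import Counter

    def build_doc_freq(normalized_docs):
        """doc_freq[word] = number of documents that contain the word at least once."""
        doc_freq = Counter()
        for doc in normalized_docs:
            doc_freq.update(set(doc.split()))  # set(): count each word once per document
        return doc_freq

    corpus = ["vacina pneumococica conjugada", "consulta pediatrica", "vacina contra gripe"]
    doc_freq = build_doc_freq(corpus)
    print(doc_freq["vacina"])        # 2 -> too common for the ultra-rare boost
    print(doc_freq["pneumococica"])  # 1 -> appears in <= 3 documents, eligible for the 1.3x boost
    # portuguese_word_set would be a plain set loaded from a wordlist; the jargon
    # boost fires only for words that are rare AND absent from that set.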
@@ -344,7 +365,6 @@ def search_procedure_with_log(query, df_original, df_normalized, fuzzy_search_co
                               cross_encoder_model=None,
                               user_best_matches_counts=None, user_feedback_threshold=10):
     start_time = time.time(); original_query = str(query).strip()
-    # --- IMPROVEMENT 1: CONSTANTS FOR LIMITS ---
     BROAD_BLOCK_CANDIDATE_LIMIT = 30
     FINAL_RESULTS_LIMIT = 20
 
@@ -352,82 +372,82 @@
     if not original_query: response["search_log"].append("Query vazia."); return response
     response["search_log"].append(f"Buscando por: '{original_query}'")
 
+    # --- STEP 1: QUERY PREPARATION AND CORRECTION ---
     stopwords = {'de', 'do', 'da', 'dos', 'das', 'a', 'o', 'e', 'em', 'um', 'uma', 'para', 'com'}
-    …
-    query_after_correction = original_query
-    # ... (query-correction logic) ...
+    # (The query-correction code, if applicable, continues here)
+    query_after_correction = original_query
 
     cleaned_query = " ".join([word for word in query_after_correction.split() if normalize_text(word) not in stopwords])
     normalized_query = normalize_text(cleaned_query)
     if not cleaned_query.strip(): response["search_log"].append("Query resultante vazia."); return response
     if cleaned_query != query_after_correction: response["search_log"].append(f"Query limpa (sem stop words): '{cleaned_query}'")
 
+    # --- STEP 2: RUNNING THE SEARCH LAYERS ---
     _run_search_layers(literal_normalize_text(query_after_correction), normalized_query, response, df_original, df_normalized, fuzzy_search_corpus, bm25_model, limit_per_layer)
 
-    # --- IMPROVEMENT 2: FULL LOG RESTORED ---
     layer_names_pt = {"literal_matches": "Busca Literal", "exact_matches": "Busca Exata", "phrase_matches": "Busca por Frase", "fuzzy_matches": "Busca por Aproximação", "logical_matches": "Busca Lógica (E)", "term_matches": "Busca por Relevância (BM25)", "keyword_matches": "Busca por Palavra-Chave"}
     response["search_log"].append("\n--- Detalhamento por Camada ---")
     for key, name in layer_names_pt.items(): response["search_log"].append(f"Camada '{name}': {len(response['results_by_layer'].get(key, []))} candidatos.")
 
+    # --- STEP 3: AGGREGATION INTO BLOCKS ---
     response["search_log"].append("\n--- Agregação de Candidatos em Blocos ---")
     protected_candidates, broad_candidates, seen_indices = [], [], set()
 
     protected_layers = ["literal_matches", "exact_matches", "phrase_matches"]
     for layer_name in protected_layers:
         for result in response['results_by_layer'].get(layer_name, []):
-            if result['row_index'] not in seen_indices:
-                protected_candidates.append(result); seen_indices.add(result['row_index'])
+            if result['row_index'] not in seen_indices: protected_candidates.append(result); seen_indices.add(result['row_index'])
 
     HIGH_FUZZ_THRESHOLD = 95
     for result in response['results_by_layer'].get('fuzzy_matches', []):
         if result['row_index'] not in seen_indices:
-            if result.get('fuzz_score', 0) >= HIGH_FUZZ_THRESHOLD:
-                protected_candidates.append(result)
+            if result.get('fuzz_score', 0) >= HIGH_FUZZ_THRESHOLD: protected_candidates.append(result)
             else: broad_candidates.append(result)
             seen_indices.add(result['row_index'])
 
     broad_layers = ["logical_matches", "term_matches", "keyword_matches"]
     for layer_name in broad_layers:
         for result in response['results_by_layer'].get(layer_name, []):
-            if result['row_index'] not in seen_indices:
-                broad_candidates.append(result); seen_indices.add(result['row_index'])
+            if result['row_index'] not in seen_indices: broad_candidates.append(result); seen_indices.add(result['row_index'])
 
-    # --- IMPROVEMENT 1 (APPLIED): CAP THE BROAD BLOCK ---
     broad_candidates = sorted(broad_candidates, key=lambda x: x.get('text_score', 0), reverse=True)[:BROAD_BLOCK_CANDIDATE_LIMIT]
     response["search_log"].append(f"Candidatos - Bloco Protegido: {len(protected_candidates)}, Bloco Amplo (limitado a {BROAD_BLOCK_CANDIDATE_LIMIT}): {len(broad_candidates)}")
 
     for cand_list in [protected_candidates, broad_candidates]:
         for r in cand_list: r['full_text_norm'] = df_normalized.loc[r['row_index'], 'full_text_norm']
 
+    # --- STEP 4: APPLYING THE BOOSTS ---
     response["search_log"].append("\n--- Aplicação de Boosts ---")
    query_words_for_boost = [word for word in normalized_query.split() if word not in stopwords]
 
-    # Apply the two kinds of boost
     for c_list in [protected_candidates, broad_candidates]:
         _, boost_log_jargon = _boost_technical_jargon_matches(c_list, query_words_for_boost, doc_freq, portuguese_word_set)
         if boost_log_jargon: response["search_log"].append(boost_log_jargon)
         _, boost_log_rare = _boost_extremely_rare_words(c_list, query_words_for_boost, doc_freq)
         if boost_log_rare: response["search_log"].append(boost_log_rare)
 
-    …
-    # User-feedback logic...
-    pass
+    # (User-feedback prioritization logic, if applicable)
 
-    …
+    # --- STEP 5: FINAL RERANKING AND RESULTS ---
+    response["search_log"].append("\n--- Reordenação Final por Bloco ---")
     final_list = []
     query_for_semantic = response.get("corrected_query") or cleaned_query
 
     for block_name, candidates in [("Protegido", protected_candidates), ("Amplo", broad_candidates)]:
         if not candidates: continue
-        …
+
+        # Sort the block's candidates by text score before sending them to the AI
         to_rerank_sorted = sorted(candidates, key=lambda x: x.get('text_score', 0), reverse=True)
+
+        # Rerank with the AI using the weighted key
         reranked_by_ia, log_msg = rerank_with_cross_encoder(query_for_semantic, to_rerank_sorted, cross_encoder_model)
+
         final_list.extend(reranked_by_ia)
         response["search_log"].append(f"Bloco '{block_name}': {log_msg}")
 
     response["final_semantic_results"] = _highlight_matches(final_list[:FINAL_RESULTS_LIMIT], query_for_semantic)
     end_time = time.time(); response["search_duration_seconds"] = round(end_time - start_time, 4)
-    response["search_log"].append(f"…
+    response["search_log"].append(f"\nBusca completa em {response['search_duration_seconds']} segundos.")
     print(f"\n\n==================== LOG DE DEPURAÇÃO (QUERY: '{original_query}') ====================")
     for log_item in response["search_log"]: print(log_item)
     return response
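Item 4 of the header comment cites a BM25 retune to k1=1.2, but the creation of bm25_model lies outside these hunks. Below is a hedged sketch of what that setup could look like, assuming the rank-bm25 package (an assumption; the repo may build its model elsewhere or with a different library).

    # Assumed BM25 setup; k1=1.2 comes from the header comment, the rest is illustrative.
    from rank_bm25 import BM25Okapi

    tokenized_corpus = [doc.split() for doc in [
        "vacina pneumococica conjugada",
        "consulta pediatrica de rotina",
        "vacina contra gripe",
    ]]
    # Lowering k1 below the 1.5 default saturates term frequency sooner, so rare,
    # high-IDF terms dominate the ranking, which is what the header describes.
    bm25_model = BM25Okapi(tokenized_corpus, k1=1.2)
    print(bm25_model.get_scores(["pneumococica"]))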