ssrogue committed on Oct 2

Commit

b1e8fe0

verified ·

1 Parent(s): 8f160bc

Upload folder using huggingface_hub

Browse files

Files changed (25) hide show

.gitattributes +6 -0
LICENSE +21 -0
README.md +196 -3
__init__.py +38 -0
ablation_and_evaluation/ablation_studies.py +239 -0
ablation_and_evaluation/eval2.py +143 -0
ablation_and_evaluation/evaluation_studies.py +193 -0
config.py +14 -0
data/Fin_ExBERT_data.xlsx +3 -0
data/Fin_ExBERT_test_set.xlsx +3 -0
data/Fin_ExBERT_train_val_data.xlsx +3 -0
finetune_lora.py +164 -0
images/methodology_flowchart.png +3 -0
images/test.txt +1 -0
main.py +132 -0
models.py +251 -0
preprocess_data.py +210 -0
requirements.txt +26 -0
results/Ablation.txt +46 -0
results/Fin-ExBERT.pptx +3 -0
results/ablation_study.png +0 -0
results/combined_results.xlsx +3 -0
results/fine_tuning_results.png +0 -0
results/methods_summary.xlsx +0 -0
utils.py +967 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/Fin_ExBERT_data.xlsx filter=lfs diff=lfs merge=lfs -text
+data/Fin_ExBERT_test_set.xlsx filter=lfs diff=lfs merge=lfs -text
+data/Fin_ExBERT_train_val_data.xlsx filter=lfs diff=lfs merge=lfs -text
+images/methodology_flowchart.png filter=lfs diff=lfs merge=lfs -text
+results/combined_results.xlsx filter=lfs diff=lfs merge=lfs -text
+results/Fin-ExBERT.pptx filter=lfs diff=lfs merge=lfs -text

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Soumick Sarker
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,3 +1,196 @@
----
-license: mit
----

+# FinExBERT: Financial Sentence Extraction with Graph-Augmented BERT
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-green.svg)](https://www.python.org/downloads/)
+[![PyTorch](https://img.shields.io/badge/PyTorch-1.9+-red.svg)](https://pytorch.org/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![arXiv](https://img.shields.io/badge/arXiv-2509.23259-b31b1b.svg)](https://www.arxiv.org/abs/2509.23259)
+> A state-of-the-art neural architecture for extracting relevant sentences from financial conversations using graph-augmented BERT with dependency parsing.
+**Accepted at EMNLP 2025 Industry Track**
+## Overview
+FinExBERT combines BERT's contextual understanding with graph neural networks to capture syntactic dependencies in financial conversations. The model achieves superior performance in extracting relevant sentences based on user intent, making it particularly effective for financial customer service applications.
+### Problem Statement
+Traditional sequence-to-sequence models struggle with:
+- Complex financial terminology and context
+- Long conversation dependencies
+- Intent-based sentence extraction
+- Domain-specific reasoning requirements
+### Our Solution
+FinExBERT addresses these challenges through:
+- **Graph-Augmented Architecture**: Incorporates dependency parsing graphs to capture syntactic relationships
+- **Financial Domain Adaptation**: LoRA fine-tuning on financial datasets
+- **Intent-Aware Extraction**: Semantic similarity matching for targeted sentence selection
+- **Efficient Training**: Mixed precision training with gradient accumulation
+## Key Features
+- 🏆 **State-of-the-art Performance**: Outperforms baseline BERT by 37 percentage points in accuracy (0.694 vs. 0.323) on financial conversation tasks
+- 🧠 **Graph Neural Networks**: Integrates dependency parsing for enhanced linguistic understanding
+- 💰 **Financial Domain Expertise**: Pre-trained on financial conversation data
+- ⚡ **Production Ready**: Optimized for real-world deployment with batched inference
+- 🔧 **Flexible Architecture**: Configurable model components for different use cases
+- 📊 **Comprehensive Evaluation**: Extensive ablation studies and evaluation metrics
+## Installation
+### Prerequisites
+- Python 3.10 or higher
+- PyTorch 1.9 or higher
+- CUDA 11.0+ (for GPU acceleration)
+### Install dependencies
+```bash
+git clone https://github.com/soumick1/Fin-ExBERT.git
+cd Fin-ExBERT
+pip install -r requirements.txt
+```
+## Quick Start
+### Download the model weights
+Download the weights from the [Weights Link](https://drive.google.com/drive/folders/1jm3Yxpew8Y8mVsRizTyVvXKrGBXQ3ApI?usp=sharing)
+and place the 3 downloaded folders inside the cloned directory.
+### Data setup
+The CreditCall12H Dataset is available in the 'data' folder. If you want to train or test on your own data please use the same format.
+### Basic Usage and Testing
+```python
+from utils import batch_predict_and_save
+from config import *
+from preprocess_data import SentenceDataset
+from models import SentenceExtractionModel
+# Initialize the model
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  ### You can change the tokenizer if you want
+dataset = SentenceDataset("data/Fin_ExBERT_train_val_data.xlsx", tokenizer)
+model = SentenceExtractionModel(
+    base_model_name=MODEL_NAME,
+    backbone='finexbert'
+)
+# Extract relevant sentences
+batch_predict_and_save(
+    model,
+    tokenizer,
+    excel_path="data/Fin_ExBERT_test_set.xlsx",
+    ckpt_path="checkpoints/sentence_extractor/best_model.pth",
+    output_path="results/predictions_sample200.xlsx",
+    n_samples=200,
+    temperature=1.0,
+    device="cuda"
+)
+```
+### Training the model
+```python
+from utils import train_model_with_chkpt
+from config import *
+from preprocess_data import SentenceDataset
+from models import SentenceExtractionModel
+# Initialize the model
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  ### You can change the tokenizer if you want
+dataset = SentenceDataset("data/Fin_ExBERT_train_val_data.xlsx", tokenizer)
+model = SentenceExtractionModel(
+    base_model_name=MODEL_NAME,
+    backbone='finexbert'
+)
+train_sentence_extractor(
+    model,
+    dataset,
+    output_dir="checkpoints/sentence_extractor",
+    val_split=0.3,
+    epochs=10,
+    batch_size=16,
+    lr=3e-4,
+    device=DEVICE,
+    unfreeze_after_epoch=4
+)
+```
+## Model Architecture
+![FinExBERT Architecture](images/methodology_flowchart.png)
+### Core Components
+1. **BERT Encoder**: Contextual embeddings for input sequences
+2. **Dependency Graph Parser**: SpaCy-based syntactic analysis
+3. **Graph Neural Network**: Message passing over dependency graphs
+4. **Fusion Layer**: Combines BERT and GNN representations
+5. **Classification Head**: Intent-aware sentence scoring
+### Technical Details
+- **Base Model**: BERT-base-uncased (110M parameters)
+- **GNN Architecture**: Simple message passing with attention
+- **Training Strategy**: LoRA adaptation + full fine-tuning
+## Evaluation
+### Ablation Studies
+We provide comprehensive ablation studies comparing:
+- Baseline BERT vs. Graph-Augmented BERT
+- Different GNN architectures
+- Various training strategies
+- Domain adaptation techniques
+### Performance Metrics
+| Model | Accuracy | F1-Score | Precision | Recall |
+|-------|----------|----------|-----------|--------|
+| BERT Baseline | 0.323 | 0.163 | 0.145 | 0.189 |
+| FinExBERT | 0.694 | 0.418 | 0.456 | 0.391 |
+| **Improvement (percentage points)** | **+37** | **+26** | **+31** | **+20** |
+## Citation
+If you use FinExBERT in your research, please cite:
+```bibtex
+Will post it soon
+```
+## License
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+## Acknowledgments
+- Built on top of [Transformers](https://github.com/huggingface/transformers) by Hugging Face
+- Graph processing with [SpaCy](https://spacy.io/)
+- Training infrastructure powered by [PyTorch](https://pytorch.org/)
+## Support
+- 📧 Email: [email protected]
+---
+<div align="center">
+  <strong>FinExBERT</strong> - Advancing Financial NLP with Graph-Augmented Models
+</div>

__init__.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import subprocess
+import sys
+def install_without_version(package_name):  # Install/upgrade `package_name` with pip unless it can already be imported.
+    package_name = str(package_name)  # coerce so non-str inputs still work in the f-strings and the pip call
+    try:
+        __import__(package_name)  # import probe; assumes the import name equals the pip distribution name
+        print(f"{package_name} is already installed.")
+    except ImportError:
+        print(f"{package_name} is not installed. Installing now...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", '-U', package_name])  # pip runs under the current interpreter; check_call raises on a non-zero exit
+def install_with_version(package_name, package_version):  # Make sure exactly `package_version` of `package_name` is installed, (re)installing with pip otherwise.
+    package_name = str(package_name)
+    package_version = str(package_version)
+    try:
+        pkg = __import__(package_name)
+        installed_version = pkg.__version__  # NOTE(review): a module without __version__ raises AttributeError, which is not caught below
+        if installed_version == package_version:  # plain string equality, not a semantic version comparison
+            print(f"{package_name} {package_version} is already installed.")
+        else:
+            print(f"{package_name} {installed_version} is installed, but {package_version} is required. Updating now...")
+            subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={package_version}"])  # replace the mismatching version with the pinned one
+    except ImportError:  # the package cannot be imported at all
+        print(f"Installing version {package_version} now...")
+        subprocess.check_call([sys.executable, "-m", "pip", "install", f"{package_name}=={package_version}"])
+if __name__=='__main__':
+    packages = [['torch', ''], ['datasets', ''], ['spacy', ''], ['networkx', ''], ['numpy', '1.26.4']]
+    for package in packages:
+        if package[1] == '':
+            install_without_version(package[0])
+        else:
+            install_with_version(package[0], package[1])
+    subprocess.check_call([sys.executable, "python", "-m", "spacy", "download", "en_core_web_sm"])

ablation_and_evaluation/ablation_studies.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import os
+import random
+import logging
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+import matplotlib.pyplot as plt
+from sklearn.metrics import accuracy_score, f1_score
+from torch.utils.data import DataLoader, Subset
+from datasets import load_from_disk
+from utils import my_collate_fn
+from config import MODEL_NAME, PREPROCESSED_DIR, DEVICE
+from preprocess_data import process_data, SpanExtractionChunkedDataset, span_collate_fn
+from models import GraphAugmentedNLIModel
+from transformers import AutoConfig, AutoModel
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+# ---------------------
+# 1) Define a BERT‐only baseline
+# ---------------------
+class BertOnlyNLIModel(nn.Module):  # Baseline classifier: pretrained encoder + dropout + linear head, no graph inputs.
+    def __init__(self, base_model_name: str, num_labels: int = 3):  # base_model_name is handed to AutoConfig/AutoModel.from_pretrained
+        super().__init__()
+        config = AutoConfig.from_pretrained(base_model_name)
+        config.num_labels = num_labels
+        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
+        hidden_dim = config.hidden_size  # encoder output width, used as the classifier's input size
+        self.dropout = nn.Dropout(0.1)
+        self.classifier = nn.Linear(hidden_dim, num_labels)  # one logit per label
+    def forward(self, input_ids, attention_mask, labels=None):  # returns {"loss", "logits"}; loss stays None when labels is None
+        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
+        cls_emb = outputs.last_hidden_state[:, 0, :]  # hidden state at position 0 of each sequence (presumably the [CLS] token)
+        x = self.dropout(cls_emb)
+        logits = self.classifier(x)
+        loss = None
+        if labels is not None:
+            loss_fn = nn.CrossEntropyLoss()  # unweighted cross-entropy over the label logits
+            loss = loss_fn(logits, labels)
+        return {"loss": loss, "logits": logits}
+# ---------------------
+# 2) Training & evaluation routines
+# ---------------------
+def set_seed(seed=42):  # Seed Python's random, NumPy and PyTorch (CPU, plus every CUDA device when available).
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)  # seeds all visible GPUs
+def train_one_epoch(model, loader, optimizer, scheduler):  # Run one training pass over `loader`; returns the mean batch loss as a float.
+    model.train()
+    losses = []  # per-batch loss values, averaged at the end
+    is_gnn = hasattr(model, "gnn_premise")  # True for GraphAugmentedNLIModel
+    for batch in tqdm(loader, leave=False):
+        optimizer.zero_grad()
+        # Move all tensor fields to DEVICE (non-tensor fields are passed through unchanged)
+        batch = {
+            k: v.to(DEVICE) if torch.is_tensor(v) else v
+            for k, v in batch.items()
+        }
+        if is_gnn:  # graph-augmented model also receives the premise/hypothesis graph fields
+            out = model(
+                input_ids=batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+                premise_graph_tokens=batch["premise_graph_tokens"],
+                premise_graph_edges=batch["premise_graph_edges"],
+                premise_node_indices=batch["premise_node_indices"],
+                hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                hypothesis_node_indices=batch["hypothesis_node_indices"],
+                labels=batch["labels"],
+            )
+        else:  # text-only baseline
+            out = model(
+                input_ids=batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+                labels=batch["labels"],
+            )
+        loss = out["loss"]  # labels are always passed here, so the model is expected to return a loss
+        loss.backward()
+        optimizer.step()
+        scheduler.step()  # the scheduler advances once per batch, not once per epoch
+        losses.append(loss.item())
+    return float(np.mean(losses))
+@torch.no_grad()
+def evaluate(model, loader):  # Predict over `loader` without gradients; returns (accuracy, macro-F1).
+    model.eval()
+    preds, golds = [], []  # predicted and gold label ids accumulated over all batches
+    is_gnn = hasattr(model, "gnn_premise")  # same model-type probe as in train_one_epoch
+    for batch in loader:
+        batch = {
+            k: v.to(DEVICE) if torch.is_tensor(v) else v
+            for k, v in batch.items()
+        }
+        if is_gnn:
+            out = model(
+                input_ids=batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+                premise_graph_tokens=batch["premise_graph_tokens"],
+                premise_graph_edges=batch["premise_graph_edges"],
+                premise_node_indices=batch["premise_node_indices"],
+                hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                hypothesis_node_indices=batch["hypothesis_node_indices"],
+            )
+        else:
+            out = model(
+                input_ids=batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+            )
+        logits = out["logits"].cpu().numpy()  # labels are not passed to the model, so only logits are used
+        preds.extend(np.argmax(logits, axis=1).tolist())  # predicted class = highest logit per row
+        golds.extend(batch["labels"].cpu().tolist())
+    acc = accuracy_score(golds, preds)
+    f1  = f1_score(golds, preds, average="macro")  # unweighted mean of the per-class F1 scores
+    return acc, f1
+# ---------------------
+# 3) Ablation runner
+# ---------------------
+def run_ablation(  # Train the BERT-only and GNN-augmented models on a random data subset, evaluate both, and plot the results.
+    epochs=3,
+    batch_size=16,
+    lr=2e-5,
+    sample_frac=0.05,    # ← fraction of data to use
+):
+    set_seed()
+    process_data()  # presumably produces the dataset loaded from PREPROCESSED_DIR below — confirm in preprocess_data
+    # --- Load the preprocessed SNLI dataset from disk ---
+    snli = load_from_disk(PREPROCESSED_DIR)
+    full_train = snli["train"]
+    full_val   = snli["validation"]
+    # --- Sample a `sample_frac` fraction of each split (at least one example) ---
+    num_train = len(full_train)
+    num_val   = len(full_val)
+    n_train   = max(1, int(sample_frac * num_train))
+    n_val     = max(1, int(sample_frac * num_val))
+    # reproducible shuffling (the random module was seeded by set_seed() above)
+    train_indices = list(range(num_train))
+    random.shuffle(train_indices)
+    train_subset = Subset(full_train, train_indices[:n_train])
+    val_indices = list(range(num_val))
+    random.shuffle(val_indices)
+    val_subset = Subset(full_val, val_indices[:n_val])
+    # --- Build DataLoaders with the SNLI collate_fn ---
+    train_loader = DataLoader(
+        train_subset,
+        batch_size=batch_size,
+        shuffle=True,
+        collate_fn=my_collate_fn,
+        num_workers=4,
+        pin_memory=True,
+    )
+    val_loader = DataLoader(
+        val_subset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=my_collate_fn,
+        num_workers=2,
+        pin_memory=True,
+    )
+    # 4) Define models (both are built and moved to DEVICE before either one is trained)
+    models = {
+        "Baseline-BERT": BertOnlyNLIModel(MODEL_NAME).to(DEVICE),
+        "GNN-Augmented": GraphAugmentedNLIModel(
+            base_model_name=MODEL_NAME,
+            num_labels=3,
+            hidden_dim=768,
+            gnn_dim=128
+        ).to(DEVICE),
+    }
+    results = {}  # model name -> {"accuracy": ..., "f1": ...}
+    for name, model in models.items():
+        logging.info(f"--- Training {name} on {sample_frac*100:.0f}% of data ---")
+        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+        total_steps = epochs * len(train_loader)  # one scheduler step per batch (see train_one_epoch)
+        scheduler = torch.optim.lr_scheduler.LinearLR(
+            optimizer,
+            start_factor=0.1,
+            total_iters=total_steps
+        )  # LR factor starts at 0.1 and changes linearly over the whole run
+        # training loop
+        for epoch in range(1, epochs + 1):
+            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler)
+            logging.info(f"{name} Epoch {epoch}: train_loss={train_loss:.4f}")
+        # evaluation (once, after the final epoch)
+        acc, f1 = evaluate(model, val_loader)
+        logging.info(f"{name} on {sample_frac*100:.0f}% val → acc={acc:.4f}, f1={f1:.4f}")
+        results[name] = {"accuracy": acc, "f1": f1}
+    # 5) Plot (two separate bar charts: accuracy, then macro-F1)
+    names = list(results.keys())
+    accs  = [results[n]["accuracy"] for n in names]
+    f1s   = [results[n]["f1"] for n in names]
+    plt.figure()
+    plt.bar(names, accs)
+    plt.xlabel("Model")
+    plt.ylabel("Validation Accuracy")
+    plt.title(f"Ablation on {sample_frac*100:.0f}% Data: Accuracy")
+    plt.figure()
+    plt.bar(names, f1s)
+    plt.xlabel("Model")
+    plt.ylabel("Validation Macro-F1")
+    plt.title(f"Ablation on {sample_frac*100:.0f}% Data: Macro-F1")
+    plt.show()  # nothing is written to disk here; the figures are only displayed
+if __name__ == "__main__":
+    run_ablation(epochs=5, batch_size=16, lr=5e-3, sample_frac=0.1)  # overrides the defaults (epochs=3, lr=2e-5, sample_frac=0.05); NOTE(review): lr=5e-3 is 250x the function default — confirm it is intended

ablation_and_evaluation/eval2.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os, re, numpy as np, pandas as pd
+from tqdm.auto import tqdm
+from datasets import load_dataset
+from transformers import pipeline
+from utils import extract_sentences_by_intent, nlp   # <- your spaCy model
+# ─── CONFIG ──────────────────────────────────────────────────────────
+TOP_K       = 3          # candidate spans per example
+N_PER_DS    = 200        # keep *valid* examples per dataset
+BATCH_SIZE  = 16
+DEVICE      = 0          # GPU id (-1 = CPU)
+OUTPUT_PATH = "results/combined_results.xlsx"
+# ────────────────────────────────────────────────────────────────────
+# ── helper: flatten arbitrary json-ish field to plain text ──────────
+def flatten_to_text(x):  # Recursively collapse a str/dict/list/tuple structure into one newline-joined string.
+    if isinstance(x, str):
+        return x
+    if isinstance(x, dict):
+        if "text" in x and isinstance(x["text"], str):  # a string "text" field wins over every other key
+            return x["text"]
+        return "\n".join(flatten_to_text(v) for v in x.values())  # otherwise flatten all values in dict order
+    if isinstance(x, (list, tuple)):
+        return "\n".join(flatten_to_text(v) for v in x)
+    return str(x)  # any other type (numbers, None, ...) is stringified
+# ── helper: map any label (“score 3” or “good answer”) → int 1-5 ────
+LABEL_STRINGS = [
+    "very bad answer",   # 1
+    "bad answer",        # 2
+    "acceptable answer", # 3
+    "good answer",       # 4
+    "perfect answer"     # 5
+]
+def label_to_int(lbl: str) -> int:  # Convert a judge label to a 1-5 score; unrecognised labels score 1.
+    m = re.search(r"([1-5])", lbl)  # first digit 1-5 anywhere in the label
+    if m:                       # the label already contains a digit
+        return int(m.group(1))
+    for i, s in enumerate(LABEL_STRINGS, 1):  # substring match; "very bad answer" is tested before "bad answer", so it wins
+        if s in lbl.lower():
+            return i
+    return 1                    # fallback
+# ── datasets – full splits will be shuffled, then filtered ──────────
+datasets_info = [  # tuples of (display name, Hub dataset id, split, trust_remote_code flag, extra load_dataset kwargs)
+    ("FinQA-10K", "virattt/financial-qa-10K", "train",      False, {}),
+    ("SQuAD",     "rajpurkar/squad",          "validation", False, {}),
+]
+# ── zero-shot classification judges (three MNLI checkpoints) ────────
+candidate_labels = LABEL_STRINGS           # same list for every judge
+judge1 = pipeline("zero-shot-classification",
+                  model="roberta-large-mnli",
+                  device=DEVICE, batch_size=BATCH_SIZE)
+judge2 = pipeline("zero-shot-classification",
+                  model="microsoft/deberta-base-mnli",
+                  device=DEVICE, batch_size=BATCH_SIZE)
+judge3 = pipeline("zero-shot-classification",
+                  model="valhalla/distilbart-mnli-12-3",
+                  device=DEVICE, batch_size=BATCH_SIZE)
+# ── main loop ───────────────────────────────────────────────────────
+rows = []  # one result dict per kept example, across all datasets
+for ds_name, hf_id, split, trust_code, extra_kwargs in datasets_info:
+    print(f"\n→ Loading {ds_name} ({hf_id}#{split}) and collecting {N_PER_DS} examples…")
+    ds = load_dataset(hf_id, split=split, trust_remote_code=trust_code, **extra_kwargs)
+    ds = ds.shuffle(seed=42)  # fixed seed so the same examples are visited on every run
+    collected, bar = 0, tqdm(total=N_PER_DS, desc=f"{ds_name} valid")
+    for ex in ds:
+        # unified fields: take the first non-empty of several possible column names
+        question = ex.get("question") or ex.get("question_text") or ex.get("query") or ""
+        context  = flatten_to_text(
+            ex.get("context") or ex.get("document_text") or ex.get("story") or ex.get("text") or ""
+        )
+        # keep only if context has ≥ 2 sentences -------------------------------
+        if len(list(nlp(context).sents)) < 2:
+            continue
+        # candidate spans ------------------------------------------------------
+        spans = [s for s, _ in extract_sentences_by_intent(
+            text=context, intent=question, threshold=-1.0, top_k=TOP_K)]
+        if not spans:                       # no hit – fill with defaults; NOTE(review): the default 5.0 is the top score and is included in the averages
+            rows.append({
+                "dataset": ds_name, "question": question, "context": context,
+                "span": "", "score1": 5.0, "score2": 5.0, "score3": 5.0, "score_avg": 5.0
+            })
+            collected += 1; bar.update(1)
+            if collected >= N_PER_DS: break
+            continue
+        prompts = [  # one text per candidate span, classified against candidate_labels by each judge
+            f"Question: {question}\nCandidate answer: {span}\n\n"
+            "On a scale from 1 (completely wrong) to 5 (perfect), reply with a single digit."
+            for span in spans
+        ]
+        # run judges -----------------------------------------------------------
+        out1 = judge1(prompts, candidate_labels=candidate_labels, multi_label=False)
+        out2 = judge2(prompts, candidate_labels=candidate_labels, multi_label=False)
+        out3 = judge3(prompts, candidate_labels=candidate_labels, multi_label=False)
+        j1 = [label_to_int(o["labels"][0]) for o in out1]  # first entry of "labels" is taken as the judge's choice
+        j2 = [label_to_int(o["labels"][0]) for o in out2]
+        j3 = [label_to_int(o["labels"][0]) for o in out3]
+        avg_scores = [(a+b+c)/3.0 for a, b, c in zip(j1, j2, j3)]  # mean of the three judges, per span
+        best = int(np.argmax(avg_scores))  # only the best-scoring span is recorded
+        rows.append({
+            "dataset": ds_name, "question": question, "context": context,
+            "span": spans[best],
+            "score1": float(j1[best]), "score2": float(j2[best]), "score3": float(j3[best]),
+            "score_avg": float(avg_scores[best])
+        })
+        collected += 1; bar.update(1)
+        if collected >= N_PER_DS:
+            break
+    bar.close()
+    if collected < N_PER_DS:  # the dataset ran out before the quota was reached
+        print(f"⚠️  Only {collected} qualifying examples found for {ds_name}")
+# ── save & report ───────────────────────────────────────────────────
+os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)  # create the results/ directory if it is missing
+pd.DataFrame(rows).to_excel(OUTPUT_PATH, index=False)
+print(f"\n✔️  Saved combined results →  {OUTPUT_PATH}")
+print("\n▶︎ Per-dataset judge averages:")
+summary = (pd.DataFrame(rows)
+           .groupby("dataset")[["score1", "score2", "score3", "score_avg"]]
+           .mean())  # one row of mean scores per dataset
+for ds, row in summary.iterrows():  # NOTE: rebinds the name `ds` used for the dataset object in the loop above
+    print(f"  {ds:12s} | "
+          f"Judge1 {row['score1']:.2f}  "
+          f"Judge2 {row['score2']:.2f}  "
+          f"Judge3 {row['score3']:.2f}  "
+          f"Combined {row['score_avg']:.2f}")

ablation_and_evaluation/evaluation_studies.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import os
+import re
+import numpy as np
+import pandas as pd
+from tqdm.auto import tqdm
+from datasets import load_dataset
+from transformers import pipeline
+from utils import extract_sentences_by_intent, nlp
+# ─── CONFIG ────────────────────────────────────────────────────────────────────
+TOP_K       = 3          # number of spans to extract per example
+N_PER_DS    = 200        # how many *valid* examples per dataset
+BATCH_SIZE  = 16         # batch size for judge pipelines
+DEVICE      = 0          # GPU id, or -1 for CPU
+OUTPUT_PATH = "results/combined_results.xlsx"
+# ────────────────────────────────────────────────────────────────────────────────
+def flatten_to_text(x):  # Recursively collapse a str/dict/list structure into one newline-joined string.
+    if isinstance(x, str):
+        return x
+    if isinstance(x, dict):
+        if "text" in x and isinstance(x["text"], str):  # a string "text" field wins over every other key
+            return x["text"]
+        return "\n".join(flatten_to_text(v) for v in x.values())
+    if isinstance(x, list):  # tuples are not handled here and fall through to str(x)
+        return "\n".join(flatten_to_text(v) for v in x)
+    return str(x)
+def label_to_int(lbl: str) -> int:  # Convert a judge label to a 1-5 score; unrecognised labels score 1.
+    # handles labels that contain a digit as well as purely descriptive labels
+    m = re.search(r"([1-5])", lbl)  # first digit 1-5 anywhere in the label
+    if m:                       # digits present -> easy
+        return int(m.group(1))
+    # descriptive version -> exact (case-insensitive) lookup
+    mapping = {
+        "very bad answer": 1,
+        "bad answer": 2,
+        "acceptable answer": 3,
+        "good answer": 4,
+        "perfect answer": 5
+    }
+    return mapping.get(lbl.lower(), 1)  # 1 is the fallback for labels not in the mapping
+# ─── 1) choose these two datasets ──────────────────────────────────────────────
+datasets_info = [  # tuples of (display name, Hub dataset id, split, trust_remote_code flag, extra load_dataset kwargs)
+    ("FinQA-10K", "virattt/financial-qa-10K", "train", False, {}),
+    ("SQuAD",     "rajpurkar/squad",          "validation", False, {}),
+]
+# ─── 2) spin up zero‐shot classification judges ────────────────────────────────
+candidate_labels = [  # same wording and order as the mapping inside label_to_int
+    "very bad answer",   # 1
+    "bad answer",        # 2
+    "acceptable answer", # 3
+    "good answer",       # 4
+    "perfect answer"     # 5
+]
+judge1 = pipeline(
+    "zero-shot-classification",
+    model="roberta-large-mnli",
+    device=DEVICE,
+    batch_size=BATCH_SIZE
+)
+judge2 = pipeline(
+    "zero-shot-classification",
+    model="microsoft/deberta-base-mnli",
+    tokenizer="microsoft/deberta-base-mnli",
+    device=DEVICE,
+    batch_size=BATCH_SIZE
+)
+judge3 = pipeline(
+    "zero-shot-classification",
+    model="valhalla/distilbart-mnli-12-3",
+    tokenizer="valhalla/distilbart-mnli-12-3",
+    device=DEVICE,
+    batch_size=BATCH_SIZE
+)
+all_rows = []
+for ds_name, hf_id, split, trust_code, extra_kwargs in datasets_info:
+    print(f"\n→ Loading {ds_name} ({hf_id}#{split}), gathering {N_PER_DS} valid examples…")
+    # load full split and shuffle
+    ds = load_dataset(hf_id, split=split, trust_remote_code=trust_code, **extra_kwargs)
+    ds = ds.shuffle(seed=42)
+    collected = 0
+    pbar = tqdm(total=N_PER_DS, desc=f"{ds_name} valid examples")
+    for ex in ds:
+        # unify question & context
+        question = (
+            ex.get("question")
+            or ex.get("question_text")
+            or ex.get("query")
+            or ""
+        )
+        raw_ctx = (
+            ex.get("context")
+            or ex.get("document_text")
+            or ex.get("story")
+            or ex.get("text")
+            or ""
+        )
+        context = flatten_to_text(raw_ctx)
+        # only keep examples whose context has at least 2 sentences
+        if len(list(nlp(context).sents)) < 2:
+            continue
+        # extract top-K spans
+        hits = extract_sentences_by_intent(
+            text        = context,
+            intent      = question,
+            threshold   = -1.0,
+            top_k       = TOP_K,
+            convo_focus = None
+        )
+        spans = [s for s,_ in hits]
+        if not spans:
+            # record defaults if no span found
+            all_rows.append({
+                "dataset":   ds_name,
+                "question":  question,
+                "context":   context,
+                "span":      "",
+                "score1":    5.0,
+                "score2":    5.0,
+                "score3":    5.0,
+                "score_avg": 5.0
+            })
+            collected += 1
+            pbar.update(1)
+            if collected >= N_PER_DS:
+                break
+            continue
+        # build prompts
+        prompts = [
+            f"Question: {question}\nCandidate answer: {span}\n\n"
+            "On a scale from 1 (completely wrong) to 5 (perfect), "
+            "reply with a single digit."
+            for span in spans
+        ]
+        # run judges
+        out1 = judge1(prompts, candidate_labels=candidate_labels, multi_label=False)
+        out2 = judge2(prompts, candidate_labels=candidate_labels, multi_label=False)
+        out3 = judge3(prompts, candidate_labels=candidate_labels, multi_label=False)
+        # parse their top‐chosen labels
+        j1 = [int(o["labels"][0]) for o in out1]
+        j2 = [int(o["labels"][0]) for o in out2]
+        j3 = [int(o["labels"][0]) for o in out3]
+        # average per span, pick best
+        avg_scores = [(a+b+c)/3.0 for a,b,c in zip(j1,j2,j3)]
+        best_idx   = int(np.argmax(avg_scores))
+        all_rows.append({
+            "dataset":   ds_name,
+            "question":  question,
+            "context":   context,
+            "span":      spans[best_idx],
+            "score1":    float(j1[best_idx]),
+            "score2":    float(j2[best_idx]),
+            "score3":    float(j3[best_idx]),
+            "score_avg": float(avg_scores[best_idx])
+        })
+        collected += 1
+        pbar.update(1)
+        if collected >= N_PER_DS:
+            break
+    pbar.close()
+    if collected < N_PER_DS:
+        print(f"⚠️ Only found {collected} valid examples for {ds_name}.")
+# ─── dump to Excel ─────────────────────────────────────────────────────────────
+df = pd.DataFrame(all_rows)  # one row per kept example, across all datasets
+os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)  # create the results/ directory if it is missing
+df.to_excel(OUTPUT_PATH, index=False)
+print(f"\n✔️  Saved combined results to ./{OUTPUT_PATH}")
+# ─── per‐dataset summary ───────────────────────────────────────────────────────
+print("\n▶︎ Per‐dataset judge averages:")
+grouped = df.groupby("dataset")[["score1","score2","score3","score_avg"]].mean()  # mean of each score column per dataset
+for ds, row in grouped.iterrows():  # NOTE: rebinds the name `ds` used for the dataset object in the loop above
+    print(f"  {ds}: "
+          f"Judge1={row['score1']:.2f}, "
+          f"Judge2={row['score2']:.2f}, "
+          f"Judge3={row['score3']:.2f}, "
+          f"Combined={row['score_avg']:.2f}")

config.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# config.py
+import torch
+import numpy as np
+from transformers import AutoTokenizer
+import spacy
+MODEL_NAME      = "bert-base-uncased"  # checkpoint name used for the shared tokenizer below
+MAX_LENGTH      = 128  # presumably the maximum token length per chunk — confirm in preprocess_data
+OVERLAP         = 32  # presumably the token overlap between consecutive chunks — confirm in preprocess_data
+PREPROCESSED_DIR= "preprocessed_snli"  # directory the ablation script reads with load_from_disk
+DEVICE          = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, otherwise CPU
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # loaded at import time, so importing config needs the tokenizer files
+nlp       = spacy.load("en_core_web_sm")  # loaded at import time; requires the en_core_web_sm model to be installed

data/Fin_ExBERT_data.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bcd855be3a6ee5d740c9f61439268f5a84a085a2bf5a0593027c80c8a97f34c
+size 919294

data/Fin_ExBERT_test_set.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ca7db6ee8c28319f3fd1736a0bb05cfe5bacf1f33e07549f54fd90134b3706f
+size 324485

data/Fin_ExBERT_train_val_data.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d4804d0e204a40e797a537767c95999e585eb7f50cea416b784311efe76c731
+size 1571514

finetune_lora.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import os
+import torch
+import random
+import numpy as np
+import math
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+from tqdm.auto import tqdm
+from transformers import (
+    AutoTokenizer,
+    AutoModelForMaskedLM,
+    DataCollatorForLanguageModeling,
+    get_linear_schedule_with_warmup,
+)
+from accelerate import Accelerator
+from peft import LoraConfig, get_peft_model
+# Configuration constants
MODEL_NAME = "bert-base-uncased"  # base checkpoint that receives the LoRA adapter
BATCH_SIZE = 16  # examples per DataLoader batch
MAX_LENGTH = 128  # every example is truncated/padded to this many tokens
LEARNING_RATE = 5e-4  # AdamW learning rate
EPOCHS = 20  # total epochs, counted across resumed runs
SEED = 42  # default seed used by set_seed()
ADAPTER_SAVE_DIR = "./lora_finance_adapter"  # adapter + checkpoint output directory
# Per-epoch training checkpoint (model/optimizer/scheduler state and metrics).
CHECKPOINT_PATH = os.path.join(ADAPTER_SAVE_DIR, "training_checkpoint.pt")
def set_seed(seed: int = SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator; defaults to the module SEED.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only touched when a GPU is actually available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
def fine_tune_lora(dataset_name: str = "FinGPT/fingpt-fiqa_qa", split: str = "train"):
    """
    Fine-tune BERT with LoRA on an MLM objective.
    Supports checkpointing and resuming, and plots loss, perplexity, and MLM accuracy per epoch.
    Saves the LoRA adapter and checkpoint in ADAPTER_SAVE_DIR.

    Args:
        dataset_name: Dataset identifier passed to ``datasets.load_dataset``.
        split: Dataset split to train on.

    Returns:
        None. Results are written to ADAPTER_SAVE_DIR and shown as a plot.
    """
    set_seed()
    # Prepare save directory
    os.makedirs(ADAPTER_SAVE_DIR, exist_ok=True)
    # Load and prepare dataset
    dataset = load_dataset(dataset_name, split=split)
    def combine_fields(example):
        # Join the non-empty instruction/input/output fields into one string.
        text = ' '.join([example.get(k, '').strip() for k in ['instruction', 'input', 'output'] if example.get(k)])
        return {"text": text}
    # Every original column is dropped; only the new "text" column remains.
    dataset = dataset.map(combine_fields, remove_columns=[c for c in dataset.column_names if c != 'text'])
    # Tokenization and DataLoader
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    def tokenize_fn(examples):
        # Fixed-length encoding: every example is padded/truncated to MAX_LENGTH.
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH)
    # NOTE(review): presumably only 'text' is left at this point, so this
    # remove_columns list is empty and 'text' is kept; set_format below
    # restricts what is returned to the two tensor columns.
    tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=[c for c in dataset.column_names if c != 'text'])
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    # The collator builds the MLM inputs/labels with a 15% masking probability.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    train_loader = DataLoader(
        tokenized,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=collator,
        num_workers=4,
        pin_memory=True,
    )
    # Model, LoRA, optimizer, scheduler
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
    # NOTE(review): task_type is 'CAUSAL_LM' although the wrapped model is a
    # masked LM — confirm this is intentional before changing it, since code
    # that reloads the adapter may rely on the saved configuration.
    lora_cfg = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, bias='none', task_type='CAUSAL_LM')
    model = get_peft_model(model, lora_cfg)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # Linear schedule over all epochs with the first 10% of steps as warm-up.
    total_steps = EPOCHS * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)
    # Accelerator
    accelerator = Accelerator()
    model, optimizer, train_loader, scheduler = accelerator.prepare(model, optimizer, train_loader, scheduler)
    device = accelerator.device
    # Metrics storage and resume state
    start_epoch = 1
    epoch_losses = []
    epoch_ppls = []
    epoch_accs = []
    # Load checkpoint if exists
    if os.path.exists(CHECKPOINT_PATH):
        ckpt = torch.load(CHECKPOINT_PATH, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
        optimizer.load_state_dict(ckpt['optimizer_state_dict'])
        scheduler.load_state_dict(ckpt['scheduler_state_dict'])
        # Continue with the epoch after the last one that was fully saved.
        start_epoch = ckpt['epoch'] + 1
        epoch_losses = ckpt.get('epoch_losses', [])
        epoch_ppls = ckpt.get('epoch_ppls', [])
        epoch_accs = ckpt.get('epoch_accs', [])
        print(f"Resuming from epoch {start_epoch}")
    # Training loop
    model.train()
    for epoch in range(start_epoch, EPOCHS + 1):
        total_loss, total_masked, correct_masked = 0.0, 0, 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False)
        for batch in progress:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()
            # Accumulate
            step_loss = loss.item()
            total_loss += step_loss
            preds = torch.argmax(logits, dim=-1)
            # Accuracy is measured only where the label is not -100.
            mask = labels.ne(-100)
            correct_masked += preds.eq(labels).masked_select(mask).sum().item()
            total_masked += mask.sum().item()
            progress.set_postfix({'loss': f"{step_loss:.4f}"})
        # Epoch metrics
        avg_loss = total_loss / len(train_loader)
        # Perplexity is the exponential of the mean batch loss.
        avg_ppl = math.exp(avg_loss)
        avg_acc = correct_masked / total_masked if total_masked > 0 else 0
        epoch_losses.append(avg_loss)
        epoch_ppls.append(avg_ppl)
        epoch_accs.append(avg_acc)
        print(f"Epoch {epoch}: Loss={avg_loss:.4f}, PPL={avg_ppl:.2f}, MLM Acc={avg_acc:.4%}")
        # Save checkpoint
        # The file is overwritten after every epoch, so a restart resumes
        # from the most recently completed epoch.
        ckpt = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'epoch_losses': epoch_losses,
            'epoch_ppls': epoch_ppls,
            'epoch_accs': epoch_accs,
        }
        torch.save(ckpt, CHECKPOINT_PATH)
    # Final plots
    # One stacked panel per metric, sharing the epoch axis; the x values
    # cover every recorded epoch, including those restored from a checkpoint.
    fig, axes = plt.subplots(3, 1, figsize=(6, 10), sharex=True)
    epochs_list = list(range(1, len(epoch_losses) + 1))
    axes[0].plot(epochs_list, epoch_losses, marker='o'); axes[0].set_ylabel('Loss'); axes[0].set_title('Training Loss'); axes[0].grid(True)
    axes[1].plot(epochs_list, epoch_ppls, marker='o'); axes[1].set_ylabel('Perplexity'); axes[1].set_title('Training Perplexity'); axes[1].grid(True)
    axes[2].plot(epochs_list, epoch_accs, marker='o'); axes[2].set_ylabel('MLM Accuracy'); axes[2].set_xlabel('Epoch'); axes[2].set_title('Masked LM Accuracy'); axes[2].grid(True)
    plt.tight_layout(); plt.show()
    # Save LoRA adapter
    model.save_pretrained(ADAPTER_SAVE_DIR)
    print(f"LoRA adapter saved to {ADAPTER_SAVE_DIR}")
# Run the fine-tuning with its default dataset when executed as a script.
if __name__ == '__main__':
    fine_tune_lora()

images/methodology_flowchart.png ADDED Viewed

Git LFS Details

SHA256: e3aafb2e0d0a78a75fdeb34deceabb946a6f08cba7ab861aba18eb10a7003639
Pointer size: 131 Bytes
Size of remote file: 239 kB

images/test.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

main.py ADDED Viewed

	@@ -0,0 +1,132 @@

+#from models import *
+#from preprocess_data import *
+from utils import extract_sentences_by_intent, train_model_with_chkpt, batch_predict_and_save
+from time import time
+import logging
+from config import *
# Script entry point: builds the sentence-extraction model and writes batch
# predictions for the test set. The commented-out sections are alternative
# workflows (training, single-sample demo, intent extraction).
if __name__ == '__main__':
    # train_model_with_chkpt(epochs=5, batch_size=16, lr=2e-3,
    #             save_model=True,
    #             save_path='gnn_model_checkpoint.pt',
    #             resume=True)
    from preprocess_data import SentenceDataset
    from models import SentenceExtractionModel
    from utils import train_sentence_extractor
    # AutoTokenizer, MODEL_NAME and DEVICE come from `from config import *`.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    dataset = SentenceDataset("data/Fin_ExBERT_train_val_data.xlsx", tokenizer)
    model = SentenceExtractionModel(
        base_model_name=MODEL_NAME,
        backbone='finexbert'
    )
    # train_sentence_extractor(
    #     model,
    #     dataset,
    #     output_dir="checkpoints/sentence_extractor",
    #     val_split=0.3,
    #     epochs=10,
    #     batch_size=16,
    #     lr=3e-4,
    #     device=DEVICE,
    #     unfreeze_after_epoch=4
    # )
    #
    # from transformers import BertTokenizer
    # from models import SentenceExtractionModel
    # from utils import demo_on_random_val
    #
    # tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # model = SentenceExtractionModel(
    #     base_model_name=MODEL_NAME,
    #     backbone='finexbert'
    # )
    #
    # demo_on_random_val(
    #     model,
    #     tokenizer,
    #     excel_path="data/Fin_ExBERT_test_set.xlsx",
    #     ckpt_path="checkpoints/sentence_extractor/best_model.pth",
    #     device="cuda",  # or "cpu"
    #     temperature=1,
    # )
    batch_predict_and_save(
        model,
        tokenizer,
        excel_path="data/Fin_ExBERT_test_set.xlsx",
        ckpt_path="checkpoints/sentence_extractor/best_model.pth",
        output_path="results/predictions_sample200.xlsx",
        n_samples=200,
        temperature=1.0,
        # Follow the configured device instead of hard-coding "cuda", which
        # fails on CPU-only machines. str() keeps the argument a plain string
        # ("cuda" / "cpu"), exactly the type that was passed before.
        device=str(DEVICE)
    )
    # Example transcripts for the (commented-out) intent-extraction demo below.
    sample_transcript = """
    Agent: Hello, thank you for calling Acme Financial Services. My name is Priya. How can I help you today?
    Customer: Hi Priya, I’m considering opening a new savings account with you.
    Agent: Absolutely—our savings account offers 4% interest per annum. Do you have a balance in mind?
    Customer: Yes, I’d like to deposit ₹50,000 initially, and then I’m interested in investing another ₹2 lakh in mutual funds over the next month.
    Agent: Great, we have several mutual fund options. Are you more growth-oriented or looking for steady income?
    Customer: I want to focus on growth. Also, could you tell me about your home loan rates? I may need a ₹30 lakh mortgage in the next six months.
    Agent: Certainly—we currently offer home loan rates starting at 6.8%. Do you already own property or are you planning to buy?
    Customer: Planning to buy. Finally, I’d like to apply for a credit card with a high cashback—maybe one that gives 2% on all spends.
    Agent: We have a Platinum Cashback Card at 1.5%, and our Signature Cashback Card at 2%. Would you like me to initiate the application?
    Customer: Yes please, go ahead with the Signature Cashback Card, and send me the home-loan documents via email.
    Agent: Done. You’ll receive an email shortly. Is there anything else I can help you with?
    Customer: No, that’s all for today—thank you!
    """
    complex_transcript = """
    Agent: Good morning, thank you for calling Maple Grove Bank. This is Rahul speaking—how may I assist you today?
    Customer: Hi Rahul, I’ve been reviewing my financial goals for the next five years and want to discuss a mix of savings, investments, and insurance.
    Agent: Absolutely. Would you like to start with your current cash savings or jump straight into investment products?
    Customer: Let’s begin with savings: I’d like to open a high-yield savings account with at least ₹1 lakh to start, and then set up an automatic top-up of ₹10,000 each month.
    Agent: Great choice. We have our “Plus Savings” account at 4.2% APY. Next, investments—are you looking at mutual funds, stocks, or retirement plans?
    Customer: I’m particularly interested in tax-saving ELSS mutual funds and a more conservative retirement pension plan. Also, could you explain your term insurance offerings?
    Agent: Sure—our ELSS options include Fund A (equity-heavy) and Fund B (balanced). For term cover, we have 20-year plans up to ₹50 lakhs. Any preference?
    Customer: I want a balanced ELSS with a 3-year lock-in, and term insurance of ₹30 lakhs for 25 years. After that, I may need advice on buying a second home—so let’s also discuss mortgage pre-approval.
    Agent: Understood. For a ₹30 lakh home loan, current interest rates start at 6.9%. We can pre-approve you based on your income. Shall I proceed?
    Customer: Yes, please initiate the home-loan pre-approval. And lastly, I’d like to apply for a debit card with no annual fee and a co-branded credit card offering travel rewards.
    Agent: Certainly—our “Freedom” debit card has no fee, and the “SkyMiles” credit card gives 2 airline miles per ₹100 spent. Would you like to complete those applications now?
    Customer: Yes, go ahead with both. Also, can you set up a quarterly portfolio review call with a financial advisor?
    Agent: Absolutely. I’ll schedule a review every three months starting next quarter. You’ll get email confirmations shortly.
    Customer: Perfect—that covers all my needs. Thanks for your help!
    Agent: My pleasure! Have a great day and feel free to call back anytime.
    """
    # premise_input = "personA is on the stage giving a speech."
    # hypothesis_input = "personA is using a microphone."
    # prediction, _ = predict_fin_nli(premise=premise_input, hypothesis=hypothesis_input, model_path='gnn_model_checkpoint.pt')
    # print("Prediction:", prediction)
    # print('Final layer logits:', _)
    ################################
    # start = time()
    # results = extract_sentences_by_intent(
    #     complex_transcript,
    #     intent="customer tells about own financial condition",#"customer states specific financial product requests and planning preferences",
    #     #"agent provides assistance", #"customer states their financial needs",
    #     threshold=0.60,
    #     top_k=10,
    #     convo_focus='customer'
    # )
    # end = time()
    #
    # logging.info('Prediction Done in {:.2f}sec'.format(end - start))
    #
    # for sentence, score in results:
    #     print(f"{score:.2f} → {sentence}")

models.py ADDED Viewed

	@@ -0,0 +1,251 @@

+import torch
+import os
+import math
+import torch.nn as nn
+import torch.nn.functional as F
+from peft import PeftModel, LoraConfig, get_peft_model
+from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup
+from torch.nn import MultiheadAttention, GELU
# Module-level defaults. In the code visible here only MODEL_NAME and DEVICE
# are referenced (by SentenceExtractionModel and GraphAugmentedNLIModel).
MODEL_NAME = "bert-base-uncased"
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
EPOCHS = 5
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PREPROCESSED_DIR = "preprocessed_snli"
MIXED_PRECISION = "fp16"
class SimpleGNN(nn.Module):
    """Single message-passing layer: neighbour averaging, linear map, ReLU."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)

    def forward(self, node_embeddings, edges):
        """Aggregate neighbour embeddings along ``edges`` and project them.

        Args:
            node_embeddings: Tensor with one row per graph node.
            edges: Iterable of ``(src, dst)`` pairs; a pair whose endpoint is
                not below the node count is ignored.

        Returns:
            One projected row per node, or a single all-zero row of width
            ``self.fc.out_features`` when there are no nodes.
        """
        device = node_embeddings.device
        node_count = node_embeddings.size(0)
        if node_count == 0:
            return torch.zeros(1, self.fc.out_features, device=device)

        # Dense 0/1 adjacency matrix; row = source node, column = target node.
        adjacency = torch.zeros((node_count, node_count), device=device)
        for src, dst in edges:
            if src < node_count and dst < node_count:
                adjacency[src, dst] = 1.0

        # Row-normalise; the epsilon keeps rows without edges at zero instead
        # of dividing by zero.
        out_degree = adjacency.sum(dim=1, keepdim=True) + 1e-10
        neighbour_mean = (adjacency / out_degree) @ node_embeddings
        projected = self.fc(neighbour_mean)
        return F.relu(projected)
class GraphAugmentedNLIModel(nn.Module):
    """BERT classifier whose [CLS] vector is fused with two graph summaries.

    For each example the premise and hypothesis dependency graphs are encoded
    by separate ``SimpleGNN`` layers, mean-pooled over nodes and concatenated
    with the first-token embedding before the final linear classifier.
    """

    def __init__(self, base_model_name, num_labels=3, hidden_dim=768, gnn_dim=128):
        super().__init__()
        config = AutoConfig.from_pretrained(base_model_name)
        config.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        self.dropout = nn.Dropout(0.1)
        self.gnn_premise = SimpleGNN(hidden_dim, gnn_dim)
        self.gnn_hypothesis = SimpleGNN(hidden_dim, gnn_dim)
        self.classifier = nn.Linear(hidden_dim + gnn_dim*2, num_labels)

    @staticmethod
    def _graph_summary(gnn, token_states, node_indices, edges):
        """Return the mean-pooled GNN output for one graph as a [1, gnn_dim] row."""
        if len(node_indices) > 0:
            nodes = token_states[node_indices]
        else:
            nodes = token_states.new_empty((0, token_states.size(-1)))
        # The GNN itself returns a single zero row of the right width, on the
        # activations' device, when there are no nodes. The previous fallback
        # hard-coded width 128 and the module-level DEVICE, which broke for
        # gnn_dim != 128 or a model living on another device.
        return gnn(nodes, edges).mean(dim=0, keepdim=True)

    def forward(self, input_ids, attention_mask, premise_graph_tokens, premise_graph_edges, premise_node_indices,
                hypothesis_graph_tokens, hypothesis_graph_edges, hypothesis_node_indices, labels=None):
        """Classify a batch of premise/hypothesis pairs.

        Args:
            input_ids, attention_mask: Encoded pair, passed straight to BERT.
            premise_graph_tokens, hypothesis_graph_tokens: Accepted for
                interface compatibility; not used in the computation.
            premise_graph_edges, hypothesis_graph_edges: Per-example lists of
                ``(src, dst)`` node pairs.
            premise_node_indices, hypothesis_node_indices: Per-example
                positions in ``input_ids`` whose hidden states act as nodes.
            labels: Optional class indices; when given, cross-entropy loss is
                computed.

        Returns:
            Dict with ``"loss"`` (``None`` without labels) and ``"logits"``.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        cls_embedding = hidden_states[:, 0, :]  # [batch, hidden_dim]

        gnn_p_outputs = []
        gnn_h_outputs = []
        # Node indices are precomputed, so node embeddings are gathered
        # directly from the token-level hidden states of each example.
        for i in range(input_ids.size(0)):
            instance_hidden = hidden_states[i]  # [seq_len, hidden_dim]
            gnn_p_outputs.append(self._graph_summary(
                self.gnn_premise, instance_hidden,
                premise_node_indices[i], premise_graph_edges[i]))
            gnn_h_outputs.append(self._graph_summary(
                self.gnn_hypothesis, instance_hidden,
                hypothesis_node_indices[i], hypothesis_graph_edges[i]))

        gnn_p_outputs = torch.cat(gnn_p_outputs, dim=0)  # [batch, gnn_dim]
        gnn_h_outputs = torch.cat(gnn_h_outputs, dim=0)  # [batch, gnn_dim]
        fused = torch.cat([cls_embedding, gnn_p_outputs, gnn_h_outputs], dim=-1)
        fused = self.dropout(fused)
        logits = self.classifier(fused)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}
class SimpleFinGNN(nn.Module):
    """Mean-aggregation graph layer followed by a linear projection and ReLU."""

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc = nn.Linear(input_dim, hidden_dim)

    def _normalised_adjacency(self, num_nodes, edges, device):
        """Build the row-normalised dense adjacency matrix for ``edges``."""
        adj = torch.zeros((num_nodes, num_nodes), device=device)
        for (src, dst) in edges:
            # Edges that reference a node index at or beyond num_nodes are skipped.
            if not (src < num_nodes and dst < num_nodes):
                continue
            adj[src, dst] = 1.0
        # Epsilon avoids a division by zero for nodes without outgoing edges.
        return adj / (adj.sum(dim=1, keepdim=True) + 1e-10)

    def forward(self, node_embeddings, edges):
        """Return one activated row per node (a single zero row if no nodes)."""
        num_nodes = node_embeddings.size(0)
        if num_nodes == 0:
            return torch.zeros(1, self.fc.out_features, device=node_embeddings.device)
        adj_norm = self._normalised_adjacency(num_nodes, edges, node_embeddings.device)
        return F.relu(self.fc(adj_norm @ node_embeddings))
class GraphAugmentedFinNLIModel(nn.Module):
    """Graph-augmented NLI classifier with a keyword-friendly ``forward``.

    It keeps the architecture of a BERT [CLS] vector fused with mean-pooled
    premise and hypothesis graph encodings, but accepts ``inputs_embeds`` and
    extra keyword arguments, and exposes ``self.config``.
    """

    def __init__(self, base_model_name, num_labels=3, hidden_dim=768, gnn_dim=128):
        super().__init__()
        config = AutoConfig.from_pretrained(base_model_name)
        config.num_labels = num_labels
        self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        # NOTE(review): this dropout layer is created but never applied in
        # forward() — confirm whether that is intentional.
        self.dropout = nn.Dropout(0.1)
        self.gnn_premise = SimpleGNN(hidden_dim, gnn_dim)
        self.gnn_hypothesis = SimpleGNN(hidden_dim, gnn_dim)
        self.classifier = nn.Linear(hidden_dim + gnn_dim*2, num_labels)
        self.config = self.bert.config
        self.config.num_labels = num_labels

    @staticmethod
    def _graph_summary(gnn, token_states, node_indices, edges):
        """Return the mean-pooled GNN output for one graph as a [1, gnn_dim] row."""
        if len(node_indices) > 0:
            nodes = token_states[node_indices]
        else:
            nodes = token_states.new_empty((0, token_states.size(-1)))
        # The GNN already yields one zero row of width gnn_dim for an empty
        # graph; the previous fallback hard-coded 128 and therefore produced
        # a shape mismatch whenever gnn_dim != 128.
        return gnn(nodes, edges).mean(dim=0, keepdim=True)

    def forward(self,
            input_ids=None,
            attention_mask=None,
            premise_graph_tokens=None,
            hypothesis_graph_tokens=None,
            premise_graph_edges=None,
            hypothesis_graph_edges=None,
            premise_node_indices=None,
            hypothesis_node_indices=None,
            labels=None,
            inputs_embeds=None,
            **kwargs):
        """Classify a batch of premise/hypothesis pairs.

        Args:
            input_ids, attention_mask, inputs_embeds: Forwarded to BERT.
            premise_graph_tokens, hypothesis_graph_tokens: Accepted for
                interface compatibility; not used in the computation.
            premise_graph_edges, hypothesis_graph_edges: Per-example lists of
                ``(src, dst)`` node pairs.
            premise_node_indices, hypothesis_node_indices: Per-example
                positions whose hidden states act as graph nodes.
            labels: Optional class indices; enables the cross-entropy loss.
            **kwargs: Extra arguments; only those named in BERT's ``forward``
                are passed on, the rest are dropped.

        Returns:
            Dict with ``"loss"`` (``None`` without labels) and ``"logits"``.
        """
        # Forward only the keyword arguments BERT's forward() knows about.
        bert_arg_names = self.bert.forward.__code__.co_varnames
        bert_kwargs = {k: v for k, v in kwargs.items() if k in bert_arg_names}
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            inputs_embeds=inputs_embeds,
                            **bert_kwargs)
        hidden_states = outputs.last_hidden_state
        cls_embedding = hidden_states[:, 0, :]  # [batch, hidden_dim]
        batch_size = input_ids.size(0) if input_ids is not None else hidden_states.size(0)

        gnn_p_outputs = []
        gnn_h_outputs = []
        for i in range(batch_size):
            instance_hidden = hidden_states[i]  # [seq_len, hidden_dim]
            gnn_p_outputs.append(self._graph_summary(
                self.gnn_premise, instance_hidden,
                premise_node_indices[i], premise_graph_edges[i]))
            gnn_h_outputs.append(self._graph_summary(
                self.gnn_hypothesis, instance_hidden,
                hypothesis_node_indices[i], hypothesis_graph_edges[i]))

        gnn_p_outputs = torch.cat(gnn_p_outputs, dim=0)  # [batch, gnn_dim]
        gnn_h_outputs = torch.cat(gnn_h_outputs, dim=0)  # [batch, gnn_dim]
        fused = torch.cat([cls_embedding, gnn_p_outputs, gnn_h_outputs], dim=-1)
        logits = self.classifier(fused)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}
class SentenceExtractionModel(nn.Module):
    """Binary sentence classifier: encoder pooled output -> dropout -> linear.

    ``forward`` returns one raw logit per input sentence.
    """

    def __init__(self,
                 base_model_name: str,
                 dropout_prob: float = 0.1,
                 adapter_dir: str = "./lora_finance_adapter",
                 backbone: str = 'default',
                 init_pos_frac: float = None    # NEW!
    ):
        """
        Args:
            base_model_name: Hugging Face checkpoint of the encoder.
            dropout_prob: Dropout applied to the pooled output.
            adapter_dir: Directory holding ``training_checkpoint.pt``; only
                read for the 'finexbert' backbone.
            backbone:
              - 'default' → plain AutoModel.from_pretrained(base_model_name)
              - 'finexbert' → the same encoder wrapped with LoRA and
                initialised from the fine-tuning checkpoint in adapter_dir
            init_pos_frac: Optional positive-class fraction in (0, 1); when
                given, the classifier bias starts at its log-odds.

        Raises:
            ValueError: Unknown backbone, or init_pos_frac outside (0, 1).
            FileNotFoundError: 'finexbert' requested but no checkpoint found.
        """
        super().__init__()
        if init_pos_frac is not None and not 0.0 < init_pos_frac < 1.0:
            raise ValueError(
                f"init_pos_frac must lie strictly between 0 and 1, got {init_pos_frac}")

        # load config
        config = AutoConfig.from_pretrained(base_model_name)
        if backbone == 'default':
            # plain BERT
            self.bert = AutoModel.from_pretrained(base_model_name, config=config)
        elif backbone == 'finexbert':
            self.bert = self._load_lora_backbone(base_model_name, adapter_dir)
        else:
            raise ValueError(f"Unknown backbone {backbone}")

        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, 1)
        # initialize bias to log-odds of init_pos_frac
        if init_pos_frac is not None:
            b0 = float(math.log(init_pos_frac / (1.0 - init_pos_frac)))
            self.classifier.bias.data.fill_(b0)

    @staticmethod
    def _load_lora_backbone(base_model_name, adapter_dir):
        """Wrap the encoder with LoRA and load the fine-tuning checkpoint into it."""
        import warnings  # local import: this module does not import it at top level

        # Honour the requested checkpoint name; the module-level MODEL_NAME
        # was used here before, silently ignoring the constructor argument.
        base_model = AutoModel.from_pretrained(base_model_name).to(DEVICE)
        lora_cfg = LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            task_type="SEQ_CLS"#"CAUSAL_LM",  # must match your fine-tune setting
        )
        full = get_peft_model(base_model, lora_cfg).to(DEVICE)
        chkpt_path = os.path.join(adapter_dir, "training_checkpoint.pt")
        if not os.path.isfile(chkpt_path):
            raise FileNotFoundError(f"No LoRA checkpoint at {chkpt_path}")
        ckpt = torch.load(chkpt_path, map_location=DEVICE)
        state = ckpt["model_state_dict"]

        # NOTE(review): a checkpoint saved from a task-head model (e.g. a
        # masked-LM wrapper) presumably nests the encoder one level deeper
        # ("base_model.model.<prefix>.…") than the bare encoder wrapped here
        # ("base_model.model.…"). Without stripping that segment no key
        # matches and strict=False hides the failure. Confirm against the
        # actual checkpoint keys.
        encoder_prefix = getattr(base_model, "base_model_prefix", "")
        if encoder_prefix:
            nested = f"base_model.model.{encoder_prefix}."
            state = {
                ("base_model.model." + key[len(nested):] if key.startswith(nested) else key): value
                for key, value in state.items()
            }

        # strict=False: the checkpoint may contain head weights this encoder
        # does not have, and the encoder may have weights the checkpoint lacks.
        result = full.load_state_dict(state, strict=False)
        unexpected = set(result.unexpected_keys)
        if not any("lora_" in key for key in state if key not in unexpected):
            warnings.warn(
                f"No LoRA weights from {chkpt_path} matched the model; "
                "the backbone keeps its freshly initialised adapter.")
        return full.base_model

    def forward(self, input_ids, attention_mask):
        """Return one logit per sequence, shape [batch]."""
        outputs = self.bert(input_ids=input_ids,
                             attention_mask=attention_mask)
        x      = self.dropout(outputs.pooler_output)
        logits = self.classifier(x).squeeze(-1)   # [batch]
        return logits

preprocess_data.py ADDED Viewed

	@@ -0,0 +1,210 @@

+import os
+import logging
+import torch
+from torch.utils.data import Dataset
+from datasets import load_dataset, load_from_disk
+import pandas as pd
+import nltk
+from config import MODEL_NAME, MAX_LENGTH, OVERLAP, PREPROCESSED_DIR, tokenizer, nlp
+# =============================
+# Logging Setup
+# =============================
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+# =============================
+# One-Time Preprocessing
+# =============================
def process_data():
    """Preprocess SNLI once and cache the result in PREPROCESSED_DIR.

    Each example is tokenized as a premise/hypothesis pair and augmented with
    a spaCy dependency graph per sentence plus, for every graph, the
    word-piece positions that stand for its nodes. Nothing is recomputed when
    PREPROCESSED_DIR already exists.
    """
    if os.path.exists(PREPROCESSED_DIR):
        logging.info("Using existing preprocessed data at %s", PREPROCESSED_DIR)
        return

    logging.info("Preprocessing data... This may take a while.")
    # Load and filter SNLI
    snli = load_dataset("snli")
    snli = snli.filter(lambda x: x["label"] != -1)

    def build_dependency_graph(sentence):
        """Return (token texts, bidirectional head/child edges) for a sentence."""
        doc = nlp(sentence)
        tokens = [tok.text for tok in doc]
        edges = []
        for tok in doc:
            # The root is its own head; it contributes no edge.
            if tok.head.i != tok.i:
                edges.extend([(tok.i, tok.head.i), (tok.head.i, tok.i)])
        return tokens, edges

    def segment_bounds(wp_tokens):
        """Return ((start, stop), (start, stop)) word-piece ranges for premise and hypothesis."""
        sep_positions = [i for i, tok in enumerate(wp_tokens) if tok == tokenizer.sep_token]
        if len(sep_positions) < 2:
            # Unexpected layout: keep the historical "whole sequence" range.
            whole = (1, len(wp_tokens) - 1)
            return whole, whole
        first_sep, second_sep = sep_positions[0], sep_positions[1]
        return (1, first_sep), (first_sep + 1, second_sep)

    def align_tokens(spacy_tokens, wp_tokens, start, stop):
        """Map each spaCy token to the first word-piece of the next word in [start, stop)."""
        node_indices, wp_idx = [], start
        for _ in spacy_tokens:
            if wp_idx >= stop:
                break
            node_indices.append(wp_idx)
            wp_idx += 1
            # Skip continuation pieces so the next node starts a new word.
            while wp_idx < stop and wp_tokens[wp_idx].startswith("##"):
                wp_idx += 1
        return node_indices

    def preprocess(examples):
        premises = examples["premise"]
        hypotheses = examples["hypothesis"]
        labels = examples["label"]
        tokenized = tokenizer(premises, hypotheses,
                              truncation=True, padding="max_length",
                              max_length=MAX_LENGTH)
        tokenized["labels"] = labels
        p_tokens_list, p_edges_list, p_idx_list = [], [], []
        h_tokens_list, h_edges_list, h_idx_list = [], [], []
        for p, h, input_ids in zip(premises, hypotheses, tokenized["input_ids"]):
            p_toks, p_edges = build_dependency_graph(p)
            h_toks, h_edges = build_dependency_graph(h)
            wp_tokens = tokenizer.convert_ids_to_tokens(input_ids)
            # Align each sentence inside its own segment. Previously both
            # alignments started at position 1, so hypothesis nodes pointed
            # at premise word-pieces and premise nodes could run past the
            # separator.
            p_bounds, h_bounds = segment_bounds(wp_tokens)
            p_idx = align_tokens(p_toks, wp_tokens, *p_bounds)
            h_idx = align_tokens(h_toks, wp_tokens, *h_bounds)
            p_tokens_list.append(p_toks)
            p_edges_list.append(p_edges)
            p_idx_list.append(p_idx)
            h_tokens_list.append(h_toks)
            h_edges_list.append(h_edges)
            h_idx_list.append(h_idx)
        tokenized.update({
            "premise_graph_tokens": p_tokens_list,
            "premise_graph_edges": p_edges_list,
            "premise_node_indices": p_idx_list,
            "hypothesis_graph_tokens": h_tokens_list,
            "hypothesis_graph_edges": h_edges_list,
            "hypothesis_node_indices": h_idx_list,
        })
        return tokenized

    snli = snli.map(preprocess, batched=True)
    snli.save_to_disk(PREPROCESSED_DIR)
    logging.info(f"Preprocessing complete. Saved to {PREPROCESSED_DIR}")
def chunk_transcript(transcript_text, start_idx, end_idx, tokenizer):
    """Split a transcript into overlapping token windows with span labels.

    Args:
        transcript_text: Raw transcript string.
        start_idx: Character offset at which the answer span starts, or a
            negative value when there is no span.
        end_idx: Character offset at which the answer span ends (exclusive),
            or a negative value when there is no span.
        tokenizer: Callable returning ``input_ids`` and ``offset_mapping``.

    Returns:
        List of dicts with ``input_ids``, ``attention_mask``, ``start_label``,
        ``end_label`` (token positions inside the window, -1 when the span is
        not fully contained in it) and ``no_span_label`` (1 without a span).

    Raises:
        ValueError: If OVERLAP is not smaller than MAX_LENGTH.
    """
    stride = MAX_LENGTH - OVERLAP
    if stride <= 0:
        # A non-positive stride would never advance the window.
        raise ValueError(
            f"OVERLAP ({OVERLAP}) must be smaller than MAX_LENGTH ({MAX_LENGTH})")

    encoded = tokenizer(transcript_text,
                        return_offsets_mapping=True,
                        add_special_tokens=True,
                        return_tensors=None,
                        max_length=1024,
                        padding=False,
                        truncation=False)
    all_input_ids = encoded["input_ids"]
    all_offsets   = encoded["offset_mapping"]
    total_tokens  = len(all_input_ids)
    has_span      = start_idx >= 0 and end_idx >= 0

    chunks = []
    for window_start in range(0, total_tokens, stride):
        window_end = window_start + MAX_LENGTH
        chunk_ids = all_input_ids[window_start:window_end]
        chunk_offsets = all_offsets[window_start:window_end]

        start_token, end_token = -1, -1
        if has_span:
            for j, (off_s, off_e) in enumerate(chunk_offsets):
                if off_s <= start_idx < off_e:
                    start_token = j
                if off_s < end_idx <= off_e:
                    end_token = j
                    break
            # Keep the labels only when the whole span lies in this window.
            if not 0 <= start_token <= end_token:
                start_token, end_token = -1, -1
        no_span = 0 if start_token >= 0 else 1

        chunks.append({
            "input_ids": torch.tensor(chunk_ids, dtype=torch.long),
            "attention_mask": torch.ones(len(chunk_ids), dtype=torch.long),
            "start_label": start_token,
            "end_label": end_token,
            "no_span_label": no_span,
        })
        # Once a window reaches the end of the sequence, a further window
        # would only repeat tokens already covered by the overlap.
        if window_end >= total_tokens:
            break
    return chunks
class SpanExtractionChunkedDataset(Dataset):
    """Flat dataset of transcript chunks produced by ``chunk_transcript``."""

    def __init__(self, data):
        """Chunk every item of ``data`` and pool all chunks in one list.

        Each item is a mapping that may provide "transcript", "start_idx" and
        "end_idx"; missing keys default to "", -1 and -1.
        """
        self.samples = [
            chunk
            for record in data
            for chunk in chunk_transcript(
                record.get("transcript", ""),
                record.get("start_idx", -1),
                record.get("end_idx", -1),
                tokenizer,
            )
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]
def span_collate_fn(batch):
    """Collate chunk dicts into a batch, right-padding with zeros.

    Args:
        batch: Sequence of dicts with 1-D long tensors ``input_ids`` and
            ``attention_mask`` plus integer ``start_label``, ``end_label``
            and ``no_span_label``.

    Returns:
        Dict of long tensors: ``input_ids`` and ``attention_mask`` padded to
        the longest sample, and ``start_positions``, ``end_positions`` and
        ``no_span_label`` with one entry per sample.
    """
    longest = max(len(sample["input_ids"]) for sample in batch)

    def right_pad(sequence):
        # Append zeros up to the batch-wide maximum length.
        filler = torch.zeros(longest - len(sequence), dtype=torch.long)
        return torch.cat([sequence, filler])

    def label_column(key):
        return torch.tensor([sample[key] for sample in batch], dtype=torch.long)

    return {
        "input_ids": torch.stack([right_pad(sample["input_ids"]) for sample in batch]),
        "attention_mask": torch.stack([right_pad(sample["attention_mask"]) for sample in batch]),
        "start_positions": label_column("start_label"),
        "end_positions": label_column("end_label"),
        "no_span_label": label_column("no_span_label"),
    }
+nltk.download('punkt')
+nltk.download('punkt_tab')
class SentenceDataset(Dataset):
    """Sentence-level binary classification dataset built from an Excel sheet.

    Every row contributes one sample per sentence of its ``Claude_Call``
    transcript; a sentence is labelled 1 when it appears in the row's
    ``Sel_K`` collection and 0 otherwise.
    """

    def __init__(self,
                 excel_path: str,
                 tokenizer,
                 max_length: int = 128):
        """
        Args:
            excel_path: Workbook with the columns 'Claude_Call' and 'Sel_K'.
            tokenizer: Tokenizer exposing ``encode_plus``.
            max_length: Length every sentence is padded/truncated to.

        Raises:
            ValueError: If a 'Sel_K' cell is a string that is not a valid
                Python literal.
        """
        import ast  # local import: only needed to parse 'Sel_K' cells

        df = pd.read_excel(excel_path)
        self.samples = []
        for row_idx, row in df.iterrows():
            transcript     = str(row['Claude_Call'])
            gold_sentences = row['Sel_K']
            # A string cell holds the repr of a list. Parse it as a literal
            # only: spreadsheet content must never be executed as code.
            if isinstance(gold_sentences, str):
                try:
                    gold_sentences = ast.literal_eval(gold_sentences)
                except (ValueError, SyntaxError) as err:
                    raise ValueError(
                        f"Row {row_idx}: 'Sel_K' is not a valid Python literal") from err
            if isinstance(gold_sentences, (list, tuple, set, frozenset)):
                try:
                    # Hash-based lookup instead of a list scan per sentence.
                    gold_sentences = frozenset(gold_sentences)
                except TypeError:
                    pass  # unhashable entries: keep the original container
            elif not isinstance(gold_sentences, str):
                # Empty cells arrive as NaN; treat them as "no gold sentences".
                gold_sentences = frozenset()
            # split into sentences
            for sent in nltk.sent_tokenize(transcript):
                label = 1 if sent in gold_sentences else 0
                enc = tokenizer.encode_plus(
                    sent,
                    max_length=max_length,
                    padding='max_length',
                    truncation=True,
                    return_tensors='pt'
                )
                self.samples.append({
                    'input_ids':      enc['input_ids'].squeeze(0),
                    'attention_mask': enc['attention_mask'].squeeze(0),
                    'label':          torch.tensor(label, dtype=torch.float)
                })

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

requirements.txt ADDED Viewed

	@@ -0,0 +1,26 @@

+torch>=1.9.0
+transformers>=4.20.0
+datasets>=2.0.0
+spacy>=3.4.0
+networkx>=2.8
+numpy>=1.21.0
+pandas>=1.3.0
+scikit-learn>=1.0.0
+tqdm>=4.62.0
+matplotlib>=3.5.0
+accelerate>=0.20.0
+peft>=0.4.0
+openpyxl>=3.0.0
+nltk>=3.7
+# requirements-dev.txt
+pytest>=6.0.0
+pytest-cov>=3.0.0
+black>=22.0.0
+isort>=5.10.0
+flake8>=4.0.0
+mypy>=0.950
+pre-commit>=2.15.0
+sphinx>=4.0.0
+sphinx-rtd-theme>=1.0.0
+jupyter>=1.0.0

results/Ablation.txt ADDED Viewed

	@@ -0,0 +1,46 @@

+2025-05-21 16:21:26,783 INFO Using existing preprocessed data at preprocessed_snli
+2025-05-21 16:21:28,008 INFO --- Training Baseline-BERT on 10% of data ---
+2025-05-21 16:33:27,656 INFO Baseline-BERT Epoch 1: train_loss=1.1084
+2025-05-21 16:45:29,223 INFO Baseline-BERT Epoch 2: train_loss=1.1015
+2025-05-21 16:57:31,263 INFO Baseline-BERT Epoch 3: train_loss=1.1008
+2025-05-21 17:09:33,145 INFO Baseline-BERT Epoch 4: train_loss=1.1021
+2025-05-21 17:21:33,012 INFO Baseline-BERT Epoch 5: train_loss=1.1034
+2025-05-21 17:21:43,717 INFO Baseline-BERT on 10% val → acc=0.3232, f1=0.1628
+2025-05-21 17:21:43,717 INFO --- Training GNN-Augmented on 10% of data ---
+2025-05-21 17:35:02,212 INFO GNN-Augmented Epoch 1: train_loss=1.1044
+2025-05-21 17:48:21,592 INFO GNN-Augmented Epoch 2: train_loss=1.1041
+2025-05-21 18:01:40,046 INFO GNN-Augmented Epoch 3: train_loss=1.1025
+2025-05-21 18:14:58,966 INFO GNN-Augmented Epoch 4: train_loss=1.1019
+2025-05-21 18:29:19,473 INFO GNN-Augmented Epoch 5: train_loss=1.1038
+2025-05-21 18:29:34,558 INFO GNN-Augmented on 10% val → acc=0.3232, f1=0.1628
+2025-05-21 16:21:26,783 INFO Using existing preprocessed data at preprocessed_snli
+2025-05-21 16:21:28,008 INFO --- Training Baseline-BERT on 10% of data ---
+2025-05-21 16:33:27,656 INFO Baseline-BERT Epoch 1: train_loss=1.1084
+2025-05-21 16:45:29,223 INFO Baseline-BERT Epoch 2: train_loss=1.1015
+2025-05-21 16:57:31,263 INFO Baseline-BERT Epoch 3: train_loss=1.1008
+2025-05-21 17:09:33,145 INFO Baseline-BERT Epoch 4: train_loss=1.1021
+2025-05-21 17:21:33,012 INFO Baseline-BERT Epoch 5: train_loss=1.1001
+2025-05-21 17:21:43,717 INFO Baseline-BERT on 10% val → acc=0.3232, f1=0.1628
+2025-05-21 17:21:43,717 INFO --- Training GNN-Augmented on 10% of data ---
+2025-05-21 17:35:02,212 INFO GNN-Augmented Epoch 1: train_loss=1.1044
+2025-05-21 17:48:21,592 INFO GNN-Augmented Epoch 2: train_loss=1.1011
+2025-05-21 18:01:40,046 INFO GNN-Augmented Epoch 3: train_loss=0.9025
+2025-05-21 18:14:58,966 INFO GNN-Augmented Epoch 4: train_loss=0.8319
+2025-05-21 18:29:19,473 INFO GNN-Augmented Epoch 5: train_loss=0.7638
+2025-05-21 18:29:34,558 INFO GNN-Augmented on 10% val → acc=0.6937, f1=0.4184
+▶︎ Per‐dataset judge averages:
+  FinQA-10K: Judge1=4.78, Judge2=4.46, Judge3=4.42, Combined=4.55
+  SQuAD: Judge1=5.00, Judge2=4.84, Judge3=4.52, Combined=4.79
+▶︎ Per‐dataset judge averages:
+  FinQA-10K: Judge1=4.96, Judge2=4.86, Judge3=4.68, Combined=4.84
+  SQuAD: Judge1=5.00, Judge2=4.94, Judge3=4.84, Combined=4.93

results/Fin-ExBERT.pptx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4ef24ff9e727c97ddca4cb52a818c11ef4b61667d76a516cbddc48da3b4d85b2
+size 14227310

results/ablation_study.png ADDED Viewed

results/combined_results.xlsx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a43f4de8658d50e234dd910d74b2b34f5e5d1fd0c910a2e0e830dd3360906a19
+size 127995

results/fine_tuning_results.png ADDED Viewed

results/methods_summary.xlsx ADDED Viewed

Binary file (10.7 kB). View file

utils.py ADDED Viewed

	@@ -0,0 +1,967 @@

+import os
+import logging
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from nltk import sent_tokenize
+from sklearn.metrics import accuracy_score, precision_score, f1_score
+from sklearn.model_selection import train_test_split
+from torch.utils.data import DataLoader, random_split, WeightedRandomSampler
+from transformers import AutoTokenizer, AutoModel, AutoConfig, get_linear_schedule_with_warmup
+from peft import PeftModel, LoraConfig, get_peft_model
+from datasets import load_dataset, DatasetDict, load_from_disk
+import spacy
+import re
+from tqdm.auto import tqdm
+from accelerate import Accelerator
+import matplotlib.pyplot as plt
+from torch.optim import AdamW
+import pandas as pd
+from typing import Optional, Tuple, List, Dict
+from models import GraphAugmentedNLIModel, GraphAugmentedFinNLIModel
+from preprocess_data import SpanExtractionChunkedDataset, process_data, chunk_transcript, span_collate_fn
+# =============================
+# Configuration Constants
+# =============================
+from config import MODEL_NAME, MAX_LENGTH, OVERLAP, PREPROCESSED_DIR, tokenizer, nlp
+# MODEL_NAME, MAX_LENGTH, OVERLAP and PREPROCESSED_DIR come from config (imported
+# above); the commented-out assignments below are their former local definitions.
+#MODEL_NAME = "bert-base-uncased"
+BATCH_SIZE = 16  # default `batch_size` of train_model
+#MAX_LENGTH = 128
+#OVERLAP = 32
+LEARNING_RATE = 2e-5  # default `lr` of train_model
+EPOCHS = 5  # default `epochs` of train_model
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+#PREPROCESSED_DIR = "preprocessed_snli"
+MIXED_PRECISION = "fp16"  # passed to Accelerator(mixed_precision=...) by the training functions
+# Class index -> NLI label name, used to decode predictions in predict_nli.
+label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
+# =============================
+# Logging & Reproducibility
+# =============================
+# Configure root logging once at import: INFO level, "timestamp LEVEL message" format.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+def set_seed(seed: int = 42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+# =============================
+# Tokenizer & NLP Model
+# =============================
+# NOTE(review): `tokenizer` and `nlp` are also imported from config above; these
+# assignments rebind both names for the rest of this module — confirm the
+# duplication is intentional.
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+nlp = spacy.load("en_core_web_sm")
+# =============================
+# Dependency Graph Helpers
+# =============================
+def build_dependency_graph(sentence: str):
+    doc = nlp(sentence)
+    tokens = [token.text for token in doc]
+    edges = []
+    for token in doc:
+        if token.head.i != token.i:
+            edges.append((token.i, token.head.i))
+            edges.append((token.head.i, token.i))
+    return tokens, edges
+# =============================
+# Token Alignment
+# =============================
+def align_tokens(spacy_tokens, wp_tokens):
+    node_indices = []
+    wp_idx = 1  # after [CLS]
+    for _ in spacy_tokens:
+        if wp_idx >= len(wp_tokens) - 1:
+            break
+        node_indices.append(wp_idx)
+        wp_idx += 1
+        while wp_idx < len(wp_tokens) - 1 and wp_tokens[wp_idx].startswith("##"):
+            wp_idx += 1
+    return node_indices
+# =============================
+# Data Collation
+# =============================
+def my_collate_fn(batch):
+    input_ids = [torch.tensor(ex["input_ids"], dtype=torch.long) for ex in batch]
+    attention_mask = [torch.tensor(ex["attention_mask"], dtype=torch.long) for ex in batch]
+    labels = [ex.get("labels", None) for ex in batch]
+    premise_graph_tokens = [ex.get("premise_graph_tokens") for ex in batch]
+    premise_graph_edges = [ex.get("premise_graph_edges") for ex in batch]
+    premise_node_indices = [ex.get("premise_node_indices") for ex in batch]
+    hypothesis_graph_tokens = [ex.get("hypothesis_graph_tokens") for ex in batch]
+    hypothesis_graph_edges = [ex.get("hypothesis_graph_edges") for ex in batch]
+    hypothesis_node_indices = [ex.get("hypothesis_node_indices") for ex in batch]
+    input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)
+    labels = torch.tensor(labels, dtype=torch.long) if labels and labels[0] is not None else None
+    return {
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "labels": labels,
+        "premise_graph_tokens": premise_graph_tokens,
+        "premise_graph_edges": premise_graph_edges,
+        "premise_node_indices": premise_node_indices,
+        "hypothesis_graph_tokens": hypothesis_graph_tokens,
+        "hypothesis_graph_edges": hypothesis_graph_edges,
+        "hypothesis_node_indices": hypothesis_node_indices,
+    }
+# =============================
+# Training Loop
+# =============================
+def train_model(epochs: int = EPOCHS,
+                batch_size: int = BATCH_SIZE,
+                lr: float = LEARNING_RATE,
+                save_model: bool = False,
+                save_path: str = 'gnn_model_weights_3.pt'):
+    """
+    Train a GraphAugmentedNLIModel on the dataset stored at PREPROCESSED_DIR.
+
+    Seeds the RNGs, runs process_data(), loads the "train" and "validation"
+    splits, and trains with AdamW plus a linear warmup schedule (warmup is
+    fixed at 1000 steps) under an Accelerator configured with MIXED_PRECISION.
+    After every epoch the validation loss is computed; when it improves and
+    `save_model` is True the model's state_dict is written to `save_path`.
+    Two matplotlib figures (step-wise and epoch-wise training loss) are shown
+    at the end.
+
+    Args:
+        epochs: Number of passes over the training split.
+        batch_size: Batch size for both loaders.
+        lr: AdamW learning rate.
+        save_model: Whether to persist the best-validation-loss weights.
+        save_path: Destination of the saved state_dict.
+
+    Returns:
+        The model object returned by accelerator.prepare, after training.
+    """
+    set_seed()
+    process_data()
+    logging.info("Loading preprocessed dataset...")
+    snli = load_from_disk(PREPROCESSED_DIR)
+    snli.set_format("python", output_all_columns=True)
+    train_loader = DataLoader(snli["train"], batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)
+    val_loader   = DataLoader(snli["validation"], batch_size=batch_size, collate_fn=my_collate_fn)
+    model = GraphAugmentedNLIModel(MODEL_NAME).to(DEVICE)
+    if hasattr(model.bert, 'gradient_checkpointing_enable'):
+        model.bert.gradient_checkpointing_enable()
+        logging.info("Enabled gradient checkpointing on BERT.")
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+    # The scheduler is stepped once per batch, so the horizon is counted in batches.
+    num_training_steps = epochs * len(train_loader)
+    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=num_training_steps)
+    accelerator = Accelerator(mixed_precision=MIXED_PRECISION)
+    model, optimizer, train_loader, val_loader, lr_scheduler = accelerator.prepare(
+        model, optimizer, train_loader, val_loader, lr_scheduler
+    )
+    model.train()
+    all_losses = []    # one entry per training step (first plot)
+    epoch_losses = []  # one mean per epoch (second plot)
+    best_val_loss = float('inf')
+    best_epoch = 0
+    for epoch in range(1, epochs + 1):
+        epoch_loss = []
+        progress = tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}", leave=False)
+        for batch in progress:
+            labels = batch["labels"].to(DEVICE) if batch.get("labels") is not None else None
+            outputs = model(
+                input_ids=batch["input_ids"].to(DEVICE),
+                attention_mask=batch["attention_mask"].to(DEVICE),
+                premise_graph_tokens=batch["premise_graph_tokens"],
+                premise_graph_edges=batch["premise_graph_edges"],
+                premise_node_indices=batch["premise_node_indices"],
+                hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                hypothesis_node_indices=batch["hypothesis_node_indices"],
+                labels=labels
+            )
+            # The model may return a dict holding "loss" or the loss itself.
+            loss = outputs.get("loss") if isinstance(outputs, dict) else outputs
+            optimizer.zero_grad()
+            accelerator.backward(loss)
+            optimizer.step()
+            lr_scheduler.step()
+            loss_val = loss.item()
+            epoch_loss.append(loss_val)
+            all_losses.append(loss_val)
+            progress.set_postfix({"loss": f"{loss_val:.4f}"})
+        avg_epoch_loss = np.mean(epoch_loss)
+        epoch_losses.append(avg_epoch_loss)
+        logging.info(f"Epoch {epoch} completed. Avg Loss: {avg_epoch_loss:.4f}")
+        # Validation
+        model.eval()
+        val_losses = []
+        with torch.no_grad():
+            for batch in val_loader:
+                labels = batch["labels"].to(DEVICE) if batch.get("labels") is not None else None
+                outputs = model(
+                    input_ids=batch["input_ids"].to(DEVICE),
+                    attention_mask=batch["attention_mask"].to(DEVICE),
+                    premise_graph_tokens=batch["premise_graph_tokens"],
+                    premise_graph_edges=batch["premise_graph_edges"],
+                    premise_node_indices=batch["premise_node_indices"],
+                    hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                    hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                    hypothesis_node_indices=batch["hypothesis_node_indices"],
+                    labels=labels
+                )
+                loss_item = outputs.get("loss").item() if isinstance(outputs, dict) else outputs.item()
+                val_losses.append(loss_item)
+        # An empty validation loader yields inf, so it can never count as an improvement.
+        avg_val_loss = np.mean(val_losses) if val_losses else float('inf')
+        logging.info(f"Validation Loss after Epoch {epoch}: {avg_val_loss:.4f}")
+        if avg_val_loss < best_val_loss:
+            best_val_loss = avg_val_loss
+            best_epoch = epoch
+            if save_model:
+                logging.info(f"Saving best model at epoch {epoch} with val loss {avg_val_loss:.4f}")
+                # Bare state_dict, not the {"model_state_dict": ...} layout that
+                # predict_nli reads from its checkpoint.
+                torch.save(model.state_dict(), save_path)
+        # Back to training mode for the next epoch.
+        model.train()
+    # Plot losses
+    plt.figure()
+    plt.plot(all_losses)
+    plt.xlabel('Training steps')
+    plt.ylabel('Loss')
+    plt.title('Step-wise Training Loss')
+    plt.show()
+    plt.figure()
+    plt.plot(range(1, epochs+1), epoch_losses, marker='o')
+    plt.xlabel('Epochs')
+    plt.ylabel('Loss')
+    plt.title('Epoch-wise Training Loss')
+    plt.show()
+    logging.info(f"Training complete. Best validation loss {best_val_loss:.4f} at epoch {best_epoch}.")
+    return model
+def predict_nli(premise, hypothesis, tokenizer=tokenizer, model_path='gnn_model_checkpoint.pt'):
+    # 1) instantiate the model exactly as you did during training
+    model = GraphAugmentedNLIModel(MODEL_NAME).to(DEVICE)
+    # 2) load the checkpoint, then hand only the model weights to load_state_dict
+    ckpt = torch.load(model_path, map_location=DEVICE)
+    model.load_state_dict(ckpt["model_state_dict"])
+    model.eval()
+    # 3) tokenize & build graphs (as before)…
+    encoded = tokenizer(
+        premise, hypothesis,
+        truncation=True,
+        padding="max_length",
+        max_length=MAX_LENGTH,
+        return_tensors="pt"
+    )
+    input_ids = encoded["input_ids"]
+    attention_mask = encoded["attention_mask"]
+    # Build dependency graphs
+    p_tokens, p_edges = build_dependency_graph(premise)
+    h_tokens, h_edges = build_dependency_graph(hypothesis)
+    # Convert ids back to tokens for alignment
+    wp_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
+    p_node_indices = align_tokens(p_tokens, wp_tokens)
+    h_node_indices = align_tokens(h_tokens, wp_tokens)
+    # Move tensors to the same device as the model
+    device = next(model.parameters()).device
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
+    # Prepare inputs for the model: the model expects lists for graph fields
+    # since we used a custom collate_fn logic.
+    premise_graph_tokens = [p_tokens]
+    premise_graph_edges = [p_edges]
+    premise_node_indices = [p_node_indices]
+    hypothesis_graph_tokens = [h_tokens]
+    hypothesis_graph_edges = [h_edges]
+    hypothesis_node_indices = [h_node_indices]
+    with torch.no_grad():
+        outputs = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            premise_graph_tokens=premise_graph_tokens,
+            premise_graph_edges=premise_graph_edges,
+            premise_node_indices=premise_node_indices,
+            hypothesis_graph_tokens=hypothesis_graph_tokens,
+            hypothesis_graph_edges=hypothesis_graph_edges,
+            hypothesis_node_indices=hypothesis_node_indices
+        )
+    logits = outputs["logits"]
+    probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
+    # Get predicted label
+    predicted_label_id = torch.argmax(logits, dim=-1).item()
+    predicted_label = label_map[predicted_label_id]
+    prob_map = dict()
+    for i, cls_label in label_map.items():
+        prob_map[cls_label] = probs[i]
+    return predicted_label, prob_map
+def predict_fin_nli(
+        premise: str,
+        hypothesis: str,
+        tokenizer=tokenizer,
+        model_path: str = 'gnn_model_checkpoint.pt',
+        adapter_dir: str = './lora_finance_adapter',
+) -> (str, list):
+    """
+    Score a premise/hypothesis pair with the LoRA-adapted GraphAugmentedFinNLIModel.
+
+    The base model is restored from `model_path`, wrapped with a LoRA config,
+    and then overlaid (non-strictly) with the weights stored in
+    `<adapter_dir>/training_checkpoint.pt`. Both checkpoints are reloaded on
+    every call.
+
+    Returns:
+        (label, scores): `label` is 'entailment' or 'contradiction';
+        `scores` is [entailment, contradiction] renormalised to sum to ~1
+        after discarding the neutral probability.
+
+    NOTE(review): the return annotation `(str, list)` is a tuple expression,
+    not a typing construct — consider `Tuple[str, list]`.
+    """
+    # 1) Load base GraphAugmentedFinNLIModel and its checkpoint
+    base_model = GraphAugmentedFinNLIModel(MODEL_NAME).to(DEVICE)
+    ckpt = torch.load(model_path, map_location=DEVICE)
+    base_model.load_state_dict(ckpt['model_state_dict'])
+    # 2) Wrap with the same LoRA config you used in training
+    lora_cfg = LoraConfig(
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.1,
+        bias='none',
+        task_type='SEQ_CLS',
+        target_modules=['query', 'value']
+    )
+    model = get_peft_model(base_model, lora_cfg).to(DEVICE)
+    # 3) Load your adapter checkpoint (the .pt under lora_finance_adapter/)
+    adapter_ckpt = torch.load(os.path.join(adapter_dir, 'training_checkpoint.pt'), map_location=DEVICE)
+    # This checkpoint contains the same 'model_state_dict' keys—so load it leniently:
+    # (strict=False: keys that do not match are not treated as an error)
+    model.load_state_dict(adapter_ckpt['model_state_dict'], strict=False)
+    model.eval()
+    # 4) Tokenize
+    enc = tokenizer(
+        premise, hypothesis,
+        truncation=True,
+        padding='max_length',
+        max_length=MAX_LENGTH,
+        return_tensors='pt'
+    )
+    input_ids = enc['input_ids'].to(DEVICE)
+    attention_mask = enc['attention_mask'].to(DEVICE)
+    # 5) Build & align your dependency graphs
+    p_toks, p_edges = build_dependency_graph(premise)
+    h_toks, h_edges = build_dependency_graph(hypothesis)
+    wp = tokenizer.convert_ids_to_tokens(input_ids[0])
+    p_idx = align_tokens(p_toks, wp)
+    # NOTE(review): align_tokens always starts at word-piece index 1, so the
+    # hypothesis indices also begin at the start of the encoded pair — confirm
+    # this matches how the training data was aligned.
+    h_idx = align_tokens(h_toks, wp)
+    # Wrap each graph field in a list: the model is called with a batch of one.
+    premise_graph_tokens = [p_toks]
+    premise_graph_edges = [p_edges]
+    premise_node_indices = [p_idx]
+    hypothesis_graph_tokens = [h_toks]
+    hypothesis_graph_edges = [h_edges]
+    hypothesis_node_indices = [h_idx]
+    # 6) Forward
+    with torch.no_grad():
+        out = model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            premise_graph_tokens=premise_graph_tokens,
+            premise_graph_edges=premise_graph_edges,
+            premise_node_indices=premise_node_indices,
+            hypothesis_graph_tokens=hypothesis_graph_tokens,
+            hypothesis_graph_edges=hypothesis_graph_edges,
+            hypothesis_node_indices=hypothesis_node_indices
+        )
+    logits = out['logits'][0]  # shape [3]
+    probs = torch.softmax(logits, dim=-1).cpu().numpy()
+    # 7) Collapse to entailment vs. contradiction (ignore neutral)
+    entail, neutral, contra = probs
+    # The small epsilon keeps the division defined if both probabilities are 0.
+    s = entail + contra + 1e-12
+    scores = [entail / s, contra / s]
+    # Ties go to 'entailment'.
+    label = 'entailment' if entail >= contra else 'contradiction'
+    return label, scores
+def train_model_with_chkpt(epochs: int = 5,
+                batch_size: int = 16,
+                lr: float = 2e-5,
+                save_model: bool = False,
+                save_path: str = 'gnn_model_checkpoint.pt',
+                resume: bool = False):
+    """
+    Train with mixed precision, gradient checkpointing, and resume support.
+    If resume=True and save_path exists, picks up from last epoch.
+
+    A checkpoint dict with keys "epoch", "model_state_dict",
+    "optimizer_state_dict" and "scheduler_state_dict" is written to
+    `save_path` after every epoch, overwriting the previous one.
+
+    Args:
+        epochs: Last epoch number to train (epochs are numbered from 1).
+        batch_size: Batch size for both loaders.
+        lr: AdamW learning rate.
+        save_model: NOTE(review): never read in this function — the
+            checkpoint is saved regardless of its value.
+        save_path: Checkpoint file to write, and to read when resuming.
+        resume: Restore model/optimizer/scheduler state from `save_path`
+            when that file exists.
+
+    Returns:
+        The model object returned by accelerator.prepare, after training.
+    """
+    set_seed()
+    process_data()
+    logging.info("Loading preprocessed dataset…")
+    snli = load_from_disk(PREPROCESSED_DIR)
+    snli.set_format("python", output_all_columns=True)
+    train_loader = DataLoader(snli["train"], batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)
+    val_loader   = DataLoader(snli["validation"], batch_size=batch_size, collate_fn=my_collate_fn)
+    model = GraphAugmentedNLIModel(MODEL_NAME).to(DEVICE)
+    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
+    # The scheduler is stepped once per batch; warmup is fixed at 1000 steps.
+    total_steps = epochs * len(train_loader)
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=total_steps)
+    # --- Resume checkpoint if requested ---
+    start_epoch = 1
+    if resume and os.path.isfile(save_path):
+        ckpt = torch.load(save_path, map_location=DEVICE)
+        model.load_state_dict(ckpt["model_state_dict"])
+        optimizer.load_state_dict(ckpt["optimizer_state_dict"])
+        scheduler.load_state_dict(ckpt["scheduler_state_dict"])
+        # The stored epoch is the last completed one; continue with the next.
+        start_epoch = ckpt.get("epoch", 1) + 1
+        logging.info(f"Resuming from epoch {start_epoch}")
+    # Gradient checkpointing and mixed precision setup
+    if hasattr(model.bert, "gradient_checkpointing_enable"):
+        model.bert.gradient_checkpointing_enable()
+        logging.info("Enabled gradient checkpointing on BERT.")
+    accelerator = Accelerator(mixed_precision=MIXED_PRECISION)
+    model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(
+        model, optimizer, train_loader, val_loader, scheduler
+    )
+    # NOTE(review): the best validation loss is not stored in the checkpoint,
+    # so after a resume it only reflects epochs run in this call.
+    best_val_loss = float("inf")
+    for epoch in range(start_epoch, epochs + 1):
+        model.train()
+        train_losses = []
+        for batch in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs}"):
+            optimizer.zero_grad()
+            outputs = model(
+                input_ids=batch["input_ids"].to(DEVICE),
+                attention_mask=batch["attention_mask"].to(DEVICE),
+                premise_graph_tokens=batch["premise_graph_tokens"],
+                premise_graph_edges=batch["premise_graph_edges"],
+                premise_node_indices=batch["premise_node_indices"],
+                hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                hypothesis_node_indices=batch["hypothesis_node_indices"],
+                labels=batch.get("labels", None).to(DEVICE) if batch.get("labels") is not None else None
+            )
+            # The model may return a dict holding "loss" or the loss itself.
+            loss = outputs["loss"] if isinstance(outputs, dict) else outputs
+            accelerator.backward(loss)
+            optimizer.step()
+            scheduler.step()
+            train_losses.append(loss.item())
+        avg_train = np.mean(train_losses)
+        logging.info(f"Epoch {epoch} train loss: {avg_train:.4f}")
+        # Validation
+        model.eval()
+        val_losses = []
+        with torch.no_grad():
+            for batch in val_loader:
+                outputs = model(
+                    input_ids=batch["input_ids"].to(DEVICE),
+                    attention_mask=batch["attention_mask"].to(DEVICE),
+                    premise_graph_tokens=batch["premise_graph_tokens"],
+                    premise_graph_edges=batch["premise_graph_edges"],
+                    premise_node_indices=batch["premise_node_indices"],
+                    hypothesis_graph_tokens=batch["hypothesis_graph_tokens"],
+                    hypothesis_graph_edges=batch["hypothesis_graph_edges"],
+                    hypothesis_node_indices=batch["hypothesis_node_indices"],
+                    labels=batch.get("labels", None).to(DEVICE) if batch.get("labels") is not None else None
+                )
+                v_loss = outputs["loss"].item() if isinstance(outputs, dict) else outputs.item()
+                val_losses.append(v_loss)
+        # An empty validation loader yields inf rather than a mean of nothing.
+        avg_val = np.mean(val_losses) if val_losses else float("inf")
+        logging.info(f"Epoch {epoch} val loss: {avg_val:.4f}")
+        # Save checkpoint (every epoch, whether or not validation improved)
+        ckpt = {
+            "epoch": epoch,
+            "model_state_dict": model.state_dict(),
+            "optimizer_state_dict": optimizer.state_dict(),
+            "scheduler_state_dict": scheduler.state_dict(),
+        }
+        torch.save(ckpt, save_path)
+        logging.info(f"Saved checkpoint: {save_path}")
+        # Tracked for the final log message only.
+        if avg_val < best_val_loss:
+            best_val_loss = avg_val
+    logging.info(f"Training complete. Best val loss: {best_val_loss:.4f}")
+    return model
+def extract_sentences_by_intent(
+    text: str,
+    intent: str,
+    adapter_dir: str = "./lora_finance_adapter",
+    threshold: float = 0.7,
+    top_k: int = None,
+    min_words: int = 4,
+    convo_focus: str = None
+):
+    """
+    Splits `text` into sentences, embeds them (and the `intent`) under your
+    LoRA‐adapted BERT, and returns those whose cosine similarity ≥ `threshold`.
+    Loads the adapter from the single `training_checkpoint.pt` in `adapter_dir`.
+    """
+    # 1) Sentence split & cleanup
+    # 1) Only consider lines spoken by the customer
+    if convo_focus is None:
+        sentences = [sent.text.strip() for sent in nlp(text).sents if sent.text.strip()]
+    elif convo_focus == "customer":
+        customer_lines = [
+            line.strip()
+            for line in text.splitlines()
+            if line.strip().lower().startswith("customer:")
+        ]
+        # 2) Sentence-split each customer line
+        sentences = []
+        for cust_line in customer_lines:
+            for sent in nlp(cust_line).sents:
+                s = sent.text.strip()
+                if s and len(s.split(' '))>6:
+                    sentences.append(s)
+    else:
+        customer_lines = [
+            line.strip()
+            for line in text.splitlines()
+            if line.strip().lower().startswith("agent:")
+        ]
+        # 2) Sentence-split each customer line
+        sentences = []
+        for cust_line in customer_lines:
+            for sent in nlp(cust_line).sents:
+                s = sent.text.strip()
+                if s and len(s.split(' '))>6:
+                    sentences.append(s)
+    # 2) Load base BERT + wrap in same LoRA config
+    base_model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
+    lora_cfg = LoraConfig(
+        r=8,
+        lora_alpha=32,
+        lora_dropout=0.1,
+        bias="none",
+        task_type="CAUSAL_LM",        # must match your fine-tune setting
+    )
+    model = get_peft_model(base_model, lora_cfg).to(DEVICE)
+    # 3) Load your adapter checkpoint
+    chkpt_path = os.path.join(adapter_dir, "training_checkpoint.pt")
+    if not os.path.isfile(chkpt_path):
+        raise FileNotFoundError(f"No LoRA checkpoint at {chkpt_path}")
+    ckpt = torch.load(chkpt_path, map_location=DEVICE)
+    # ckpt["model_state_dict"] contains both base + LoRA weights; strict=False
+    model.load_state_dict(ckpt["model_state_dict"], strict=False)
+    model.eval()
+    # helper: get [CLS] embedding under LoRA-BERT
+    def embed(text_str):
+        toks = tokenizer(
+            text_str,
+            truncation=True,
+            padding="longest",
+            return_tensors="pt"
+        ).to(DEVICE)
+        em_args = {
+            "input_ids": toks["input_ids"],
+            "attention_mask": toks["attention_mask"],
+        }
+        if "token_type_ids" in toks:
+            em_args["token_type_ids"] = toks["token_type_ids"]
+        # unwrap PEFT to call only the base BertModel
+        hf_model = getattr(model, "base_model", model)
+        with torch.no_grad():
+            last_hidden = hf_model(
+                input_ids=em_args["input_ids"],
+                attention_mask=em_args["attention_mask"],
+                **({"token_type_ids": em_args["token_type_ids"]} if "token_type_ids" in em_args else {})
+            ).last_hidden_state
+        return last_hidden[:, 0, :]
+    # now embed(intent) and each sentence using this safe helper
+    intent_emb = embed(intent)
+    results = []
+    with torch.no_grad():
+        for sent in sentences:
+            clean = re.sub(r'^(Agent|Customer):\s*', "", sent)
+            if len(clean.split()) < min_words:
+                continue
+            sent_emb = embed(clean)
+            sim = F.cosine_similarity(sent_emb, intent_emb, dim=1).item()
+            if sim >= threshold:
+                results.append((clean, sim))
+    # 5) sort & trim
+    results.sort(key=lambda x: x[1], reverse=True)
+    return results[:top_k] if top_k else results
+def train_sentence_extractor(
+    model: nn.Module,
+    dataset: torch.utils.data.Dataset,
+    output_dir: str,
+    val_split: float = 0.2,
+    epochs: int      = 3,
+    batch_size: int  = 16,
+    lr: float        = 2e-5,
+    device: str      = "cpu",
+    unfreeze_after_epoch: int = 1,
+    threshold: float = 0.5
+):
+    """
+    Fine-tune `model` on `dataset`, hold out `val_split` for val,
+    compute loss + acc + precision + F1 each epoch, save best checkpoint,
+    and plot all four metrics at the end.
+    """
+    # Split
+    total = len(dataset)
+    val_n = int(total * val_split)
+    train_n = total - val_n
+    train_ds, val_ds = random_split(dataset, [train_n, val_n])
+    # Oversample train
+    train_labels = [train_ds[i]['label'].item() for i in range(len(train_ds))]
+    counts = torch.bincount(torch.tensor(train_labels, dtype=torch.long))
+    weights = (1.0 / counts.float()).tolist()
+    sample_weights = [weights[int(l)] for l in train_labels]
+    sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
+    train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, drop_last=True)
+    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
+    model.to(device)
+    # initially freeze backbone
+    for p in model.bert.parameters(): p.requires_grad = False
+    optimizer = AdamW(model.parameters(), lr=lr)
+    total_steps = epochs * len(train_loader)
+    scheduler   = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=int(0.1 * total_steps),
+        num_training_steps=total_steps
+    )
+    criterion = nn.BCEWithLogitsLoss()
+    # storage for metrics
+    train_losses, val_losses = [], []
+    train_accs,  val_accs  = [], []
+    train_precs, val_precs = [], []
+    train_f1s,   val_f1s   = [], []
+    best_val_loss = float('inf')
+    for epoch in range(1, epochs+1):
+        # —— TRAIN ——
+        model.train()
+        epoch_loss = 0.0
+        preds, labels = [], []
+        for batch in tqdm(train_loader, desc=f"Train {epoch}/{epochs}"):
+            inputs = batch['input_ids'].to(device)
+            masks  = batch['attention_mask'].to(device)
+            labs   = batch['label'].to(device)
+            optimizer.zero_grad()
+            logits = model(inputs, masks)              # raw logits
+            loss   = criterion(logits, labs)
+            loss.backward()
+            optimizer.step()
+            scheduler.step()
+            epoch_loss += loss.item()
+            probs = torch.sigmoid(logits)
+            batch_preds = (probs >= threshold).long()
+            preds.extend(batch_preds.cpu().tolist())
+            labels.extend(labs.cpu().long().tolist())
+        avg_train = epoch_loss / len(train_loader)
+        train_losses.append(avg_train)
+        train_accs.append(  accuracy_score(labels, preds) )
+        train_precs.append( precision_score(labels, preds, zero_division=0) )
+        train_f1s.append(   f1_score(labels, preds, zero_division=0) )
+        print(f"→ Epoch {epoch} Train — loss {avg_train:.4f}, acc {train_accs[-1]:.4f}, prec {train_precs[-1]:.4f}, f1 {train_f1s[-1]:.4f}")
+        # unfreeze if needed
+        if epoch == unfreeze_after_epoch:
+            for p in model.bert.parameters(): p.requires_grad = True
+            optimizer = AdamW([
+                {"params": model.classifier.parameters(), "lr": 1e-3},
+                {"params": model.bert.parameters(),       "lr": 1e-5},
+            ], weight_decay=1e-2)
+            scheduler = get_linear_schedule_with_warmup(
+                optimizer,
+                num_warmup_steps=int(0.1 * total_steps),
+                num_training_steps=total_steps
+            )
+        # —— VALIDATION ——
+        model.eval()
+        epoch_loss = 0.0
+        preds, labels = [], []
+        with torch.no_grad():
+            for batch in tqdm(val_loader, desc=f" Val   {epoch}/{epochs}"):
+                inputs = batch['input_ids'].to(device)
+                masks  = batch['attention_mask'].to(device)
+                labs   = batch['label'].to(device)
+                logits = model(inputs, masks)
+                loss   = criterion(logits, labs)
+                epoch_loss += loss.item()
+                probs = torch.sigmoid(logits)
+                batch_preds = (probs >= threshold).long()
+                preds.extend(batch_preds.cpu().tolist())
+                labels.extend(labs.cpu().long().tolist())
+        avg_val = epoch_loss / len(val_loader)
+        val_losses.append(avg_val)
+        val_accs.append(  accuracy_score(labels, preds) )
+        val_precs.append( precision_score(labels, preds, zero_division=0) )
+        val_f1s.append(   f1_score(labels, preds, zero_division=0) )
+        print(f"→ Epoch {epoch}   Val — loss {avg_val:.4f}, acc {val_accs[-1]:.4f}, prec {val_precs[-1]:.4f}, f1 {val_f1s[-1]:.4f}")
+        # checkpoints
+        os.makedirs(output_dir, exist_ok=True)
+        ckpt = os.path.join(output_dir, f"epo{epoch}_val{avg_val:.4f}.pth")
+        torch.save(model.state_dict(), ckpt)
+        if avg_val < best_val_loss:
+            best_val_loss = avg_val
+            torch.save(model.state_dict(), os.path.join(output_dir, "best_model.pth"))
+            print(f"🎉 New best model saved (val loss {best_val_loss:.4f})")
+    print(f"✔️ Training complete — best val loss: {best_val_loss:.4f}")
+    # —— PLOT METRICS ——
+    epochs = list(range(1, epochs+1))
+    save_metric_plot(
+        epochs,
+        train_losses,
+        val_losses,
+        metric_name="Loss",
+        output_path="results/Loss_Plot.png"
+    )
+    save_metric_plot(
+        epochs,
+        train_accs,
+        val_accs,
+        metric_name="Accuracy",
+        output_path="results/Accuracy_Plot.png",
+        threshold=0.5
+    )
+    save_metric_plot(
+        epochs,
+        train_precs,
+        val_precs,
+        metric_name="Precision",
+        output_path="results/Precision_Plot.png",
+        threshold=0.5
+    )
+    save_metric_plot(
+        epochs,
+        train_f1s,
+        val_f1s,
+        metric_name="F1 Score",
+        output_path="results/F1Score_Plot.png",
+        threshold=0.5
+    )
+def save_metric_plot(
+    epochs,
+    train_vals,
+    val_vals,
+    metric_name: str,
+    output_path: str,
+    threshold: float = None
+):
+    """
+    epochs      – list of epoch indices
+    train_vals  – list of train metric values
+    val_vals    – list of validation metric values
+    metric_name – e.g. "Loss", "Accuracy", "Precision", "F1 Score"
+    output_path – where to save the PNG
+    threshold   – optional horizontal line to draw, e.g. 0.5
+    """
+    fig, ax = plt.subplots(figsize=(8, 5))
+    ax.plot(epochs, train_vals, marker='o', linewidth=2, label=f'Train {metric_name}')
+    ax.plot(epochs, val_vals,   marker='s', linewidth=2, label=f'Val {metric_name}')
+    if threshold is not None:
+        ax.axhline(threshold, color='gray', linestyle='--', linewidth=1, label=f'Threshold = {threshold}')
+    ax.set_title(f'{metric_name} over Epochs', fontsize=14, pad=10)
+    ax.set_xlabel('Epoch', fontsize=12)
+    ax.set_ylabel(metric_name, fontsize=12)
+    ax.grid(True, linestyle='--', alpha=0.4)
+    ax.legend(loc='best', frameon=True, fontsize=10)
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=300)
+    plt.close(fig)
+def demo_on_random_val(
+    model,
+    tokenizer,
+    excel_path: str,
+    ckpt_path: str,
+    max_length: int = 128,
+    device: str    = "cpu",
+    temperature: float = 1.0
+):
+    """
+    Like demo_on_random_val, but instead of a fixed threshold:
+      1) Compute sigmoid(logits / temperature) for each sentence
+      2) Sort probabilities descending
+      3) Find the largest gap between adjacent probs
+      4) Set dynamic_threshold = midpoint of that gap
+      5) Extract all sentences with prob >= dynamic_threshold
+    """
+    # load model
+    model.load_state_dict(torch.load(ckpt_path, map_location=device))
+    model.to(device).eval()
+    # sample one from validation split
+    df = pd.read_excel(excel_path)
+    _, val_df = train_test_split(df, test_size=0.2, random_state=42)
+    row = val_df.sample(n=1, random_state=random.randint(0,999)).iloc[0]
+    transcript = str(row['Claude_Call'])
+    print(f"\n── Transcript (val sample idx={row['idx']}):\n{transcript}\n")
+    # split into sentences & run inference
+    sentences, probs = [], []
+    for sent in sent_tokenize(transcript):
+        enc = tokenizer.encode_plus(
+            sent,
+            max_length=max_length,
+            padding='max_length',
+            truncation=True,
+            return_tensors='pt'
+        )
+        logits = model(enc['input_ids'].to(device),
+                       enc['attention_mask'].to(device))
+        prob   = torch.sigmoid(logits / temperature).item()
+        sentences.append(sent)
+        probs.append(prob)
+    # print all
+    print("Sentence probabilities:")
+    for s,p in zip(sentences, probs):
+        print(f"  → {p:.4f} → {s}")
+    # if no variation, fall back to 0.5
+    if len(probs) < 2 or max(probs) - min(probs) < 1e-3:
+        dynamic_thr = 0.5
+    else:
+        # find elbow in sorted probabilities
+        sorted_probs = sorted(probs, reverse=True)
+        diffs = [sorted_probs[i] - sorted_probs[i+1] for i in range(len(sorted_probs)-1)]
+        idx = max(range(len(diffs)), key=lambda i: diffs[i])
+        # threshold is midpoint between the two
+        dynamic_thr = (sorted_probs[idx] + sorted_probs[idx+1]) / 2.0
+    print(f"\nDynamic threshold = {dynamic_thr:.4f}\n")
+    print("Extracted sentences:")
+    for s,p in zip(sentences, probs):
+        if p >= dynamic_thr:
+            print(f"  • {p:.4f} → {s}")
+    print()
+def batch_predict_and_save(
+    model,
+    tokenizer,
+    excel_path: str,
+    ckpt_path: str,
+    output_path: str,
+    n_samples: int     = 40,
+    max_length: int    = 128,
+    device: str        = "cpu",
+    temperature: float = 1.0,
+    random_state: int  = None
+):
+    """
+    1) Loads best checkpoint
+    2) Samples `n_samples` rows
+    3) For each transcript:
+         - tokenize into sentences
+         - compute p = sigmoid(logits/temperature)
+         - compute elbow threshold on sorted p’s
+         - extract all sentences with p >= elbow
+         - if none, pick the highest-p sentence
+    4) Save new Excel with columns:
+         - 'Claude_Call'
+         - 'Predicted Sel_K' (list of extracted sentences)
+    """
+    # load model
+    model.load_state_dict(torch.load(ckpt_path, map_location=device))
+    model.to(device).eval()
+    # sample rows
+    df = pd.read_excel(excel_path)
+    sampled = df.sample(n=n_samples, random_state=random_state) \
+               if random_state is not None else df.sample(n=n_samples)
+    records = []
+    for _, row in tqdm(sampled.iterrows(),
+                       total=len(sampled),
+                       desc="Running Predictions"):
+        transcript = str(row['Claude_Call'])
+        sentences  = sent_tokenize(transcript)
+        # compute probabilities
+        probs = []
+        for sent in sentences:
+            enc = tokenizer.encode_plus(
+                sent,
+                max_length=max_length,
+                padding='max_length',
+                truncation=True,
+                return_tensors='pt'
+            )
+            with torch.no_grad():
+                logits = model(enc['input_ids'].to(device),
+                               enc['attention_mask'].to(device))
+                p = torch.sigmoid(logits / temperature).item()
+            probs.append(p)
+        # dynamic threshold via elbow detection
+        if len(probs) >= 2 and max(probs) - min(probs) > 1e-3:
+            sp = sorted(probs, reverse=True)
+            diffs = [sp[i] - sp[i+1] for i in range(len(sp)-1)]
+            idx  = max(range(len(diffs)), key=lambda i: diffs[i])
+            thr  = (sp[idx] + sp[idx+1]) / 2.0
+        else:
+            thr = 0.5  # fallback
+        # collect all above threshold, else top-1
+        extracted = [s for s,p in zip(sentences, probs) if p >= thr]
+        if not extracted and sentences:
+            best_idx = int(max(range(len(probs)), key=lambda i: probs[i]))
+            extracted = [sentences[best_idx]]
+        records.append({
+            'Claude_Call':     transcript,
+            'Predicted Sel_K': extracted
+        })
+    # save
+    out_df = pd.DataFrame(records)
+    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
+    out_df.to_excel(output_path, index=False)
+    print(f"➡️ Saved {len(out_df)} rows to {output_path}")