Upload 54 files
This view is limited to 50 files because it contains too many changes. See raw diff.
- UI/__pycache__/gradio.cpython-310.pyc +0 -0
- UI/__pycache__/gradio.cpython-311.pyc +0 -0
- UI/gradio.py +575 -0
- app.py +20 -0
- environment.yml +245 -0
- metrics/detectability.py +323 -0
- metrics/distortion.py +370 -0
- renderers/__pycache__/highlighter.cpython-310.pyc +0 -0
- renderers/__pycache__/highlighter.cpython-311.pyc +0 -0
- renderers/__pycache__/plot_3d.cpython-310.pyc +0 -0
- renderers/__pycache__/plot_3d.cpython-311.pyc +0 -0
- renderers/__pycache__/tree.cpython-310.pyc +0 -0
- renderers/__pycache__/tree.cpython-311.pyc +0 -0
- renderers/highlighter.py +185 -0
- renderers/plot_3d.py +126 -0
- renderers/tree.py +490 -0
- utils/__init__.py +5 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/config.cpython-310.pyc +0 -0
- utils/__pycache__/config.cpython-311.pyc +0 -0
- utils/__pycache__/entailment.cpython-310.pyc +0 -0
- utils/__pycache__/entailment.cpython-311.pyc +0 -0
- utils/__pycache__/masking_methods.cpython-310.pyc +0 -0
- utils/__pycache__/masking_methods.cpython-311.pyc +0 -0
- utils/__pycache__/ngram_index_manager.cpython-311.pyc +0 -0
- utils/__pycache__/non_melting_point.cpython-310.pyc +0 -0
- utils/__pycache__/non_melting_point.cpython-311.pyc +0 -0
- utils/__pycache__/paraphraser.cpython-310.pyc +0 -0
- utils/__pycache__/paraphraser.cpython-311.pyc +0 -0
- utils/__pycache__/sampling.cpython-310.pyc +0 -0
- utils/__pycache__/sampling.cpython-311.pyc +0 -0
- utils/__pycache__/watermark.cpython-310.pyc +0 -0
- utils/__pycache__/watermark.cpython-311.pyc +0 -0
- utils/config.py +18 -0
- utils/config.yaml +48 -0
- utils/entailment.py +107 -0
- utils/masking_methods.py +583 -0
- utils/non_melting_point.py +590 -0
- utils/old/masking/masking_methods.py +355 -0
- utils/old/masking/masking_methods_new_work.py +447 -0
- utils/old/masking/masking_methods_ok_working.py +257 -0
- utils/old/masking/masking_methods_v1_working.py +233 -0
- utils/old/masking_methods_final_copy.py +619 -0
- utils/old/non_melting_points_v1.py +244 -0
- utils/old/sampling/sampling.py +330 -0
- utils/old/sampling/sampling_methods.py +291 -0
- utils/old/sampling/sampling_methods_v1.py +146 -0
- utils/old/sampling/sampling_methods_v2.py +112 -0
- utils/old/sampling_final_copy.py +168 -0
UI/__pycache__/gradio.cpython-310.pyc
ADDED
Binary file (6.61 kB).
UI/__pycache__/gradio.cpython-311.pyc
ADDED
Binary file (29.8 kB).
UI/gradio.py
ADDED
@@ -0,0 +1,575 @@
import gradio as gr
from utils.watermark import Watermarker
from utils.config import load_config
from renderers.highlighter import highlight_common_words, highlight_common_words_dict, reparaphrased_sentences_html
from renderers.tree import generate_subplot1, generate_subplot2
from pathlib import Path
import time
from typing import Dict, List, Tuple, Any
import plotly.graph_objects as go

class WatermarkerInterface:
    def __init__(self, config):
        self.pipeline = Watermarker(config)
        self.common_grams = {}
        self.highlight_info = []
        self.masked_sentences = []

        # Add tracking dictionaries for indexing
        self.masked_sentence_indices = {}    # Maps original sentences to masked indices
        self.sampled_sentence_indices = {}   # Maps masked sentences to sampling indices
        self.reparaphrased_indices = {}      # Maps sampled sentences to reparaphrased indices

    def handle_paraphrase(self, prompt: str) -> Tuple[str, str, str, str]:
        """Wrapper for paraphrasing that includes highlighting"""
        start_time = time.time()

        # Run paraphrasing
        self.pipeline.Paraphrase(prompt)

        # Step 1: Process the original sentence first
        seen_ngrams = {}              # Stores first occurrence index of each n-gram
        original_indexed_ngrams = []  # Final indexed list for original

        original_sentence = self.pipeline.user_prompt
        original_ngrams = self.pipeline.common_grams.get(original_sentence, {})

        # Step 1.1: Extract n-grams and their first occurrence index
        ngram_occurrences = [
            (min(indices, key=lambda x: x[0])[0], gram)  # Get first index
            for gram, indices in original_ngrams.items()
        ]

        # Step 1.2: Sort n-grams based on their first occurrence
        ngram_occurrences.sort()

        # Step 1.3: Assign sequential indices
        for idx, (position, gram) in enumerate(ngram_occurrences, start=1):
            seen_ngrams[gram] = idx  # Assign sequential index
            original_indexed_ngrams.append((idx, gram))

        print("Original Indexed N-grams:", original_indexed_ngrams)

        # Generate highlight_info
        colors = ["red", "blue", "green", "orange"]
        highlight_info = [
            (ngram, colors[i % len(colors)])
            for i, (index, ngram) in enumerate(original_indexed_ngrams)
        ]
        common_grams = original_indexed_ngrams
        self.highlight_info = highlight_info
        self.common_grams = common_grams

        # Step 2: Process paraphrased sentences and match indices
        paraphrase_indexed_ngrams = {}

        for sentence in self.pipeline.paraphrased_sentences:
            sentence_ngrams = []  # Stores n-grams for this sentence
            sentence_ngrams_dict = self.pipeline.common_grams.get(sentence, {})

            for gram, indices in sentence_ngrams_dict.items():
                first_occurrence = min(indices, key=lambda x: x[0])[0]

                # Use the original's index if it exists, otherwise assign a new one
                if gram in seen_ngrams:
                    index = seen_ngrams[gram]  # Use the same index as original
                else:
                    index = len(seen_ngrams) + 1  # Assign new index
                    seen_ngrams[gram] = index     # Store it

                sentence_ngrams.append((index, gram))

            sentence_ngrams.sort()
            paraphrase_indexed_ngrams[sentence] = sentence_ngrams

        print("Paraphrase Indexed N-grams:", paraphrase_indexed_ngrams)

        # Step 3: Generate highlighted versions using the renderer
        highlighted_prompt = highlight_common_words(
            common_grams,
            [self.pipeline.user_prompt],
            "Original Prompt with Highlighted Common Sequences"
        )

        highlighted_accepted = highlight_common_words_dict(
            common_grams,
            self.pipeline.selected_sentences,
            "Accepted Paraphrased Sentences with Entailment Scores"
        )

        highlighted_discarded = highlight_common_words_dict(
            common_grams,
            self.pipeline.discarded_sentences,
            "Discarded Paraphrased Sentences with Entailment Scores"
        )

        execution_time = f"<div class='execution-time'>Step 1 completed in {time.time() - start_time:.2f} seconds</div>"

        return highlighted_prompt, highlighted_accepted, highlighted_discarded, execution_time

    def handle_masking(self):
        start_time = time.time()
        masking_results = self.pipeline.Masking()
        trees = []
        highlight_info = self.highlight_info
        common_grams = self.common_grams
        sentence_to_masked = {}
        self.masked_sentence_indices = {}

        for strategy, sentence_dict in masking_results.items():
            for sent, data in sentence_dict.items():
                if sent not in sentence_to_masked:
                    sentence_to_masked[sent] = []
                masked_sentence = data.get("masked_sentence", "")
                if masked_sentence:
                    sentence_to_masked[sent].append((masked_sentence, strategy))

        plot_idx = 1
        for original_sentence, masked_sentences_data in sentence_to_masked.items():
            if not masked_sentences_data:
                continue
            masked_idx = 1
            for masked_sentence, strategy in masked_sentences_data:
                index = f"{plot_idx}{masked_idx}"
                if original_sentence not in self.masked_sentence_indices:
                    self.masked_sentence_indices[original_sentence] = {}
                key = f"{strategy}_{masked_sentence}"
                self.masked_sentence_indices[original_sentence][key] = {
                    'index': index,
                    'strategy': strategy,
                    'masked_sentence': masked_sentence
                }
                masked_idx += 1

            masked_sentences = [ms[0] for ms in masked_sentences_data]
            indexed_masked_sentences = []
            verified_strategies = []
            for masked_sentence, strategy in masked_sentences_data:
                key = f"{strategy}_{masked_sentence}"
                entry = self.masked_sentence_indices[original_sentence][key]
                idx = entry['index']
                indexed_masked_sentences.append(f"[{idx}] {masked_sentence}")
                verified_strategies.append(entry['strategy'])

            try:
                fig = generate_subplot1(
                    original_sentence,
                    indexed_masked_sentences,
                    verified_strategies,
                    highlight_info,
                    common_grams
                )
                trees.append(fig)
            except Exception as e:
                print(f"Error generating plot: {e}")
                trees.append(go.Figure())
            plot_idx += 1

        while len(trees) < 10:
            trees.append(go.Figure())

        execution_time = f"<div class='execution-time'>Step 2 completed in {time.time() - start_time:.2f} seconds</div>"
        return trees[:10] + [execution_time]

    def handle_sampling(self) -> Tuple[List[go.Figure], str]:
        start_time = time.time()
        sampling_results = self.pipeline.Sampling()
        trees = []
        self.sampled_sentence_indices = {}
        organized_results = {}

        for sampling_strategy, masking_dict in sampling_results.items():
            for masking_strategy, sentences in masking_dict.items():
                for original_sentence, data in sentences.items():
                    if original_sentence not in organized_results:
                        organized_results[original_sentence] = {}
                    if masking_strategy not in organized_results[original_sentence]:
                        organized_results[original_sentence][masking_strategy] = {
                            "masked_sentence": data.get("masked_sentence", ""),
                            "sampled_sentences": {}
                        }
                    organized_results[original_sentence][masking_strategy]["sampled_sentences"][sampling_strategy] = data.get("sampled_sentence", "")

        plot_idx = 1
        for original_sentence, data in organized_results.items():
            masked_sentences = []
            all_sampled_sentences = []
            indexed_sampled_sentences = []
            masked_indices = self.masked_sentence_indices.get(original_sentence, {})

            for masking_strategy, masking_data in list(data.items())[:3]:
                masked_sentence = masking_data.get("masked_sentence", "")
                if masked_sentence:
                    masked_sentences.append(masked_sentence)
                    masked_idx = None
                    for ms_key, ms_data in masked_indices.items():
                        if ms_key == f"{masking_strategy}_{masked_sentence}":
                            masked_idx = ms_data['index']
                            break

                    if not masked_idx:
                        print(f"Warning: No index found for masked sentence: {masked_sentence}")
                        continue

                    sample_count = 1
                    for sampling_strategy, sampled_sentence in masking_data.get("sampled_sentences", {}).items():
                        if sampled_sentence:
                            sample_idx = f"{masked_idx}.{sample_count}"
                            if masked_sentence not in self.sampled_sentence_indices:
                                self.sampled_sentence_indices[masked_sentence] = {}
                            self.sampled_sentence_indices[masked_sentence][sampled_sentence] = {
                                'index': sample_idx,
                                'strategy': sampling_strategy
                            }
                            indexed_sampled_sentences.append(f"[{sample_idx}] {sampled_sentence}")
                            all_sampled_sentences.append(sampled_sentence)
                            sample_count += 1

            if masked_sentences:
                indexed_masked_sentences = []
                for ms in masked_sentences:
                    idx = ""
                    for ms_key, ms_data in masked_indices.items():
                        if ms_key.endswith(f"_{ms}"):
                            idx = ms_data['index']
                            break
                    indexed_masked_sentences.append(f"[{idx}] {ms}")

                try:
                    fig = generate_subplot2(
                        indexed_masked_sentences,
                        indexed_sampled_sentences,
                        self.highlight_info,
                        self.common_grams
                    )
                    trees.append(fig)
                except Exception as e:
                    print(f"Error generating subplot for {original_sentence}: {e}")
                    trees.append(go.Figure())
            plot_idx += 1

        print("Sampled sentence indices:", self.sampled_sentence_indices)

        while len(trees) < 10:
            trees.append(go.Figure())

        execution_time = f"<div class='execution-time'>Step 3 completed in {time.time() - start_time:.2f} seconds</div>"

        return trees[:10] + [execution_time]

    def handle_reparaphrasing(self) -> Tuple[List[str], str]:
        start_time = time.time()
        results = self.pipeline.re_paraphrasing()
        html_outputs = []
        self.reparaphrased_indices = {}
        tab_count = 1

        for sampling_strategy, masking_dict in results.items():
            for masking_strategy, sentences in masking_dict.items():
                for original_sent, data in sentences.items():
                    sampled_sentence = data.get("sampled_sentence", "")
                    if not sampled_sentence or not data["re_paraphrased_sentences"]:
                        continue

                    sampled_index = None
                    for masked_sent, sampled_dict in self.sampled_sentence_indices.items():
                        if sampled_sentence in sampled_dict:
                            sampled_index = sampled_dict[sampled_sentence]['index']
                            break

                    if not sampled_index:
                        sampled_index = "unknown"

                    indexed_reparaphrased = []
                    for i, rp_sent in enumerate(data["re_paraphrased_sentences"], 1):
                        rp_idx = f"{tab_count}.({sampled_index}).{i}"
                        if sampled_sentence not in self.reparaphrased_indices:
                            self.reparaphrased_indices[sampled_sentence] = {}
                        self.reparaphrased_indices[sampled_sentence][rp_sent] = rp_idx
                        indexed_reparaphrased.append(f"[{rp_idx}] {rp_sent}")

                    print(f"Reparaphrasing {tab_count}.({sampled_index}): {' '.join(sampled_sentence.split()[:5])}...")
                    html = reparaphrased_sentences_html(indexed_reparaphrased)
                    html_outputs.append(html)
                    tab_count += 1

        print("Reparaphrased indices:", self.reparaphrased_indices)

        while len(html_outputs) < 120:
            html_outputs.append("")

        execution_time = f"<div class='execution-time'>Step 4 completed in {time.time() - start_time:.2f} seconds</div>"

        return html_outputs[:120] + [execution_time]

def create_gradio_interface(config):
    """Creates the Gradio interface with the updated pipeline"""
    interface = WatermarkerInterface(config)

    with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
        # CSS to enable scrolling for reparaphrased sentences and sampling plots
        demo.css = """
        /* Set fixed height for the reparaphrased tabs container only */
        .gradio-container .tabs[id="reparaphrased-tabs"],
        .gradio-container .tabs[id="sampling-tabs"] {
            overflow-x: hidden;
            white-space: normal;
            border-radius: 8px;
            max-height: 600px; /* Set fixed height for the entire tabs component */
            overflow-y: auto;  /* Enable vertical scrolling inside the container */
        }

        /* Tab content styling for reparaphrased and sampling tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tabitem,
        .gradio-container .tabs[id="sampling-tabs"] .tabitem {
            overflow-x: hidden;
            white-space: normal;
            display: block;
            border-radius: 8px;
        }

        /* Make the tab navigation fixed at the top for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav {
            display: flex;
            overflow-x: auto;
            white-space: nowrap;
            scrollbar-width: thin;
            border-radius: 8px;
            scrollbar-color: #888 #f1f1f1;
            position: sticky;
            top: 0;
            background: white;
            z-index: 100;
        }

        /* Dropdown menu for scrollable tabs styling */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown {
            position: relative;
            display: inline-block;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content {
            display: none;
            position: absolute;
            background-color: #f9f9f9;
            min-width: 160px;
            box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
            z-index: 1;
            max-height: 300px;
            overflow-y: auto;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown:hover .tab-dropdown-content {
            display: block;
        }

        /* Scrollbar styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar {
            height: 8px;
            border-radius: 8px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-track,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-track {
            background: #f1f1f1;
            border-radius: 8px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav::-webkit-scrollbar-thumb,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav::-webkit-scrollbar-thumb {
            background: #888;
            border-radius: 8px;
        }

        /* Tab button styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-item,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-item {
            flex: 0 0 auto;
            border-radius: 8px;
        }

        /* Plot container styling specifically for sampling tabs */
        .gradio-container .tabs[id="sampling-tabs"] .plot-container {
            min-height: 600px;
            max-height: 1800px;
            overflow-y: auto;
        }

        /* Ensure text wraps in HTML components */
        .gradio-container .prose {
            white-space: normal;
            word-wrap: break-word;
            overflow-wrap: break-word;
        }

        /* Dropdown button styling for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button {
            background-color: #f0f0f0;
            border: 1px solid #ddd;
            border-radius: 4px;
            padding: 5px 10px;
            cursor: pointer;
            margin: 2px;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown button:hover,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown button:hover {
            background-color: #e0e0e0;
        }

        /* Style dropdown content items for scrollable tabs */
        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div {
            padding: 8px 12px;
            cursor: pointer;
        }

        .gradio-container .tabs[id="reparaphrased-tabs"] .tab-nav .tab-dropdown-content div:hover,
        .gradio-container .tabs[id="sampling-tabs"] .tab-nav .tab-dropdown-content div:hover {
            background-color: #e0e0e0;
        }

        /* Custom styling for execution time display */
        .execution-time {
            text-align: right;
            padding: 8px 16px;
            font-family: inherit;
            color: #555;
            font-size: 0.9rem;
            font-style: italic;
            margin-left: auto;
            width: 100%;
            border-top: 1px solid #eee;
            margin-top: 8px;
        }

        /* Layout for section headers with execution time */
        .section-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            width: 100%;
            margin-bottom: 12px;
        }

        .section-header h3 {
            margin: 0;
        }
        """
        gr.Markdown("# **AIISC Watermarking Model**")

        with gr.Column():
            gr.Markdown("## Input Prompt")
            user_input = gr.Textbox(
                label="Enter Your Prompt",
                placeholder="Type your text here..."
            )

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 1: Paraphrasing, LCS and Entailment Analysis")
            with gr.Column(scale=1):
                step1_time = gr.HTML()

        paraphrase_button = gr.Button("Generate Paraphrases")
        highlighted_user_prompt = gr.HTML(label="Highlighted User Prompt")

        with gr.Tabs():
            with gr.TabItem("Accepted Paraphrased Sentences"):
                highlighted_accepted_sentences = gr.HTML()
            with gr.TabItem("Discarded Paraphrased Sentences"):
                highlighted_discarded_sentences = gr.HTML()

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 2: Where to Mask?")
            with gr.Column(scale=1):
                step2_time = gr.HTML()

        masking_button = gr.Button("Apply Masking")
        gr.Markdown("### Masked Sentence Trees")
        tree1_plots = []
        with gr.Tabs() as tree1_tabs:
            for i in range(10):
                with gr.TabItem(f"Masked Sentence {i+1}"):
                    tree1 = gr.Plot()
                    tree1_plots.append(tree1)

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 3: How to Mask?")
            with gr.Column(scale=1):
                step3_time = gr.HTML()

        sampling_button = gr.Button("Sample Words")
        gr.Markdown("### Sampled Sentence Trees")

        tree2_plots = []
        # Add elem_id to make this tab container scrollable
        with gr.Tabs(elem_id="sampling-tabs") as tree2_tabs:
            for i in range(10):
                with gr.TabItem(f"Sampled Sentence {i+1}"):
                    # Add a custom class to the container to enable proper styling
                    with gr.Column(elem_classes=["plot-container"]):
                        tree2 = gr.Plot()
                        tree2_plots.append(tree2)

        with gr.Row():
            with gr.Column(scale=3):
                gr.Markdown("## Step 4: Re-paraphrasing")
            with gr.Column(scale=1):
                step4_time = gr.HTML()

        reparaphrase_button = gr.Button("Re-paraphrase")
        gr.Markdown("### Reparaphrased Sentences")
        reparaphrased_sentences_tabs = []
        with gr.Tabs(elem_id="reparaphrased-tabs") as reparaphrased_tabs:
            for i in range(120):
                with gr.TabItem(f"Reparaphrased Batch {i+1}"):
                    reparaphrased_sent_html = gr.HTML()
                    reparaphrased_sentences_tabs.append(reparaphrased_sent_html)

        # Connect the interface functions to the buttons
        paraphrase_button.click(
            interface.handle_paraphrase,
            inputs=user_input,
            outputs=[
                highlighted_user_prompt,
                highlighted_accepted_sentences,
                highlighted_discarded_sentences,
                step1_time
            ]
        )

        masking_button.click(
            interface.handle_masking,
            inputs=None,
            outputs=tree1_plots + [step2_time]
        )

        sampling_button.click(
            interface.handle_sampling,
            inputs=None,
            outputs=tree2_plots + [step3_time]
        )

        reparaphrase_button.click(
            interface.handle_reparaphrasing,
            inputs=None,
            outputs=reparaphrased_sentences_tabs + [step4_time]
        )

    return demo

if __name__ == "__main__":
    project_root = Path(__file__).parent.parent
    config_path = project_root / "utils" / "config.yaml"
    config = load_config(config_path)['PECCAVI_TEXT']

    create_gradio_interface(config).launch()
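For orientation, the three tracking dictionaries in WatermarkerInterface compose a hierarchical labelling scheme across the four steps: a masked sentence is labelled "{plot_idx}{masked_idx}", a sampled sentence extends that to "{masked_idx}.{sample_count}", and a re-paraphrased sentence becomes "{tab_count}.({sampled_index}).{i}". The following standalone sketch only illustrates how those format strings compose; the helper functions and example values are not part of the repository.

# Illustrative sketch of the index scheme above (hypothetical helpers, not repo code).
def masked_index(plot_idx: int, masked_idx: int) -> str:
    # Step 2: f"{plot_idx}{masked_idx}", e.g. plot 1, second mask -> "12"
    return f"{plot_idx}{masked_idx}"

def sampled_index(masked_idx: str, sample_count: int) -> str:
    # Step 3: f"{masked_idx}.{sample_count}", e.g. "12.1"
    return f"{masked_idx}.{sample_count}"

def reparaphrased_index(tab_count: int, sampled_idx: str, i: int) -> str:
    # Step 4: f"{tab_count}.({sampled_index}).{i}", e.g. "1.(12.1).3"
    return f"{tab_count}.({sampled_idx}).{i}"

print(masked_index(1, 2))                    # -> 12
print(sampled_index("12", 1))                # -> 12.1
print(reparaphrased_index(1, "12.1", 3))     # -> 1.(12.1).3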
app.py
ADDED
@@ -0,0 +1,20 @@
import gradio as gr
from UI.gradio import create_gradio_interface
from pathlib import Path
from utils.config import load_config

project_root = Path(__file__).resolve().parent
config_path = project_root / "utils" / "config.yaml"
config = load_config(config_path)['PECCAVI_TEXT']

def main():
    """
    This function is the entry point for the PECCAVI Watermarking Model.

    It creates the Gradio interface for the model and runs it.
    """
    create_gradio_interface(config).launch()

if __name__ == "__main__":
    main()
environment.yml
ADDED
@@ -0,0 +1,245 @@
name: panda
channels:
  - conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=conda_forge
  - _openmp_mutex=4.5=2_gnu
  - asttokens=2.4.1=pyhd8ed1ab_0
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2024.8.30=hbcca054_0
  - comm=0.2.2=pyhd8ed1ab_0
  - debugpy=1.8.6=py310hf71b8c6_0
  - decorator=5.1.1=pyhd8ed1ab_0
  - exceptiongroup=1.2.2=pyhd8ed1ab_0
  - executing=2.1.0=pyhd8ed1ab_0
  - ipykernel=6.29.5=pyh3099207_0
  - ipython=8.27.0=pyh707e725_0
  - jedi=0.19.1=pyhd8ed1ab_0
  - jupyter_client=8.6.3=pyhd8ed1ab_0
  - jupyter_core=5.7.2=pyh31011fe_1
  - krb5=1.21.3=h143b758_0
  - ld_impl_linux-64=2.40=h12ee557_0
  - libedit=3.1.20230828=h5eee18b_0
  - libffi=3.4.4=h6a678d5_1
  - libgcc=14.1.0=h77fa898_1
  - libgcc-ng=14.1.0=h69a702a_1
  - libgomp=14.1.0=h77fa898_1
  - libsodium=1.0.20=h4ab18f5_0
  - libstdcxx=14.1.0=hc0a3c3a_1
  - libstdcxx-ng=11.2.0=h1234567_1
  - libuuid=1.41.5=h5eee18b_0
  - matplotlib-inline=0.1.7=pyhd8ed1ab_0
  - ncurses=6.4=h6a678d5_0
  - nest-asyncio=1.6.0=pyhd8ed1ab_0
  - openssl=3.3.2=hb9d3cd8_0
  - packaging=24.1=pyhd8ed1ab_0
  - parso=0.8.4=pyhd8ed1ab_0
  - pexpect=4.9.0=pyhd8ed1ab_0
  - pickleshare=0.7.5=py_1003
  - pip=24.2=py310h06a4308_0
  - platformdirs=4.3.6=pyhd8ed1ab_0
  - prompt-toolkit=3.0.48=pyha770c72_0
  - ptyprocess=0.7.0=pyhd3deb0d_0
  - pure_eval=0.2.3=pyhd8ed1ab_0
  - pygments=2.18.0=pyhd8ed1ab_0
  - python=3.10.14=h955ad1f_1
  - python_abi=3.10=2_cp310
  - pyzmq=26.2.0=py310h71f11fc_2
  - readline=8.2=h5eee18b_0
  - setuptools=75.1.0=py310h06a4308_0
  - sqlite=3.45.3=h5eee18b_0
  - stack_data=0.6.2=pyhd8ed1ab_0
  - tk=8.6.14=h39e8969_0
  - tornado=6.4.1=py310ha75aee5_1
  - traitlets=5.14.3=pyhd8ed1ab_0
  - typing_extensions=4.12.2=pyha770c72_0
  - wcwidth=0.2.13=pyhd8ed1ab_0
  - wheel=0.44.0=py310h06a4308_0
  - xz=5.4.6=h5eee18b_1
  - zeromq=4.3.5=ha4adb4c_5
  - zlib=1.2.13=h5eee18b_1
  - pip:
    - absl-py==2.1.0
    - accelerate==0.33.0
    - aiofiles==23.2.1
    - aiohappyeyeballs==2.3.5
    - aiohttp==3.10.3
    - aiosignal==1.3.1
    - altgraph==0.17.4
    - annotated-types==0.7.0
    - anyio==4.6.0
    - astunparse==1.6.3
    - async-timeout==4.0.3
    - attrs==24.2.0
    - av==12.0.0
    - backports-tarfile==1.2.0
    - beautifulsoup4==4.12.3
    - build==1.2.2
    - cachetools==5.5.0
    - certifi==2024.7.4
    - cffi==1.17.1
    - charset-normalizer==3.3.2
    - clean-fid==0.1.35
    - click==8.1.7
    - colorama==0.4.6
    - contextlib2==21.6.0
    - contourpy==1.2.1
    - cryptography==43.0.1
    - cycler==0.12.1
    - datasets==2.21.0
    - diffusers==0.27.2
    - dill==0.3.8
    - docker-pycreds==0.4.0
    - docutils==0.21.2
    - fastapi==0.115.0
    - ffmpy==0.4.0
    - filelock==3.15.4
    - flatbuffers==24.3.25
    - fonttools==4.53.1
    - frozenlist==1.4.1
    - fsspec==2024.6.1
    - gast==0.4.0
    - gdown==5.2.0
    - gitdb==4.0.11
    - gitpython==3.1.43
    - google-auth==2.35.0
    - google-auth-oauthlib==0.4.6
    - google-pasta==0.2.0
    - gradio==4.44.0
    - gradio-client==1.3.0
    - grpcio==1.65.4
    - h11==0.14.0
    - h5py==3.11.0
    - httpcore==1.0.6
    - httpx==0.27.2
    - huggingface-hub==0.25.2
    - idna==3.7
    - imageio==2.35.0
    - importlib-metadata==8.2.0
    - importlib-resources==6.4.5
    - jaraco-classes==3.4.0
    - jaraco-context==6.0.1
    - jaraco-functools==4.1.0
    - jeepney==0.8.0
    - jinja2==3.1.4
    - joblib==1.4.2
    - json-with-comments==1.2.7
    - keras==3.5.0
    - keras-preprocessing==1.1.2
    - keyring==25.4.1
    - kiwisolver==1.4.5
    - kornia==0.7.4
    - kornia-rs==0.1.7
    - lazy-loader==0.4
    - libclang==18.1.1
    - markdown==3.6
    - markdown-it-py==3.0.0
    - markupsafe==2.1.5
    - matplotlib==3.9.2
    - mdurl==0.1.2
    - ml-collections==0.1.1
    - ml-dtypes==0.4.0
    - more-itertools==10.5.0
    - multidict==6.0.5
    - multiprocess==0.70.16
    - namex==0.0.8
    - networkx==3.3
    - nh3==0.2.18
    - nltk==3.9.1
    - numpy==1.26.4
    - nvidia-cublas-cu11==11.10.3.66
    - nvidia-cuda-nvrtc-cu11==11.7.99
    - nvidia-cuda-runtime-cu11==11.7.99
    - nvidia-cudnn-cu11==8.5.0.96
    - oauthlib==3.2.2
    - opencv-python==4.10.0.84
    - opencv-python-headless==4.10.0.84
    - opt-einsum==3.3.0
    - optree==0.12.1
    - orjson==3.10.7
    - pandas==2.2.2
    - pillow==10.4.0
    - pkginfo==1.10.0
    - plotly==5.24.1
    - protobuf==4.25.5
    - psutil==5.9.8
    - pyarrow==17.0.0
    - pyasn1==0.6.1
    - pyasn1-modules==0.4.1
    - pycparser==2.22
    - pydantic==2.9.2
    - pydantic-core==2.23.4
    - pydub==0.25.1
    - pyinstaller==6.10.0
    - pyinstaller-hooks-contrib==2024.8
    - pyparsing==3.1.2
    - pyproject-hooks==1.1.0
    - pysocks==1.7.1
    - python-dateutil==2.9.0.post0
    - python-multipart==0.0.12
    - pytorch-msssim==1.0.0
    - pytorchcv==0.0.73
    - pytz==2023.3.post1
    - pyyaml==6.0.2
    - readme-renderer==44.0
    - regex==2024.7.24
    - requests==2.32.3
    - requests-oauthlib==2.0.0
    - requests-toolbelt==1.0.0
    - rfc3986==2.0.0
    - rich==13.7.1
    - rsa==4.9
    - ruff==0.6.9
    - safetensors==0.4.4
    - saliency==0.2.1
    - scikit-image==0.24.0
    - scikit-learn==1.6.0
    - scipy==1.14.0
    - secretstorage==3.3.3
    - semantic-version==2.10.0
    - sentence-transformers==3.3.1
    - sentry-sdk==2.15.0
    - setproctitle==1.3.3
    - shapely==2.0.5
    - shellingham==1.5.4
    - six==1.12.0
    - smmap==5.0.1
    - sniffio==1.3.1
    - soupsieve==2.6
    - spaces==0.30.2
    - starlette==0.38.6
    - tenacity==9.0.0
    - tensorboard==2.17.1
    - tensorboard-data-server==0.7.2
    - tensorboard-plugin-wit==1.8.1
    - tensorflow==2.17.0
    - tensorflow-estimator==2.10.0
    - tensorflow-hub==0.16.1
    - tensorflow-intel==0.0.1
    - tensorflow-io-gcs-filesystem==0.31.0
    - termcolor==1.1.0
    - tf-keras==2.17.0
    - threadpoolctl==3.5.0
    - tifffile==2024.8.10
    - timm==1.0.10
    - tokenizers==0.19.1
    - tomli==2.0.1
    - tomlkit==0.12.0
    - torch==1.13.1
    - torchvision==0.14.1
    - tqdm==4.66.5
    - transformers==4.43.3
    - twine==5.1.1
    - typer==0.12.5
    - tzdata==2024.1
    - urllib3==2.2.2
    - uvicorn==0.31.0
    - wandb==0.18.3
    - websockets==12.0
    - werkzeug==3.0.4
    - wrapt==1.11.2
    - xxhash==3.4.1
    - yarl==1.9.4
    - zipp==3.20.0
prefix: /home/ashhar21137/miniconda3/envs/panda
metrics/detectability.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Import necessary libraries
|
| 2 |
+
import nltk
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import matplotlib.pyplot as plt
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
from transformers import BertModel, BertTokenizer
|
| 8 |
+
from scipy import stats
|
| 9 |
+
|
| 10 |
+
# Download NLTK data if not already present
|
| 11 |
+
nltk.download('punkt', quiet=True)
|
| 12 |
+
detectability_val = {}
|
| 13 |
+
|
| 14 |
+
class SentenceDetectabilityCalculator:
|
| 15 |
+
"""
|
| 16 |
+
A class to calculate and analyze detectability metrics between an original sentence and paraphrased sentences.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
def __init__(self, original_sentence, paraphrased_sentences):
|
| 20 |
+
"""
|
| 21 |
+
Initialize the calculator with the original sentence and a list of paraphrased sentences.
|
| 22 |
+
"""
|
| 23 |
+
self.original_sentence = original_sentence
|
| 24 |
+
self.paraphrased_sentences = paraphrased_sentences
|
| 25 |
+
|
| 26 |
+
# Raw metric dictionaries
|
| 27 |
+
self.z_scores = {}
|
| 28 |
+
self.p_values = {}
|
| 29 |
+
self.metric_values = []
|
| 30 |
+
|
| 31 |
+
# Normalized metric dictionaries
|
| 32 |
+
self.normalized_z_scores = {}
|
| 33 |
+
self.normalized_p_values = {}
|
| 34 |
+
|
| 35 |
+
# Combined detectability dictionary
|
| 36 |
+
self.combined_detectabilities = {}
|
| 37 |
+
|
| 38 |
+
# Load pre-trained BERT for embeddings
|
| 39 |
+
self.bert_model = BertModel.from_pretrained('bert-base-uncased')
|
| 40 |
+
self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
| 41 |
+
|
| 42 |
+
def calculate_all_metrics(self):
|
| 43 |
+
"""
|
| 44 |
+
Calculate detectability metrics for each paraphrased sentence.
|
| 45 |
+
"""
|
| 46 |
+
original_embedding = self._get_sentence_embedding(self.original_sentence)
|
| 47 |
+
|
| 48 |
+
# First, compute the metric values (cosine similarities)
|
| 49 |
+
for idx, paraphrased_sentence in enumerate(self.paraphrased_sentences):
|
| 50 |
+
paraphrase_embedding = self._get_sentence_embedding(paraphrased_sentence)
|
| 51 |
+
cosine_sim = cosine_similarity([original_embedding], [paraphrase_embedding])[0][0]
|
| 52 |
+
self.metric_values.append(cosine_sim)
|
| 53 |
+
|
| 54 |
+
# Compute mean and standard deviation of the metric values
|
| 55 |
+
metric_mean = np.mean(self.metric_values)
|
| 56 |
+
metric_std = np.std(self.metric_values)
|
| 57 |
+
|
| 58 |
+
# Compute z-scores and p-values
|
| 59 |
+
for idx, (paraphrased_sentence, metric_value) in enumerate(zip(self.paraphrased_sentences, self.metric_values)):
|
| 60 |
+
key = f"Sentence_{idx+1}"
|
| 61 |
+
z_score = (metric_value - metric_mean) / metric_std if metric_std != 0 else 0.0
|
| 62 |
+
p_value = stats.norm.sf(abs(z_score)) * 2 # two-tailed p-value
|
| 63 |
+
self.z_scores[key] = z_score
|
| 64 |
+
self.p_values[key] = p_value
|
| 65 |
+
|
| 66 |
+
def normalize_metrics(self):
|
| 67 |
+
"""
|
| 68 |
+
Normalize z-scores and p-values to be between 0 and 1.
|
| 69 |
+
"""
|
| 70 |
+
self.normalized_z_scores = self._normalize_dict(self.z_scores)
|
| 71 |
+
self.normalized_p_values = self._normalize_dict(self.p_values)
|
| 72 |
+
|
| 73 |
+
def calculate_combined_detectability(self):
|
| 74 |
+
"""
|
| 75 |
+
Calculate the combined detectability using the root mean square of the normalized metrics.
|
| 76 |
+
"""
|
| 77 |
+
for key in self.normalized_z_scores.keys():
|
| 78 |
+
rms = np.sqrt(
|
| 79 |
+
(
|
| 80 |
+
self.normalized_z_scores[key] ** 2 +
|
| 81 |
+
self.normalized_p_values[key] ** 2
|
| 82 |
+
) / 2
|
| 83 |
+
)
|
| 84 |
+
self.combined_detectabilities[key] = rms
|
| 85 |
+
|
| 86 |
+
def plot_metrics(self):
|
| 87 |
+
"""
|
| 88 |
+
Plot each normalized metric and the combined detectability in separate graphs.
|
| 89 |
+
"""
|
| 90 |
+
keys = list(self.normalized_z_scores.keys())
|
| 91 |
+
indices = np.arange(len(keys))
|
| 92 |
+
|
| 93 |
+
# Prepare data for plotting
|
| 94 |
+
metrics = {
|
| 95 |
+
'Z-Score': [self.normalized_z_scores[key] for key in keys],
|
| 96 |
+
'P-Value': [self.normalized_p_values[key] for key in keys],
|
| 97 |
+
'Combined Detectability': [self.combined_detectabilities[key] for key in keys]
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
# Plot each metric separately
|
| 101 |
+
for metric_name, values in metrics.items():
|
| 102 |
+
plt.figure(figsize=(12, 6))
|
| 103 |
+
plt.plot(indices, values, marker='o', color=np.random.rand(3,))
|
| 104 |
+
plt.xlabel('Sentence Index')
|
| 105 |
+
plt.ylabel('Normalized Value (0-1)')
|
| 106 |
+
plt.title(f'Normalized {metric_name}')
|
| 107 |
+
plt.grid(True)
|
| 108 |
+
plt.tight_layout()
|
| 109 |
+
plt.show()
|
| 110 |
+
|
| 111 |
+
# Private methods
|
| 112 |
+
def _get_sentence_embedding(self, sentence):
|
| 113 |
+
"""
|
| 114 |
+
Get sentence embedding using BERT.
|
| 115 |
+
"""
|
| 116 |
+
tokens = self.bert_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=512)
|
| 117 |
+
with torch.no_grad():
|
| 118 |
+
outputs = self.bert_model(**tokens)
|
| 119 |
+
return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
|
| 120 |
+
|
| 121 |
+
def _normalize_dict(self, metric_dict):
|
| 122 |
+
"""
|
| 123 |
+
Normalize the values in a dictionary to be between 0 and 1.
|
| 124 |
+
"""
|
| 125 |
+
values = np.array(list(metric_dict.values()))
|
| 126 |
+
min_val = values.min()
|
| 127 |
+
max_val = values.max()
|
| 128 |
+
# Avoid division by zero if all values are the same
|
| 129 |
+
if max_val - min_val == 0:
|
| 130 |
+
normalized_values = np.zeros_like(values)
|
| 131 |
+
else:
|
| 132 |
+
normalized_values = (values - min_val) / (max_val - min_val)
|
| 133 |
+
return dict(zip(metric_dict.keys(), normalized_values))
|
| 134 |
+
|
| 135 |
+
# Getter methods
|
| 136 |
+
def get_normalized_metrics(self):
|
| 137 |
+
"""
|
| 138 |
+
Get all normalized metrics as a dictionary.
|
| 139 |
+
"""
|
| 140 |
+
return {
|
| 141 |
+
'Z-Score': self.normalized_z_scores,
|
| 142 |
+
'P-Value': self.normalized_p_values
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
def get_combined_detectabilities(self):
|
| 146 |
+
"""
|
| 147 |
+
Get the dictionary of combined detectability values.
|
| 148 |
+
"""
|
| 149 |
+
return self.combined_detectabilities
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# Example usage
|
| 153 |
+
if __name__ == "__main__":
|
| 154 |
+
# Original sentence
|
| 155 |
+
original_sentence = "The quick brown fox jumps over the lazy dog"
|
| 156 |
+
|
| 157 |
+
# Paraphrased sentences
|
| 158 |
+
paraphrased_sentences = [
|
| 159 |
+
# Original 1: "A swift auburn fox leaps across a sleepy canine."
|
| 160 |
+
"The swift auburn fox leaps across a sleepy canine.",
|
| 161 |
+
"A quick auburn fox leaps across a sleepy canine.",
|
| 162 |
+
"A swift ginger fox leaps across a sleepy canine.",
|
| 163 |
+
"A swift auburn fox bounds across a sleepy canine.",
|
| 164 |
+
"A swift auburn fox leaps across a tired canine.",
|
| 165 |
+
"Three swift auburn foxes leap across a sleepy canine.",
|
| 166 |
+
"The vulpine specimen rapidly traverses over a dormant dog.",
|
| 167 |
+
"Like lightning, the russet hunter soars over the drowsy guardian.",
|
| 168 |
+
"Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
|
| 169 |
+
"One rapid Vulpes vulpes traverses the path of a quiescent canine.",
|
| 170 |
+
"A swift auburn predator navigates across a lethargic pet.",
|
| 171 |
+
"Subject A (fox) demonstrates velocity over Subject B (dog).",
|
| 172 |
+
|
| 173 |
+
# Original 2: "The agile russet fox bounds over an idle hound."
|
| 174 |
+
"Some agile russet foxes bound over an idle hound.",
|
| 175 |
+
"The nimble russet fox bounds over an idle hound.",
|
| 176 |
+
"The agile brown fox bounds over an idle hound.",
|
| 177 |
+
"The agile russet fox jumps over an idle hound.",
|
| 178 |
+
"The agile russet fox bounds over a lazy hound.",
|
| 179 |
+
"Two agile russet foxes bound over an idle hound.",
|
| 180 |
+
"A dexterous vulpine surpasses a stationary canine.",
|
| 181 |
+
"Quick as thought, the copper warrior sails over the guardian.",
|
| 182 |
+
"Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
|
| 183 |
+
"A dexterous V. vulpes exceeds the plane of an inactive canine.",
|
| 184 |
+
"An agile russet hunter maneuvers above a resting hound.",
|
| 185 |
+
"Test subject F-1 achieves displacement superior to subject D-1.",
|
| 186 |
+
|
| 187 |
+
# Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
|
| 188 |
+
"The nimble mahogany vulpine vaults above a drowsy dog.",
|
| 189 |
+
"A swift mahogany vulpine vaults above a drowsy dog.",
|
| 190 |
+
"A nimble reddish vulpine vaults above a drowsy dog.",
|
| 191 |
+
"A nimble mahogany fox vaults above a drowsy dog.",
|
| 192 |
+
"A nimble mahogany vulpine leaps above a drowsy dog.",
|
| 193 |
+
"Four nimble mahogany vulpines vault above a drowsy dog.",
|
| 194 |
+
"An agile specimen of reddish fur surpasses a somnolent canine.",
|
| 195 |
+
"Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
|
| 196 |
+
"Tha quick brown beastie jumps o'er the tired pup, aye.",
|
| 197 |
+
"Single V. vulpes demonstrates vertical traverse over C. familiaris.",
|
| 198 |
+
"A nimble rust-colored predator crosses above a drowsy pet.",
|
| 199 |
+
"Observed: Subject Red executes vertical motion over Subject Gray.",
|
| 200 |
+
|
| 201 |
+
# Original 4: "The speedy copper-colored fox hops over the lethargic pup."
|
| 202 |
+
"A speedy copper-colored fox hops over the lethargic pup.",
|
| 203 |
+
"The quick copper-colored fox hops over the lethargic pup.",
|
| 204 |
+
"The speedy bronze fox hops over the lethargic pup.",
|
| 205 |
+
"The speedy copper-colored fox jumps over the lethargic pup.",
|
| 206 |
+
"The speedy copper-colored fox hops over the tired pup.",
|
| 207 |
+
"Multiple speedy copper-colored foxes hop over the lethargic pup.",
|
| 208 |
+
"A rapid vulpine of bronze hue traverses an inactive young canine.",
|
| 209 |
+
"Swift as a dart, the metallic hunter bounds over the lazy puppy.",
|
| 210 |
+
"Tha fast copper beastie leaps o'er the sleepy wee dog.",
|
| 211 |
+
"1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
|
| 212 |
+
"A fleet copper-toned predator moves past a sluggish young dog.",
|
    "Field note: Adult fox subject exceeds puppy subject vertically.",

    # Original 5: "A rapid tawny fox springs over a sluggish dog."
    "The rapid tawny fox springs over a sluggish dog.",
    "A quick tawny fox springs over a sluggish dog.",
    "A rapid golden fox springs over a sluggish dog.",
    "A rapid tawny fox jumps over a sluggish dog.",
    "A rapid tawny fox springs over a lazy dog.",
    "Six rapid tawny foxes spring over a sluggish dog.",
    "An expeditious yellowish vulpine surpasses a torpid canine.",
    "Fast as a bullet, the golden hunter vaults over the idle guard.",
    "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
    "One V. vulpes displays rapid transit over one inactive C. familiaris.",
    "A speedy yellow-brown predator bypasses a motionless dog.",
    "Log entry: Vulpine subject achieves swift vertical displacement.",

    # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
    "A fleet-footed chestnut fox soars above an indolent canine.",
    "The swift chestnut fox soars above an indolent canine.",
    "The fleet-footed brown fox soars above an indolent canine.",
    "The fleet-footed chestnut fox leaps above an indolent canine.",
    "The fleet-footed chestnut fox soars above a lazy canine.",
    "Several fleet-footed chestnut foxes soar above an indolent canine.",
    "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
    "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
    "Tha quick brown beastie sails o'er the sleepy hound, ken.",
    "Single agile V. vulpes achieves elevation above stationary canine.",
    "A nimble brown predator glides over an unmoving domestic animal.",
    "Research note: Brown subject displays superior vertical mobility.",

    # Original 7: "A fast ginger fox hurdles past a slothful dog."
    "The fast ginger fox hurdles past a slothful dog.",
    "A quick ginger fox hurdles past a slothful dog.",
    "A fast red fox hurdles past a slothful dog.",
    "A fast ginger fox jumps past a slothful dog.",
    "A fast ginger fox hurdles past a lazy dog.",
    "Five fast ginger foxes hurdle past a slothful dog.",
    "A rapid orange vulpine bypasses a lethargic canine.",
    "Quick as lightning, the flame-colored hunter races past the lazy guard.",
    "Tha swift ginger beastie leaps past the tired doggy, ye see.",
    "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
    "A speedy red-orange predator overtakes a motionless dog.",
    "Data point: Orange subject demonstrates rapid transit past Gray subject.",

    # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
    "A spry rusty-colored fox jumps across a dozing hound.",
    "The agile rusty-colored fox jumps across a dozing hound.",
    "The spry reddish fox jumps across a dozing hound.",
    "The spry rusty-colored fox leaps across a dozing hound.",
    "The spry rusty-colored fox jumps across a sleeping hound.",
    "Multiple spry rusty-colored foxes jump across a dozing hound.",
    "An agile rust-toned vulpine traverses a somnolent canine.",
    "Nimble as thought, the copper hunter bounds over the resting guard.",
    "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
    "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
    "A lithe rust-tinted predator moves past a slumbering dog.",
    "Observation: Russet subject exhibits agility over dormant subject.",

    # Original 9: "A quick tan fox leaps over an inactive dog."
    "The quick tan fox leaps over an inactive dog.",
    "A swift tan fox leaps over an inactive dog.",
    "A quick beige fox leaps over an inactive dog.",
    "A quick tan fox jumps over an inactive dog.",
    "A quick tan fox leaps over a motionless dog.",
    "Seven quick tan foxes leap over an inactive dog.",
    "A rapid light-brown vulpine surpasses a stationary canine.",
    "Fast as wind, the sand-colored hunter soars over the still guard.",
    "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
    "One agile fawn V. vulpes traverses one immobile C. familiaris.",
    "A fleet tan-colored predator bypasses an unmoving dog.",
    "Field report: Tan subject demonstrates movement over static subject.",

    # Original 10: "The brisk auburn vulpine bounces over a listless canine."
    "Some brisk auburn vulpines bounce over a listless canine.",
    "The quick auburn vulpine bounces over a listless canine.",
    "The brisk russet vulpine bounces over a listless canine.",
    "The brisk auburn fox bounces over a listless canine.",
    "The brisk auburn vulpine jumps over a listless canine.",
    "Five brisk auburn vulpines bounce over a listless canine.",
    "The expeditious specimen supersedes a quiescent Canis lupus.",
    "Swift as wind, the russet hunter vaults over the idle guardian.",
    "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
    "One V. vulpes achieves displacement over inactive C. familiaris.",
    "A high-velocity auburn predator traverses an immobile animal.",
    "Final observation: Red subject shows mobility over Gray subject."
]

# Initialize the calculator
calculator = SentenceDetectabilityCalculator(original_sentence, paraphrased_sentences)

# Calculate all metrics
calculator.calculate_all_metrics()

# Normalize the metrics
calculator.normalize_metrics()

# Calculate combined detectability
calculator.calculate_combined_detectability()

# Retrieve the normalized metrics and combined detectabilities
normalized_metrics = calculator.get_normalized_metrics()
combined_detectabilities = calculator.get_combined_detectabilities()
detectability_val = combined_detectabilities

# Display the results
print("\nCombined Detectabilities:")
for each in combined_detectabilities.items():
    print(f"{each[1]}")

# Plot the metrics (optional)
# calculator.plot_metrics()
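The example above only prints the combined scores. As a minimal, hypothetical follow-up sketch (not part of the uploaded file), the `Sentence_i` keys returned by `get_combined_detectabilities()` could be ranked to pick out the most and least detectable paraphrases; the dictionary values here are made up for illustration:

# Hypothetical post-processing sketch: rank paraphrases by combined detectability.
# Assumes a dict shaped like the calculator's output, e.g. {"Sentence_1": 0.42, ...}.
combined_detectabilities = {"Sentence_1": 0.42, "Sentence_2": 0.77, "Sentence_3": 0.13}

ranked = sorted(combined_detectabilities.items(), key=lambda kv: kv[1], reverse=True)
most_key, most_score = ranked[0]
least_key, least_score = ranked[-1]

print(f"Most detectable:  {most_key} ({most_score:.2f})")
print(f"Least detectable: {least_key} ({least_score:.2f})")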
metrics/distortion.py
ADDED
@@ -0,0 +1,370 @@
import os
import sys
from tqdm import tqdm
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from bert_score import BERTScorer
from bert_score.utils import model2layers
from nltk.tokenize import word_tokenize
from Levenshtein import distance as levenshtein_distance
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from config.config import load_config

config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')
config = load_config(config_path)['PECCAVI_TEXT']['Metrics']


class SentenceDistortionCalculator:
    """
    A class to calculate and analyze distortion metrics between an original sentence and modified sentences.
    """

    def __init__(self, config, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of modified sentences.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        self.levenshtein_distances = {}
        self.bert_scores = {}
        self.mover_scores = {}

        self.normalized_levenshtein = {}
        self.normalized_bert_scores = {}
        self.normalized_mover_scores = {}
        self.combined_distortions = {}

        self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion'])
        self.model = GPT2LMHeadModel.from_pretrained(config['Distortion'])
        self.model.eval()

    def calculate_all_metrics(self):
        """
        Calculate all distortion metrics for each modified sentence.
        """
        for idx, modified_sentence in tqdm(enumerate(self.paraphrased_sentences), total=len(self.paraphrased_sentences), desc="Calculating Metrics"):
            key = f"Sentence_{idx+1}"
            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
            self.bert_scores[key] = self._calculate_bert_score(modified_sentence)
            self.mover_scores[key] = self._calculate_mover_score(modified_sentence)

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1.
        """
        for _ in tqdm(range(1), desc="Normalizing Metrics"):
            self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
            self.normalized_bert_scores = self._normalize_dict(self.bert_scores)
            self.normalized_mover_scores = self._normalize_dict(self.mover_scores)

    def calculate_combined_distortion(self):
        """
        Calculate the combined distortion using the root mean square of the normalized metrics.
        """
        for _ in tqdm(range(1), desc="Calculating Combined Distortion"):
            for key in self.normalized_levenshtein.keys():
                rms = np.sqrt(
                    (
                        self.normalized_levenshtein[key] ** 2 +
                        self.normalized_bert_scores[key] ** 2 +
                        self.normalized_mover_scores[key] ** 2
                    ) / 3
                )
                self.combined_distortions[key] = rms

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined distortion in separate graphs.
        """
        keys = list(self.normalized_levenshtein.keys())
        indices = np.arange(len(keys))

        # Prepare data for plotting
        metrics = {
            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
            'BERTScore': [self.normalized_bert_scores[key] for key in keys],
            'MOVERscore': [self.normalized_mover_scores[key] for key in keys],
            'Combined Distortion': [self.combined_distortions[key] for key in keys]
        }

        # Plot each metric separately
        for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"):
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    def _calculate_levenshtein_distance(self, modified_sentence):
        """
        Calculate the word-level Levenshtein distance between the original and modified sentence.
        """
        words1 = word_tokenize(self.original_sentence)
        words2 = word_tokenize(modified_sentence)
        lev_distance = levenshtein_distance(words1, words2)
        return lev_distance / max(len(words1), len(words2))

    def _calculate_bert_score(self, modified_sentence):
        """
        Compute the BERTScore similarity between the original and modified sentence.
        Returns 1 - F1 score to represent dissimilarity.
        """
        if not hasattr(self, 'original_sentence'):
            raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.")
        if not isinstance(modified_sentence, str):
            raise ValueError("modified_sentence must be a string.")

        model_type = "microsoft/deberta-xlarge-mnli"
        num_layers = model2layers[model_type]

        if not hasattr(self, "cached_bertscorer"):
            self.cached_bertscorer = BERTScorer(
                model_type=model_type,
                num_layers=num_layers,
                batch_size=1,  # Single sentence comparison
                nthreads=4,
                all_layers=False,
                idf=False,
                device="cuda" if torch.cuda.is_available() else "cpu",
                lang="en"
            )

        # Compute BERTScore
        _, _, F1 = self.cached_bertscorer.score(
            cands=[modified_sentence],
            refs=[self.original_sentence],
            verbose=False,
            batch_size=1
        )

        return 1 - F1.item()  # Return dissimilarity score

    def _calculate_mover_score(self, modified_sentence, model_name='all-MiniLM-L6-v2'):
        """Compute MoverScore correctly using word-level embeddings."""
        if not self.original_sentence:
            raise ValueError("Original sentence not provided.")

        # Tokenize sentences
        original_tokens = self.original_sentence.split()
        modified_tokens = modified_sentence.split()
        model = SentenceTransformer(model_name)

        # Compute word embeddings
        original_embeddings = model.encode(original_tokens, convert_to_numpy=True)
        modified_embeddings = model.encode(modified_tokens, convert_to_numpy=True)

        # Compute cost matrix (cosine distance)
        cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine')

        # Solve optimal transport problem (Hungarian Algorithm)
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Compute IDF weights
        vectorizer = TfidfVectorizer()
        vectorizer.fit([self.original_sentence, modified_sentence])
        idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

        # Apply IDF weighting to aligned word pairs
        idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens])
        idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens])
        combined_idf_weights = (idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2
        weighted_score = np.sum((1 - cost_matrix[row_ind, col_ind]) * combined_idf_weights) / np.sum(combined_idf_weights)

        return 1 - weighted_score  # Higher score = more dissimilar

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.
        """
        values = np.array(list(metric_dict.values()))
        min_val = values.min()
        max_val = values.max()
        if max_val - min_val == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / (max_val - min_val)
        return dict(zip(metric_dict.keys(), normalized_values))

    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.
        """
        return {
            'Min Edit Distance': self.normalized_levenshtein,
            'BERTScore': self.normalized_bert_scores,
            'Mover Score': self.normalized_mover_scores
        }

    def get_combined_distortions(self):
        """
        Get the dictionary of combined distortion values.
        """
        return self.combined_distortions


# Example usage
if __name__ == "__main__":

    config = load_config(config_path)['PECCAVI_TEXT']['Metrics']

    # Original sentence
    original_sentence = "The quick brown fox jumps over the lazy dog"

    # Paraphrased sentences
    paraphrased_sentences = [
        # Original 1: "A swift auburn fox leaps across a sleepy canine."
        "The swift auburn fox leaps across a sleepy canine.",
        "A quick auburn fox leaps across a sleepy canine.",
        "A swift ginger fox leaps across a sleepy canine.",
        "A swift auburn fox bounds across a sleepy canine.",
        "A swift auburn fox leaps across a tired canine.",
        "Three swift auburn foxes leap across a sleepy canine.",
        "The vulpine specimen rapidly traverses over a dormant dog.",
        "Like lightning, the russet hunter soars over the drowsy guardian.",
        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
        "A swift auburn predator navigates across a lethargic pet.",
        "Subject A (fox) demonstrates velocity over Subject B (dog).",

        # Original 2: "The agile russet fox bounds over an idle hound."
        "Some agile russet foxes bound over an idle hound.",
        "The nimble russet fox bounds over an idle hound.",
        "The agile brown fox bounds over an idle hound.",
        "The agile russet fox jumps over an idle hound.",
        "The agile russet fox bounds over a lazy hound.",
        "Two agile russet foxes bound over an idle hound.",
        "A dexterous vulpine surpasses a stationary canine.",
        "Quick as thought, the copper warrior sails over the guardian.",
        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
        "An agile russet hunter maneuvers above a resting hound.",
        "Test subject F-1 achieves displacement superior to subject D-1.",

        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
        "The nimble mahogany vulpine vaults above a drowsy dog.",
        "A swift mahogany vulpine vaults above a drowsy dog.",
        "A nimble reddish vulpine vaults above a drowsy dog.",
        "A nimble mahogany fox vaults above a drowsy dog.",
        "A nimble mahogany vulpine leaps above a drowsy dog.",
        "Four nimble mahogany vulpines vault above a drowsy dog.",
        "An agile specimen of reddish fur surpasses a somnolent canine.",
        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
        "Tha quick brown beastie jumps o'er the tired pup, aye.",
        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
        "A nimble rust-colored predator crosses above a drowsy pet.",
        "Observed: Subject Red executes vertical motion over Subject Gray.",

        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
        "A speedy copper-colored fox hops over the lethargic pup.",
        "The quick copper-colored fox hops over the lethargic pup.",
        "The speedy bronze fox hops over the lethargic pup.",
        "The speedy copper-colored fox jumps over the lethargic pup.",
        "The speedy copper-colored fox hops over the tired pup.",
        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
        "A rapid vulpine of bronze hue traverses an inactive young canine.",
        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
        "A fleet copper-toned predator moves past a sluggish young dog.",
        "Field note: Adult fox subject exceeds puppy subject vertically.",

        # Original 5: "A rapid tawny fox springs over a sluggish dog."
        "The rapid tawny fox springs over a sluggish dog.",
        "A quick tawny fox springs over a sluggish dog.",
        "A rapid golden fox springs over a sluggish dog.",
        "A rapid tawny fox jumps over a sluggish dog.",
        "A rapid tawny fox springs over a lazy dog.",
        "Six rapid tawny foxes spring over a sluggish dog.",
        "An expeditious yellowish vulpine surpasses a torpid canine.",
        "Fast as a bullet, the golden hunter vaults over the idle guard.",
        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
        "A speedy yellow-brown predator bypasses a motionless dog.",
        "Log entry: Vulpine subject achieves swift vertical displacement.",

        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
        "A fleet-footed chestnut fox soars above an indolent canine.",
        "The swift chestnut fox soars above an indolent canine.",
        "The fleet-footed brown fox soars above an indolent canine.",
        "The fleet-footed chestnut fox leaps above an indolent canine.",
        "The fleet-footed chestnut fox soars above a lazy canine.",
        "Several fleet-footed chestnut foxes soar above an indolent canine.",
        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
        "Single agile V. vulpes achieves elevation above stationary canine.",
        "A nimble brown predator glides over an unmoving domestic animal.",
        "Research note: Brown subject displays superior vertical mobility.",

        # Original 7: "A fast ginger fox hurdles past a slothful dog."
        "The fast ginger fox hurdles past a slothful dog.",
        "A quick ginger fox hurdles past a slothful dog.",
        "A fast red fox hurdles past a slothful dog.",
        "A fast ginger fox jumps past a slothful dog.",
        "A fast ginger fox hurdles past a lazy dog.",
        "Five fast ginger foxes hurdle past a slothful dog.",
        "A rapid orange vulpine bypasses a lethargic canine.",
        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
        "A speedy red-orange predator overtakes a motionless dog.",
        "Data point: Orange subject demonstrates rapid transit past Gray subject.",

        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
        "A spry rusty-colored fox jumps across a dozing hound.",
        "The agile rusty-colored fox jumps across a dozing hound.",
        "The spry reddish fox jumps across a dozing hound.",
        "The spry rusty-colored fox leaps across a dozing hound.",
        "The spry rusty-colored fox jumps across a sleeping hound.",
        "Multiple spry rusty-colored foxes jump across a dozing hound.",
        "An agile rust-toned vulpine traverses a somnolent canine.",
        "Nimble as thought, the copper hunter bounds over the resting guard.",
        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
        "A lithe rust-tinted predator moves past a slumbering dog.",
        "Observation: Russet subject exhibits agility over dormant subject.",

        # Original 9: "A quick tan fox leaps over an inactive dog."
        "The quick tan fox leaps over an inactive dog.",
        "A swift tan fox leaps over an inactive dog.",
        "A quick beige fox leaps over an inactive dog.",
        "A quick tan fox jumps over an inactive dog.",
        "A quick tan fox leaps over a motionless dog.",
        "Seven quick tan foxes leap over an inactive dog.",
        "A rapid light-brown vulpine surpasses a stationary canine.",
        "Fast as wind, the sand-colored hunter soars over the still guard.",
        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
        "A fleet tan-colored predator bypasses an unmoving dog.",
        "Field report: Tan subject demonstrates movement over static subject.",

        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
        "Some brisk auburn vulpines bounce over a listless canine.",
        "The quick auburn vulpine bounces over a listless canine.",
        "The brisk russet vulpine bounces over a listless canine.",
        "The brisk auburn fox bounces over a listless canine.",
        "The brisk auburn vulpine jumps over a listless canine.",
        "Five brisk auburn vulpines bounce over a listless canine.",
        "The expeditious specimen supersedes a quiescent Canis lupus.",
        "Swift as wind, the russet hunter vaults over the idle guardian.",
        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
        "One V. vulpes achieves displacement over inactive C. familiaris.",
        "A high-velocity auburn predator traverses an immobile animal.",
        "Final observation: Red subject shows mobility over Gray subject."
    ]

    distortion_calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences)
    for _ in tqdm(range(1)):
        distortion_calculator.calculate_all_metrics()
        distortion_calculator.normalize_metrics()
        distortion_calculator.calculate_combined_distortion()

    distortion_calculator.plot_metrics()
    print("Normalized Metrics:", distortion_calculator.get_normalized_metrics())
    print("Combined Distortion:", distortion_calculator.get_combined_distortions())
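For reference, the combined distortion computed above is simply the root mean square of the three normalized metrics. A standalone sketch of that combination on made-up values (independent of the class; the numbers are illustrative only, not output of the uploaded code):

import numpy as np

# Illustrative normalized metric values for one paraphrase (made-up numbers)
normalized_levenshtein = 0.30
normalized_bert = 0.10
normalized_mover = 0.20

# Same combination as SentenceDistortionCalculator.calculate_combined_distortion:
# RMS = sqrt((lev^2 + bert^2 + mover^2) / 3)
combined = np.sqrt((normalized_levenshtein ** 2 + normalized_bert ** 2 + normalized_mover ** 2) / 3)
print(round(float(combined), 4))  # ~0.216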
renderers/__pycache__/highlighter.cpython-310.pyc
ADDED
Binary file (4.98 kB). View file

renderers/__pycache__/highlighter.cpython-311.pyc
ADDED
Binary file (8.07 kB). View file

renderers/__pycache__/plot_3d.cpython-310.pyc
ADDED
Binary file (4.34 kB). View file

renderers/__pycache__/plot_3d.cpython-311.pyc
ADDED
Binary file (6 kB). View file

renderers/__pycache__/tree.cpython-310.pyc
ADDED
Binary file (10.6 kB). View file

renderers/__pycache__/tree.cpython-311.pyc
ADDED
Binary file (21.1 kB). View file

renderers/highlighter.py
ADDED
@@ -0,0 +1,185 @@
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
def create_matching_pattern(word):
|
| 4 |
+
"""Creates appropriate regex pattern based on word characteristics"""
|
| 5 |
+
escaped_word = re.escape(word)
|
| 6 |
+
|
| 7 |
+
# Check for special cases
|
| 8 |
+
if any(char in word for char in '&-/.\'()[]$€£¥+') or ' ' in word:
|
| 9 |
+
# Special handling for phrases with special characters or spaces
|
| 10 |
+
return rf'{escaped_word}'
|
| 11 |
+
elif word.endswith('%'):
|
| 12 |
+
# Special handling for percentage values
|
| 13 |
+
numeric_part = word[:-1]
|
| 14 |
+
return rf'\b{re.escape(numeric_part)}\s*%'
|
| 15 |
+
elif re.search(r'[0-9]', word) and re.search(r'[a-zA-Z]', word):
|
| 16 |
+
# Special handling for alphanumeric combinations
|
| 17 |
+
return rf'{escaped_word}'
|
| 18 |
+
else:
|
| 19 |
+
# Standard word boundary pattern for simple words
|
| 20 |
+
return rf'\b{escaped_word}\b'
|
| 21 |
+
|
| 22 |
+
def highlight_common_words(common_words, sentences, title):
|
| 23 |
+
"""
|
| 24 |
+
Highlight common words in sentences by adding color-coded background and unique IDs.
|
| 25 |
+
|
| 26 |
+
Args:
|
| 27 |
+
common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
|
| 28 |
+
sentences (list of str): List of sentences to search through.
|
| 29 |
+
title (str): The title for the HTML output.
|
| 30 |
+
|
| 31 |
+
Returns:
|
| 32 |
+
str: HTML string with the highlighted sentences.
|
| 33 |
+
"""
|
| 34 |
+
color_map = {}
|
| 35 |
+
color_index = 0
|
| 36 |
+
highlighted_html = []
|
| 37 |
+
pastel_colors = ['#E199C6','#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
|
| 38 |
+
"#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
|
| 39 |
+
"#65CFA5", "#B38FDE", "#E6C97A"]
|
| 40 |
+
|
| 41 |
+
# Process each sentence
|
| 42 |
+
for idx, sentence in enumerate(sentences, start=1):
|
| 43 |
+
sentence_with_idx = f"{idx}. {sentence}"
|
| 44 |
+
highlighted_sentence = sentence_with_idx
|
| 45 |
+
|
| 46 |
+
# Highlight common words in each sentence
|
| 47 |
+
for index, word in common_words:
|
| 48 |
+
if word not in color_map:
|
| 49 |
+
color_map[word] = pastel_colors[color_index % len(pastel_colors)]
|
| 50 |
+
color_index += 1
|
| 51 |
+
|
| 52 |
+
# Create appropriate pattern based on word characteristics
|
| 53 |
+
pattern = create_matching_pattern(word)
|
| 54 |
+
|
| 55 |
+
# Replace the word with highlighted version
|
| 56 |
+
highlighted_sentence = re.sub(
|
| 57 |
+
pattern,
|
| 58 |
+
lambda m, idx=index, color=color_map[word]: (
|
| 59 |
+
f'<span style="background-color: {color}; font-weight: bold;'
|
| 60 |
+
f' padding: 2px 4px; border-radius: 2px; position: relative;">'
|
| 61 |
+
f'<span style="background-color: black; color: white; border-radius: 50%;'
|
| 62 |
+
f' padding: 2px 5px; margin-right: 5px;">{idx}</span>'
|
| 63 |
+
f'{m.group(0)}'
|
| 64 |
+
f'</span>'
|
| 65 |
+
),
|
| 66 |
+
highlighted_sentence,
|
| 67 |
+
flags=re.IGNORECASE
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
highlighted_html.append(highlighted_sentence)
|
| 71 |
+
|
| 72 |
+
# Format the HTML output with the title
|
| 73 |
+
final_html = "<br><br>".join(highlighted_html)
|
| 74 |
+
return f'''
|
| 75 |
+
<div style="border: solid 1px #FFFFFF; padding: 16px; background-color: #000000; color: #FFFFFF; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
|
| 76 |
+
<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
|
| 77 |
+
<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
|
| 78 |
+
</div>
|
| 79 |
+
'''
|
| 80 |
+
|
| 81 |
+
def highlight_common_words_dict(common_words, sentences, title):
|
| 82 |
+
"""
|
| 83 |
+
Highlight common words in sentences (from a dictionary) by adding color-coded background and unique IDs.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
|
| 87 |
+
sentences (dict): A dictionary of sentences where the key is the sentence and the value is an entailment score.
|
| 88 |
+
title (str): The title for the HTML output.
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
str: HTML string with the highlighted sentences and their entailment scores.
|
| 92 |
+
"""
|
| 93 |
+
color_map = {}
|
| 94 |
+
color_index = 0
|
| 95 |
+
highlighted_html = []
|
| 96 |
+
pastel_colors = ['#E199C6','#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
|
| 97 |
+
"#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
|
| 98 |
+
"#65CFA5", "#B38FDE", "#E6C97A"]
|
| 99 |
+
|
| 100 |
+
# Process each sentence and its score
|
| 101 |
+
for idx, (sentence, score) in enumerate(sentences.items(), start=1):
|
| 102 |
+
sentence_with_idx = f"{idx}. {sentence}"
|
| 103 |
+
highlighted_sentence = sentence_with_idx
|
| 104 |
+
|
| 105 |
+
# Highlight common words in each sentence
|
| 106 |
+
for index, word in common_words:
|
| 107 |
+
if word not in color_map:
|
| 108 |
+
color_map[word] = pastel_colors[color_index % len(pastel_colors)]
|
| 109 |
+
color_index += 1
|
| 110 |
+
|
| 111 |
+
# Create appropriate pattern based on word characteristics
|
| 112 |
+
pattern = create_matching_pattern(word)
|
| 113 |
+
|
| 114 |
+
# Replace the word with highlighted version
|
| 115 |
+
highlighted_sentence = re.sub(
|
| 116 |
+
pattern,
|
| 117 |
+
lambda m, idx=index, color=color_map[word]: (
|
| 118 |
+
f'<span style="background-color: {color}; font-weight: bold;'
|
| 119 |
+
f' padding: 1px 2px; border-radius: 2px; position: relative;">'
|
| 120 |
+
f'<span style="background-color: black; color: white; border-radius: 50%;'
|
| 121 |
+
f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{idx}</span>'
|
| 122 |
+
f'{m.group(0)}'
|
| 123 |
+
f'</span>'
|
| 124 |
+
),
|
| 125 |
+
highlighted_sentence,
|
| 126 |
+
flags=re.IGNORECASE
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
# Add the entailment score
|
| 130 |
+
highlighted_html.append(
|
| 131 |
+
f'<div style="margin-bottom: 5px;">'
|
| 132 |
+
f'{highlighted_sentence}'
|
| 133 |
+
f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px; '
|
| 134 |
+
f'background-color: #333333; color: white; font-size: 0.9em;">'
|
| 135 |
+
f'Entailment Score: {score}</div></div>'
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
# Format the HTML output with the title
|
| 139 |
+
final_html = "<br>".join(highlighted_html)
|
| 140 |
+
return f'''
|
| 141 |
+
<div style="background-color: #000000; color: #FFFFFF;border: solid 1px #FFFFFF; border-radius: 8px;">
|
| 142 |
+
<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
|
| 143 |
+
<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
|
| 144 |
+
</div>
|
| 145 |
+
'''
|
| 146 |
+
|
| 147 |
+
def reparaphrased_sentences_html(sentences):
|
| 148 |
+
"""
|
| 149 |
+
Create an HTML representation of sentences with numbering.
|
| 150 |
+
|
| 151 |
+
Args:
|
| 152 |
+
sentences (list of str): List of sentences to format.
|
| 153 |
+
|
| 154 |
+
Returns:
|
| 155 |
+
str: HTML string with numbered sentences.
|
| 156 |
+
"""
|
| 157 |
+
formatted_sentences = []
|
| 158 |
+
|
| 159 |
+
# Process each sentence
|
| 160 |
+
for idx, sentence in enumerate(sentences, start=1):
|
| 161 |
+
sentence_with_idx = f"{idx}. {sentence}"
|
| 162 |
+
formatted_sentences.append(sentence_with_idx)
|
| 163 |
+
|
| 164 |
+
# Format the HTML output
|
| 165 |
+
final_html = "<br><br>".join(formatted_sentences)
|
| 166 |
+
return f'''
|
| 167 |
+
<div style="border: solid 1px #FFFFFF; background-color: #000000; color: #FFFFFF;
|
| 168 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
|
| 169 |
+
<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
|
| 170 |
+
</div>
|
| 171 |
+
'''
|
| 172 |
+
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
# Example usage
|
| 175 |
+
common_words = [(1, "highlight"), (2, "numbering"), (3, "S&P 500")]
|
| 176 |
+
sentences = ["This is a test to highlight words.", "Numbering is important for clarity.", "The S&P 500 index rose 2% today."]
|
| 177 |
+
|
| 178 |
+
# Test highlight_common_words
|
| 179 |
+
highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
|
| 180 |
+
print(highlighted_html)
|
| 181 |
+
|
| 182 |
+
# Test highlight_common_words_dict
|
| 183 |
+
sentences_with_scores = {"Highlight words in this text.": 0.95, "Number sentences for clarity.": 0.8, "The S&P 500 index is a market benchmark.": 0.88}
|
| 184 |
+
highlighted_html_dict = highlight_common_words_dict(common_words, sentences_with_scores, "Test Dict Highlighting")
|
| 185 |
+
print(highlighted_html_dict)
|
renderers/plot_3d.py
ADDED
@@ -0,0 +1,126 @@
"""
This file contains the code to plot a 3d tree
"""
import numpy as np
import plotly.graph_objects as go
from scipy.interpolate import griddata

def gen_three_D_plot(detectability_val, distortion_val, euclidean_val):
    """
    Generates a 3D surface plot showing the relationship between detectability, distortion,
    and Euclidean distance, with a focus on highlighting the "sweet spot" based on a composite score.

    The function takes three sets of values: detectability, distortion, and Euclidean distance,
    normalizes them to a [0, 1] range, and computes a composite score that combines these three metrics.
    The "sweet spot" is the point where the composite score is maximized. This sweet spot is plotted
    as a red marker on the 3D surface plot.

    The function then uses a grid interpolation method (`griddata`) to generate a smooth surface
    for the Euclidean distance over the detectability and distortion values. The result is a surface plot
    where the contours represent different Euclidean distances.

    Args:
        detectability_val (list or array): A list or array of detectability scores.
        distortion_val (list or array): A list or array of distortion scores.
        euclidean_val (list or array): A list or array of Euclidean distances.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure object representing the 3D surface plot,
        with contour lines and a marker for the sweet spot.

    Raises:
        ValueError: If `griddata` fails to generate a valid interpolation, which could happen if the
        input data does not allow for a proper interpolation.

    Example:
        # Example of usage:
        detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
        distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
        euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]

        fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)
        fig.show()  # Displays the plot in a web browser

    Notes:
        - The composite score is calculated as:
          `composite_score = norm_detectability - (norm_distortion + norm_euclidean)`,
          where the goal is to maximize detectability and minimize distortion and Euclidean distance.
        - The `griddata` call below uses nearest-neighbour interpolation to build the surface for the plot.
        - The function uses the "Plasma" colorscale for the surface plot, which provides a perceptually uniform color scheme.
    """

    detectability = np.array(detectability_val)
    distortion = np.array(distortion_val)
    euclidean = np.array(euclidean_val)

    # Normalize the values to range [0, 1]
    norm_detectability = (detectability - min(detectability)) / (max(detectability) - min(detectability))
    norm_distortion = (distortion - min(distortion)) / (max(distortion) - min(distortion))
    norm_euclidean = (euclidean - min(euclidean)) / (max(euclidean) - min(euclidean))

    # Composite score: maximize detectability, minimize distortion and Euclidean distance
    composite_score = norm_detectability - (norm_distortion + norm_euclidean)

    # Find the index of the maximum score (sweet spot)
    sweet_spot_index = np.argmax(composite_score)

    # Sweet spot values
    sweet_spot_detectability = detectability[sweet_spot_index]
    sweet_spot_distortion = distortion[sweet_spot_index]
    sweet_spot_euclidean = euclidean[sweet_spot_index]

    # Create a meshgrid from the data
    x_grid, y_grid = np.meshgrid(np.linspace(min(detectability), max(detectability), 30),
                                 np.linspace(min(distortion), max(distortion), 30))

    # Interpolate z values (Euclidean distances) to fit the grid using 'nearest' method
    z_grid = griddata((detectability, distortion), euclidean, (x_grid, y_grid), method='nearest')

    if z_grid is None:
        raise ValueError("griddata could not generate a valid interpolation. Check your input data.")

    # Create the 3D contour plot with the Plasma color scale
    fig = go.Figure(data=go.Surface(
        z=z_grid,
        x=x_grid,
        y=y_grid,
        contours={
            "z": {"show": True, "start": min(euclidean), "end": max(euclidean), "size": 0.1, "usecolormap": True}
        },
        colorscale='Plasma'
    ))

    # Add a marker for the sweet spot
    fig.add_trace(go.Scatter3d(
        x=[sweet_spot_detectability],
        y=[sweet_spot_distortion],
        z=[sweet_spot_euclidean],
        mode='markers+text',
        marker=dict(size=10, color='red', symbol='circle'),
        text=["Sweet Spot"],
        textposition="top center"
    ))

    # Set axis labels
    fig.update_layout(
        scene=dict(
            xaxis_title='Detectability Score',
            yaxis_title='Distortion Score',
            zaxis_title='Euclidean Distance'
        ),
        margin=dict(l=0, r=0, b=0, t=0)
    )

    return fig

if __name__ == "__main__":
    # Example input data
    detectability_vals = [0.1, 0.3, 0.5, 0.7, 0.9]
    distortion_vals = [0.2, 0.4, 0.6, 0.8, 1.0]
    euclidean_vals = [0.5, 0.3, 0.2, 0.4, 0.6]

    # Call the function with example data
    fig = gen_three_D_plot(detectability_vals, distortion_vals, euclidean_vals)

    # Show the plot
    fig.show()
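To make the sweet-spot selection concrete, here is a small standalone check (a sketch using the same example inputs as the `__main__` block above, not part of the uploaded file) of how the composite score picks the marker position; the numbers follow directly from the min-max normalization described in the docstring:

import numpy as np

detectability = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
distortion = np.array([0.2, 0.4, 0.6, 0.8, 1.0])
euclidean = np.array([0.5, 0.3, 0.2, 0.4, 0.6])

def minmax(v):
    # Min-max normalization to [0, 1], as in gen_three_D_plot
    return (v - v.min()) / (v.max() - v.min())

composite = minmax(detectability) - (minmax(distortion) + minmax(euclidean))
# composite == [-0.75, -0.25, 0.0, -0.5, -1.0], so index 2 is the maximum
idx = int(np.argmax(composite))
print(idx, detectability[idx], distortion[idx], euclidean[idx])  # 2 0.5 0.6 0.2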
renderers/tree.py
ADDED
@@ -0,0 +1,490 @@
| 1 |
+
import plotly.graph_objects as go
|
| 2 |
+
import textwrap
|
| 3 |
+
import re
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
|
| 6 |
+
def generate_subplot1(paraphrased_sentence, masked_sentences, strategies, highlight_info, common_grams):
|
| 7 |
+
"""
|
| 8 |
+
Generates a subplot visualizing paraphrased and masked sentences in a tree structure.
|
| 9 |
+
Highlights common words with specific colors and applies Longest Common Subsequence (LCS) numbering.
|
| 10 |
+
|
| 11 |
+
Args:
|
| 12 |
+
paraphrased_sentence (str): The paraphrased sentence to be visualized.
|
| 13 |
+
masked_sentences (list of str): A list of masked sentences to be visualized.
|
| 14 |
+
strategies (list of str, optional): List of strategies used for each masked sentence.
|
| 15 |
+
highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
|
| 16 |
+
common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.
|
| 17 |
+
|
| 18 |
+
Returns:
|
| 19 |
+
plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
|
| 20 |
+
"""
|
| 21 |
+
# Combine nodes into one list with appropriate labels
|
| 22 |
+
if isinstance(masked_sentences, str):
|
| 23 |
+
masked_sentences = [masked_sentences]
|
| 24 |
+
nodes = [paraphrased_sentence] + masked_sentences
|
| 25 |
+
nodes[0] += ' L0' # Paraphrased sentence is level 0
|
| 26 |
+
if len(nodes) < 2:
|
| 27 |
+
print("[ERROR] Insufficient nodes for visualization")
|
| 28 |
+
return go.Figure()
|
| 29 |
+
|
| 30 |
+
for i in range(1, len(nodes)):
|
| 31 |
+
nodes[i] += ' L1' # masked sentences are level 1
|
| 32 |
+
|
| 33 |
+
def apply_lcs_numbering(sentence, common_grams):
|
| 34 |
+
"""
|
| 35 |
+
Applies LCS numbering to the sentence based on the common_grams.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
sentence (str): The sentence to which the LCS numbering should be applied.
|
| 39 |
+
common_grams (list of tuples): A list of common grams to be replaced with LCS numbers.
|
| 40 |
+
|
| 41 |
+
Returns:
|
| 42 |
+
str: The sentence with LCS numbering applied.
|
| 43 |
+
"""
|
| 44 |
+
for idx, lcs in common_grams:
|
| 45 |
+
sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
|
| 46 |
+
return sentence
|
| 47 |
+
|
| 48 |
+
# Apply LCS numbering
|
| 49 |
+
nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def highlight_words(sentence, color_map):
|
| 53 |
+
"""
|
| 54 |
+
Highlights words in the sentence based on the color_map.
|
| 55 |
+
|
| 56 |
+
Args:
|
| 57 |
+
sentence (str): The sentence where the words will be highlighted.
|
| 58 |
+
color_map (dict): A dictionary mapping words to their colors.
|
| 59 |
+
|
| 60 |
+
Returns:
|
| 61 |
+
str: The sentence with highlighted words.
|
| 62 |
+
"""
|
| 63 |
+
for word, color in color_map.items():
|
| 64 |
+
sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
|
| 65 |
+
return sentence
|
| 66 |
+
|
| 67 |
+
# Clean and wrap nodes, and highlight specified words globally
|
| 68 |
+
cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
|
| 69 |
+
global_color_map = dict(highlight_info)
|
| 70 |
+
highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
|
| 71 |
+
wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=55)) for node in highlighted_nodes]
|
| 72 |
+
|
| 73 |
+
def get_levels_and_edges(nodes, strategies=None):
|
| 74 |
+
"""
|
| 75 |
+
Determines tree levels and creates edges dynamically.
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
nodes (list of str): The nodes representing the sentences.
|
| 79 |
+
strategies (list of str, optional): The strategies used for each edge.
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
tuple: A tuple containing two dictionaries:
|
| 83 |
+
- levels: A dictionary mapping node indices to their levels.
|
| 84 |
+
- edges: A list of edges where each edge is represented by a tuple of node indices.
|
| 85 |
+
"""
|
| 86 |
+
levels = {}
|
| 87 |
+
edges = []
|
| 88 |
+
for i, node in enumerate(nodes):
|
| 89 |
+
level = int(node.split()[-1][1])
|
| 90 |
+
levels[i] = level
|
| 91 |
+
|
| 92 |
+
# Add edges from L0 to all L1 nodes
|
| 93 |
+
root_node = next((i for i, level in levels.items() if level == 0), 0)
|
| 94 |
+
for i, level in levels.items():
|
| 95 |
+
if level == 1:
|
| 96 |
+
edges.append((root_node, i))
|
| 97 |
+
|
| 98 |
+
return levels, edges
|
| 99 |
+
|
| 100 |
+
# Get levels and dynamic edges
|
| 101 |
+
levels, edges = get_levels_and_edges(nodes, strategies)
|
| 102 |
+
max_level = max(levels.values(), default=0)
|
| 103 |
+
|
| 104 |
+
# Calculate positions
|
| 105 |
+
positions = {}
|
| 106 |
+
level_heights = defaultdict(int)
|
| 107 |
+
for node, level in levels.items():
|
| 108 |
+
level_heights[level] += 1
|
| 109 |
+
|
| 110 |
+
y_offsets = {level: - (height - 1) / 2 for level, height in level_heights.items()}
|
| 111 |
+
x_gap = 2
|
| 112 |
+
l1_y_gap = 10
|
| 113 |
+
|
| 114 |
+
for node, level in levels.items():
|
| 115 |
+
if level == 1:
|
| 116 |
+
positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
| 117 |
+
else:
|
| 118 |
+
positions[node] = (-level * x_gap, y_offsets[level] * l1_y_gap)
|
| 119 |
+
y_offsets[level] += 1
|
| 120 |
+
|
| 121 |
+
def color_highlighted_words(node, color_map):
|
| 122 |
+
"""
|
| 123 |
+
Colors the highlighted words in the node text.
|
| 124 |
+
|
| 125 |
+
Args:
|
| 126 |
+
node (str): The node text to be highlighted.
|
| 127 |
+
color_map (dict): A dictionary mapping words to their colors.
|
| 128 |
+
|
| 129 |
+
Returns:
|
| 130 |
+
str: The node text with highlighted words.
|
| 131 |
+
"""
|
| 132 |
+
parts = re.split(r'(\{\{.*?\}\})', node)
|
| 133 |
+
colored_parts = []
|
| 134 |
+
for part in parts:
|
| 135 |
+
match = re.match(r'\{\{(.*?)\}\}', part)
|
| 136 |
+
if match:
|
| 137 |
+
word = match.group(1)
|
| 138 |
+
color = color_map.get(word, 'black')
|
| 139 |
+
colored_parts.append(f"<span style='color: {color};'>{word}</span>")
|
| 140 |
+
else:
|
| 141 |
+
colored_parts.append(part)
|
| 142 |
+
return ''.join(colored_parts)
|
| 143 |
+
|
| 144 |
+
# Define the text for each edge
|
| 145 |
+
default_edge_texts = [
|
| 146 |
+
"Highest Entropy Masking", "Pseudo-random Masking", "Random Masking",
|
| 147 |
+
"Greedy Sampling", "Temperature Sampling", "Exponential Minimum Sampling",
|
| 148 |
+
"Inverse Transform Sampling", "Greedy Sampling", "Temperature Sampling",
|
| 149 |
+
"Exponential Minimum Sampling", "Inverse Transform Sampling", "Greedy Sampling",
|
| 150 |
+
"Temperature Sampling", "Exponential Minimum Sampling", "Inverse Transform Sampling"
|
| 151 |
+
]
|
| 152 |
+
|
| 153 |
+
if len(nodes) < 2:
|
| 154 |
+
print("[ERROR] Insufficient nodes for visualization")
|
| 155 |
+
return go.Figure()
|
| 156 |
+
|
| 157 |
+
# Create figure
|
| 158 |
+
fig1 = go.Figure()
|
| 159 |
+
|
| 160 |
+
# Add nodes to the figure
|
| 161 |
+
for i, node in enumerate(wrapped_nodes):
|
| 162 |
+
colored_node = color_highlighted_words(node, global_color_map)
|
| 163 |
+
x, y = positions[i]
|
| 164 |
+
fig1.add_trace(go.Scatter(
|
| 165 |
+
x=[-x], # Reflect the x coordinate
|
| 166 |
+
y=[y],
|
| 167 |
+
mode='markers',
|
| 168 |
+
marker=dict(size=20, color='blue', line=dict(color='black', width=2)),
|
| 169 |
+
hoverinfo='none'
|
| 170 |
+
))
|
| 171 |
+
fig1.add_annotation(
|
| 172 |
+
x=-x, # Reflect the x coordinate
|
| 173 |
+
y=y,
|
| 174 |
+
text=colored_node,
|
| 175 |
+
showarrow=False,
|
| 176 |
+
xshift=15,
|
| 177 |
+
align="center",
|
| 178 |
+
font=dict(size=12),
|
| 179 |
+
bordercolor='black',
|
| 180 |
+
borderwidth=2,
|
| 181 |
+
borderpad=4,
|
| 182 |
+
bgcolor='white',
|
| 183 |
+
width=400,
|
| 184 |
+
height=100
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# Add edges and text above each edge
|
| 188 |
+
for i, edge in enumerate(edges):
|
| 189 |
+
x0, y0 = positions[edge[0]]
|
| 190 |
+
x1, y1 = positions[edge[1]]
|
| 191 |
+
|
| 192 |
+
# Use strategy if available, otherwise use default edge text
|
| 193 |
+
if strategies and i < len(strategies):
|
| 194 |
+
edge_text = strategies[i]
|
| 195 |
+
else:
|
| 196 |
+
edge_text = default_edge_texts[i % len(default_edge_texts)]
|
| 197 |
+
|
| 198 |
+
fig1.add_trace(go.Scatter(
|
| 199 |
+
x=[-x0, -x1], # Reflect the x coordinates
|
| 200 |
+
y=[y0, y1],
|
| 201 |
+
mode='lines',
|
| 202 |
+
line=dict(color='black', width=1)
|
| 203 |
+
))
|
| 204 |
+
|
| 205 |
+
# Calculate the midpoint of the edge
|
| 206 |
+
mid_x = (-x0 + -x1) / 2
|
| 207 |
+
mid_y = (y0 + y1) / 2
|
| 208 |
+
|
| 209 |
+
# Adjust y position to shift text upwards
|
| 210 |
+
text_y_position = mid_y + 0.8 # Increase this value to shift the text further upwards
|
| 211 |
+
|
| 212 |
+
# Add text annotation above the edge
|
| 213 |
+
fig1.add_annotation(
|
| 214 |
+
x=mid_x,
|
| 215 |
+
y=text_y_position,
|
| 216 |
+
text=edge_text, # Use the text specific to this edge
|
| 217 |
+
showarrow=False,
|
| 218 |
+
font=dict(size=12),
|
| 219 |
+
align="center"
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
fig1.update_layout(
|
| 223 |
+
showlegend=False,
|
| 224 |
+
margin=dict(t=50, b=50, l=50, r=50),
|
| 225 |
+
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
| 226 |
+
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
|
| 227 |
+
width=800 + max_level * 200, # Adjusted width to accommodate more levels
|
| 228 |
+
height=300 + len(nodes) * 100, # Adjusted height to accommodate more levels
|
| 229 |
+
plot_bgcolor='rgba(240,240,240,0.2)',
|
| 230 |
+
paper_bgcolor='white'
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
return fig1

def generate_subplot2(masked_sentences, sampled_sentences, highlight_info, common_grams):
    """
    Generates a subplot visualizing multiple masked sentences and their sampled variants in a tree structure.
    Each masked sentence will have multiple sampled sentences derived from it using different sampling techniques.

    Args:
        masked_sentences (list of str): A list of masked sentences to be visualized as root nodes.
        sampled_sentences (list of str): A list of sampled sentences derived from masked sentences.
        highlight_info (list of tuples): A list of tuples where each tuple contains a word and its associated color for highlighting.
        common_grams (list of tuples): A list of tuples containing an index and a common word or phrase for LCS numbering.

    Returns:
        plotly.graph_objects.Figure: A Plotly figure representing the tree structure with highlighted words and labeled edges.
    """
    # Define sampling techniques
    sampling_techniques = [
        "Greedy Sampling",
        "Temperature Sampling",
        "Exponential Minimum Sampling",
        "Inverse Transform Sampling"
    ]

    # Calculate total number of nodes
    num_masked = len(masked_sentences)
    num_sampled_per_masked = len(sampling_techniques)
    total_nodes = num_masked + (num_masked * num_sampled_per_masked)

    # Combine all sentences into nodes list with appropriate labels
    nodes = []
    # Level 0: masked sentences (root nodes)
    nodes.extend([s + ' L0' for s in masked_sentences])

    # Level 1: sampled sentences (branch nodes)
    # For each masked sentence, we should have samples from each technique
    sampled_nodes = []

    # Validate if we have the expected number of sampled sentences
    expected_sampled_count = num_masked * num_sampled_per_masked
    if len(sampled_sentences) < expected_sampled_count:
        # If insufficient samples provided, pad with placeholder sentences
        print(f"Warning: Expected {expected_sampled_count} sampled sentences, but got {len(sampled_sentences)}")
        while len(sampled_sentences) < expected_sampled_count:
            sampled_sentences.append(f"Placeholder sampled sentence {len(sampled_sentences) + 1}")

    # Add all sampled sentences with level information
    for s in sampled_sentences[:expected_sampled_count]:
        sampled_nodes.append(s + ' L1')

    nodes.extend(sampled_nodes)

    def apply_lcs_numbering(sentence, common_grams):
        """
        Applies LCS numbering to the sentence based on the common_grams.
        """
        for idx, lcs in common_grams:
            sentence = re.sub(rf"\b{lcs}\b", f"({idx}){lcs}", sentence)
        return sentence

    # Apply LCS numbering
    nodes = [apply_lcs_numbering(node, common_grams) for node in nodes]

    def highlight_words(sentence, color_map):
        """
        Highlights words in the sentence based on the color_map.
        """
        for word, color in color_map.items():
            sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
        return sentence

    # Helper function to color highlighted words
    def color_highlighted_words(node, color_map):
        """
        Colors the highlighted words in the node text.
        """
        parts = re.split(r'(\{\{.*?\}\})', node)
        colored_parts = []
        for part in parts:
            match = re.match(r'\{\{(.*?)\}\}', part)
            if match:
                word = match.group(1)
                color = color_map.get(word, 'black')
                colored_parts.append(f"<span style='color: {color};'>{word}</span>")
            else:
                colored_parts.append(part)
        return ''.join(colored_parts)
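To see what the two helpers above produce, here is a self-contained round trip: highlight_words wraps mapped words in {{...}} placeholders and color_highlighted_words turns each placeholder into HTML span markup. The color map below is an invented example.

```python
import re

color_map = {"quick": "red", "fox": "blue"}  # illustrative colors

def highlight_words(sentence, color_map):
    # Wrap every mapped word in {{...}} placeholders (same logic as the helper above).
    for word, color in color_map.items():
        sentence = re.sub(f"\\b{word}\\b", f"{{{{{word}}}}}", sentence, flags=re.IGNORECASE)
    return sentence

def color_highlighted_words(node, color_map):
    # Replace each {{word}} placeholder with a colored <span>.
    parts = re.split(r'(\{\{.*?\}\})', node)
    out = []
    for part in parts:
        m = re.match(r'\{\{(.*?)\}\}', part)
        if m:
            word = m.group(1)
            out.append(f"<span style='color: {color_map.get(word, 'black')};'>{word}</span>")
        else:
            out.append(part)
    return ''.join(out)

marked = highlight_words("The quick brown fox", color_map)
print(marked)                                   # The {{quick}} brown {{fox}}
print(color_highlighted_words(marked, color_map))
# The <span style='color: red;'>quick</span> brown <span style='color: blue;'>fox</span>
```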

    # Clean nodes, highlight words, and wrap text
    cleaned_nodes = [re.sub(r'\sL[0-9]$', '', node) for node in nodes]
    global_color_map = dict(highlight_info)
    highlighted_nodes = [highlight_words(node, global_color_map) for node in cleaned_nodes]
    wrapped_nodes = ['<br>'.join(textwrap.wrap(node, width=80)) for node in highlighted_nodes]

    # Generate edges based on the tree structure
    def get_levels_and_edges(nodes):
        levels = {}
        edges = []

        # Extract level info from node labels
        for i, node in enumerate(nodes):
            level = int(node.split()[-1][1])
            levels[i] = level

        # Create edges from masked sentences to their sampled variants
        for masked_idx in range(num_masked):
            # For each masked sentence, create edges to its sampled variants
            for technique_idx in range(num_sampled_per_masked):
                sampled_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
                if sampled_idx < len(nodes):
                    edges.append((masked_idx, sampled_idx))

        return levels, edges

    levels, edges = get_levels_and_edges(nodes)

    # Calculate positions with improved spacing
    positions = {}

    # Calculate horizontal spacing for the root nodes (masked sentences)
    root_x_spacing = 0  # All root nodes at x=0
    root_y_spacing = 8.0  # Vertical spacing between root nodes

    # Calculate positions for sampled nodes
    sampled_x = 3  # X position for all sampled nodes

    # Calculate y positions for root nodes (masked sentences)
    root_y_start = -(num_masked - 1) * root_y_spacing / 2
    for i in range(num_masked):
        positions[i] = (root_x_spacing, root_y_start + i * root_y_spacing)

    # Calculate y positions for sampled nodes
    for masked_idx in range(num_masked):
        root_y = positions[masked_idx][1]  # Y position of parent masked sentence

        # Calculate y-spacing for children of this root
        children_y_spacing = 1.5  # Vertical spacing between children of the same root
        children_y_start = root_y - (num_sampled_per_masked - 1) * children_y_spacing / 2

        # Position each child
        for technique_idx in range(num_sampled_per_masked):
            child_idx = num_masked + (masked_idx * num_sampled_per_masked) + technique_idx
            child_y = children_y_start + technique_idx * children_y_spacing
            positions[child_idx] = (sampled_x, child_y)

    # Create figure
    fig2 = go.Figure()

    # Add nodes
    for i, node in enumerate(wrapped_nodes):
        x, y = positions[i]

        # Define node color based on level
        node_color = 'blue' if levels[i] == 0 else 'green'

        # Add the node marker
        fig2.add_trace(go.Scatter(
            x=[x],
            y=[y],
            mode='markers',
            marker=dict(size=20, color=node_color, line=dict(color='black', width=2)),
            hoverinfo='none'
        ))

        # Add node label with highlighting
        colored_node = color_highlighted_words(node, global_color_map)

        fig2.add_annotation(
            x=x,
            y=y,
            text=colored_node,
            showarrow=False,
            xshift=15,
            align="left",
            font=dict(size=12),
            bordercolor='black',
            borderwidth=2,
            borderpad=4,
            bgcolor='white',
            width=450,
            height=100
        )

    # Add edges with labels
    for i, (src, dst) in enumerate(edges):
        x0, y0 = positions[src]
        x1, y1 = positions[dst]

        # Draw the edge
        fig2.add_trace(go.Scatter(
            x=[x0, x1],
            y=[y0, y1],
            mode='lines',
            line=dict(color='black', width=1)
        ))

        # Add sampling technique label
        # Determine which sampling technique this is
        parent_idx = src
        technique_count = sum(1 for k, (s, _) in enumerate(edges) if s == parent_idx and k < i)
        technique_label = sampling_techniques[technique_count % len(sampling_techniques)]

        # Calculate midpoint for the label
        mid_x = (x0 + x1) / 2
        mid_y = (y0 + y1) / 2

        # Add slight offset to avoid overlap
        label_offset = 0.1

        fig2.add_annotation(
            x=mid_x,
            y=mid_y + label_offset,
            text=technique_label,
            showarrow=False,
            font=dict(size=8),
            align="center"
        )

    # Update layout
    fig2.update_layout(
        showlegend=False,
        margin=dict(t=20, b=20, l=20, r=20),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        width=1200,  # Adjusted width to accommodate more levels
        height=2000,  # Adjusted height to accommodate more levels
        plot_bgcolor='rgba(240,240,240,0.2)',
        paper_bgcolor='white'
    )

    return fig2


if __name__ == "__main__":
    paraphrased_sentence = "The quick brown fox jumps over the lazy dog."
    masked_sentences = [
        "A fast brown fox leaps over the lazy dog.",
        "A quick brown fox hops over a lazy dog."
    ]
    highlight_info = [
        ("quick", "red"),
        ("brown", "green"),
        ("fox", "blue"),
        ("lazy", "purple")
    ]
    common_grams = [
        (1, "quick brown fox"),
        (2, "lazy dog")
    ]

    fig1 = generate_subplot1(paraphrased_sentence, masked_sentences, highlight_info, common_grams)
    fig1.show()

    sampled_sentence = ["A fast brown fox jumps over a lazy dog."]

    fig2 = generate_subplot2(masked_sentences, sampled_sentence, highlight_info, common_grams)
    fig2.show()
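The tree in generate_subplot2 is laid out purely by index arithmetic: node i with i < num_masked is a root, and the sampled node for root m and technique t sits at index num_masked + m * num_sampled_per_masked + t. A tiny sketch of that mapping, with made-up counts:

```python
# Index arithmetic behind the generate_subplot2 tree (counts are illustrative).
num_masked = 2
sampling_techniques = ["Greedy Sampling", "Temperature Sampling",
                       "Exponential Minimum Sampling", "Inverse Transform Sampling"]
num_sampled_per_masked = len(sampling_techniques)

for masked_idx in range(num_masked):
    for technique_idx, technique in enumerate(sampling_techniques):
        sampled_idx = num_masked + masked_idx * num_sampled_per_masked + technique_idx
        print(f"root {masked_idx} --{technique}--> node {sampled_idx}")
```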
utils/__init__.py
ADDED
@@ -0,0 +1,5 @@
from utils.watermark import Watermarker
from utils.paraphraser import Paraphraser
from utils.entailment import EntailmentAnalyzer
from utils.sampling import SamplingProcessor
from utils.config import load_config
utils/__pycache__/__init__.cpython-310.pyc ADDED (binary, 404 Bytes)
utils/__pycache__/__init__.cpython-311.pyc ADDED (binary, 509 Bytes)
utils/__pycache__/config.cpython-310.pyc ADDED (binary, 594 Bytes)
utils/__pycache__/config.cpython-311.pyc ADDED (binary, 971 Bytes)
utils/__pycache__/entailment.cpython-310.pyc ADDED (binary, 3.69 kB)
utils/__pycache__/entailment.cpython-311.pyc ADDED (binary, 5.33 kB)
utils/__pycache__/masking_methods.cpython-310.pyc ADDED (binary, 11.1 kB)
utils/__pycache__/masking_methods.cpython-311.pyc ADDED (binary, 35.6 kB)
utils/__pycache__/ngram_index_manager.cpython-311.pyc ADDED (binary, 19.5 kB)
utils/__pycache__/non_melting_point.cpython-310.pyc ADDED (binary, 5.05 kB)
utils/__pycache__/non_melting_point.cpython-311.pyc ADDED (binary, 33.1 kB)
utils/__pycache__/paraphraser.cpython-310.pyc ADDED (binary, 2.85 kB)
utils/__pycache__/paraphraser.cpython-311.pyc ADDED (binary, 4.89 kB)
utils/__pycache__/sampling.cpython-310.pyc ADDED (binary, 5.06 kB)
utils/__pycache__/sampling.cpython-311.pyc ADDED (binary, 9.2 kB)
utils/__pycache__/watermark.cpython-310.pyc ADDED (binary, 11.8 kB)
utils/__pycache__/watermark.cpython-311.pyc ADDED (binary, 20.9 kB)
utils/config.py
ADDED
@@ -0,0 +1,18 @@
"""
This file loads config from config.yaml
"""

import yaml

def load_config(path):
    """
    Function to load config from config.yaml
    """
    try:
        with open(path, "r") as file:
            config = yaml.safe_load(file)
            return config
    except FileNotFoundError:
        raise FileNotFoundError("Config file not found")
    except Exception as e:
        raise e
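A minimal usage sketch for load_config. The relative path below is an assumption (it matches the utils/config.yaml shown next); once loaded, nested sections are plain dictionaries.

```python
from utils.config import load_config

# Assumed relative path from the repository root; adjust if running from elsewhere.
config = load_config("utils/config.yaml")

entailment_cfg = config["PECCAVI_TEXT"]["Entailment"]
print(entailment_cfg["model"])  # ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli
```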
utils/config.yaml
ADDED
@@ -0,0 +1,48 @@
# This is the official config file.
PECCAVI_TEXT:
  Entailment:
    task: "text-classification"
    model: "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

  Masking:
    task: "fill-mask"
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Vocabulary:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # permissible_ratio: 0.5
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"
    permissible_ratio: 1.0

  Sampling:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Metrics:
    EuclideanDistance: "sentence-transformers/all-MiniLM-L6-v2"
    Distortion: "gpt2"

  Detector:
    tokenizer: "bert-base-uncased"
    model: "bert-base-uncased"
    # tokenizer: "bert-large-cased-whole-word-masking"
    # model: "bert-large-cased-whole-word-masking"

  Paraphrase:
    tokenizer: "humarin/chatgpt_paraphraser_on_T5_base"
    model: "humarin/chatgpt_paraphraser_on_T5_base"
    num_beams: 10
    num_beam_groups: 10
    num_return_sequences: 10
    repetition_penalty: 10.0
    diversity_penalty: 3.0
    no_repeat_ngram_size: 2
    temperature: 0.7
    max_length: 64
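The Paraphrase block mirrors keyword arguments of Hugging Face generate() for diverse beam search. The wiring below is only a sketch of how such a config is typically consumed; the actual consumer (utils/paraphraser.py) is not shown in this part of the diff, and the "paraphrase: " prompt prefix follows the model card's usage rather than anything in this file.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

cfg = {  # mirrors the Paraphrase section of config.yaml
    "model": "humarin/chatgpt_paraphraser_on_T5_base",
    "num_beams": 10, "num_beam_groups": 10, "num_return_sequences": 10,
    "repetition_penalty": 10.0, "diversity_penalty": 3.0,
    "no_repeat_ngram_size": 2, "max_length": 64,
}

tokenizer = AutoTokenizer.from_pretrained(cfg["model"])
model = AutoModelForSeq2SeqLM.from_pretrained(cfg["model"])

inputs = tokenizer("paraphrase: The weather is nice today", return_tensors="pt")
# temperature only matters when sampling; diverse beam search below does not sample.
outputs = model.generate(
    **inputs,
    num_beams=cfg["num_beams"],
    num_beam_groups=cfg["num_beam_groups"],
    num_return_sequences=cfg["num_return_sequences"],
    repetition_penalty=cfg["repetition_penalty"],
    diversity_penalty=cfg["diversity_penalty"],
    no_repeat_ngram_size=cfg["no_repeat_ngram_size"],
    max_length=cfg["max_length"],
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```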
utils/entailment.py
ADDED
@@ -0,0 +1,107 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import numpy as np
from transformers import pipeline
from typing import List
from utils.config import load_config


class EntailmentAnalyzer:
    # def __init__(self, config_path: str):
    def __init__(self, config):
        """
        Initialize the EntailmentAnalyzer with the entailment configuration.

        Args:
            config: The 'Entailment' section of the configuration (a dict with 'task' and 'model' keys).
        """
        # self.config = load_config(config_path)['PECCAVI_TEXT']['Entailment']
        self.config = config
        self.entailment_pipeline = pipeline(task=self.config['task'], model=self.config['model'])

    def check_entailment(self, premise: str, hypothesis: str) -> float:
        """
        Check entailment between the premise and hypothesis.

        Args:
            premise: The premise sentence.
            hypothesis: The hypothesis sentence.

        Returns:
            float: The entailment score.
        """
        results = self.entailment_pipeline(f"{premise} [SEP] {hypothesis}", top_k=None)
        entailment_score = next(item['score'] for item in results if item['label'] == 'entailment')
        return entailment_score

    def analyze_entailment(self, original_sentence: str, paraphrased_sentences: List[str], threshold: float) -> tuple:
        """
        Analyze entailment scores for paraphrased sentences. If no selected sentences are found,
        lower the threshold and rerun the analysis.

        Args:
            original_sentence: The original sentence.
            paraphrased_sentences: List of paraphrased sentences.
            threshold: Minimum score to select a sentence.

        Returns:
            tuple: A dictionary of all scores, selected sentences, and discarded sentences.
        """
        all_sentences = {}
        selected_sentences = {}
        discarded_sentences = {}

        # Loop to reduce threshold if no sentences are selected
        while not selected_sentences:
            for paraphrased_sentence in paraphrased_sentences:
                entailment_score = self.check_entailment(original_sentence, paraphrased_sentence)

                all_sentences[paraphrased_sentence] = entailment_score
                if entailment_score >= threshold:
                    selected_sentences[paraphrased_sentence] = entailment_score
                else:
                    discarded_sentences[paraphrased_sentence] = entailment_score

            # If no sentences are selected, lower the threshold
            if not selected_sentences:
                print(f"No selected sentences found. Lowering the threshold by 0.1 (from {threshold} to {threshold - 0.1}).")
                threshold -= 0.1
                if threshold <= 0:
                    print("Threshold has reached 0. No sentences meet the criteria.")
                    break

        return all_sentences, selected_sentences, discarded_sentences


if __name__ == "__main__":
    config_path = os.path.join(os.path.dirname(__file__), '..', 'config', 'config.yaml')

    config_path = '/home/ashhar21137/text_wm/scratch/utils/config/config.yaml'

    config = load_config(config_path)

    entailment_analyzer = EntailmentAnalyzer(config['PECCAVI_TEXT']['Entailment'])

    all_sentences, selected_sentences, discarded_sentences = entailment_analyzer.analyze_entailment(
        "The weather is nice today",
        [
            "The climate is pleasant today",
            "It's a good day weather-wise",
            "Today, the weather is terrible",
            "What a beautiful day it is",
            "The sky is clear and the weather is perfect",
            "It's pouring rain outside today",
            "The weather isn't bad today",
            "A lovely day for outdoor activities"
        ],
        0.7
    )

    print("----------------------- All Sentences -----------------------")
    print(all_sentences)
    print("----------------------- Discarded Sentences -----------------------")
    print(discarded_sentences)
    print("----------------------- Selected Sentences -----------------------")
    print(selected_sentences)
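For reference, check_entailment relies on the text-classification pipeline returning one dict per label when called with top_k=None, and it picks the score for the 'entailment' label. The snippet below only imitates that output shape without downloading the model; the scores are invented.

```python
# Shape of the pipeline output that check_entailment consumes (scores are made up).
results = [
    {"label": "entailment", "score": 0.93},
    {"label": "neutral", "score": 0.05},
    {"label": "contradiction", "score": 0.02},
]
entailment_score = next(item["score"] for item in results if item["label"] == "entailment")
print(entailment_score)  # 0.93
```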
utils/masking_methods.py
ADDED
|
@@ -0,0 +1,583 @@
| 1 |
+
import random
|
| 2 |
+
import torch
|
| 3 |
+
import logging
|
| 4 |
+
import string
|
| 5 |
+
from transformers import BertTokenizer, BertForMaskedLM
|
| 6 |
+
from nltk.corpus import stopwords
|
| 7 |
+
import nltk
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
# Set logging to WARNING for a cleaner terminal.
|
| 11 |
+
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
# Ensure stopwords are downloaded
|
| 15 |
+
try:
|
| 16 |
+
nltk.data.find('corpora/stopwords')
|
| 17 |
+
except LookupError:
|
| 18 |
+
nltk.download('stopwords')
|
| 19 |
+
|
| 20 |
+
def clean_word(word):
|
| 21 |
+
"""More robust cleaning for consistent matching"""
|
| 22 |
+
# Remove possessive 's before other punctuation
|
| 23 |
+
if word.lower().endswith("'s"):
|
| 24 |
+
word = word[:-2]
|
| 25 |
+
return word.lower().strip().translate(str.maketrans('', '', string.punctuation))
|
| 26 |
+
|
| 27 |
+
class MaskingProcessor:
|
| 28 |
+
def __init__(self, tokenizer, model):
|
| 29 |
+
self.tokenizer = tokenizer
|
| 30 |
+
self.model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
|
| 31 |
+
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 32 |
+
self.stop_words = set(stopwords.words('english'))
|
| 33 |
+
tqdm.write(f"[MaskingProcessor] Initialized on device: {self.device}")
|
| 34 |
+
|
| 35 |
+
def is_stopword(self, word):
|
| 36 |
+
"""Check if a word is a stopword, handling punctuation and case"""
|
| 37 |
+
return clean_word(word) in self.stop_words
|
| 38 |
+
|
| 39 |
+
def verify_and_correct_ngram_positions(self, sentence, common_ngrams):
|
| 40 |
+
"""Verify ngram positions match actual words in sentence and correct if needed."""
|
| 41 |
+
words = sentence.split()
|
| 42 |
+
corrected_ngrams = {}
|
| 43 |
+
|
| 44 |
+
for ngram, positions in common_ngrams.items():
|
| 45 |
+
corrected_positions = []
|
| 46 |
+
ngram_words = ngram.split()
|
| 47 |
+
|
| 48 |
+
# Convert ngram words to clean format for matching
|
| 49 |
+
clean_ngram_words = [clean_word(word) for word in ngram_words]
|
| 50 |
+
|
| 51 |
+
# Scan the sentence to find actual occurrences of the ngram
|
| 52 |
+
for i in range(len(words) - len(ngram_words) + 1):
|
| 53 |
+
is_match = True
|
| 54 |
+
for j, ngram_word in enumerate(clean_ngram_words):
|
| 55 |
+
if clean_word(words[i + j]) != ngram_word:
|
| 56 |
+
is_match = False
|
| 57 |
+
break
|
| 58 |
+
|
| 59 |
+
if is_match:
|
| 60 |
+
# Found a matching position, add it
|
| 61 |
+
corrected_positions.append((i, i + len(ngram_words) - 1))
|
| 62 |
+
|
| 63 |
+
if corrected_positions:
|
| 64 |
+
corrected_ngrams[ngram] = corrected_positions
|
| 65 |
+
else:
|
| 66 |
+
# Log the issue and perform a more flexible search
|
| 67 |
+
print(f"Warning: Could not find exact match for '{ngram}' in the sentence.")
|
| 68 |
+
print(f"Attempting flexible matching...")
|
| 69 |
+
|
| 70 |
+
# Try a more flexible approach by looking for individual words
|
| 71 |
+
for i in range(len(words)):
|
| 72 |
+
if clean_word(words[i]) == clean_ngram_words[0]:
|
| 73 |
+
# We found the first word of the ngram
|
| 74 |
+
if len(ngram_words) == 1 or (
|
| 75 |
+
i + len(ngram_words) <= len(words) and
|
| 76 |
+
all(clean_word(words[i+j]).startswith(clean_ngram_words[j]) for j in range(len(ngram_words)))
|
| 77 |
+
):
|
| 78 |
+
corrected_positions.append((i, i + len(ngram_words) - 1))
|
| 79 |
+
|
| 80 |
+
if corrected_positions:
|
| 81 |
+
print(f"Found flexible matches for '{ngram}': {corrected_positions}")
|
| 82 |
+
corrected_ngrams[ngram] = corrected_positions
|
| 83 |
+
else:
|
| 84 |
+
# If still no match, keep original positions as fallback
|
| 85 |
+
print(f"No matches found for '{ngram}'. Keeping original positions.")
|
| 86 |
+
corrected_ngrams[ngram] = positions
|
| 87 |
+
|
| 88 |
+
# Log changes
|
| 89 |
+
if corrected_ngrams != common_ngrams:
|
| 90 |
+
print(f"Original ngram positions: {common_ngrams}")
|
| 91 |
+
print(f"Corrected ngram positions: {corrected_ngrams}")
|
| 92 |
+
|
| 93 |
+
return corrected_ngrams
|
| 94 |
+
def in_any_ngram(self, idx, ngram_positions):
|
| 95 |
+
"""Check if an original sentence index is part of any n-gram span"""
|
| 96 |
+
return any(start <= idx <= end for start, end in ngram_positions)
|
| 97 |
+
def create_fallback_mask(self, sentence, ngrams):
|
| 98 |
+
"""Create a fallback mask when normal strategies fail."""
|
| 99 |
+
try:
|
| 100 |
+
words = sentence.split()
|
| 101 |
+
if not words:
|
| 102 |
+
return None
|
| 103 |
+
|
| 104 |
+
# Find any non-stopword that isn't in an ngram
|
| 105 |
+
ngram_positions = []
|
| 106 |
+
for positions in ngrams.values():
|
| 107 |
+
for start, end in positions:
|
| 108 |
+
ngram_positions.append((start, end))
|
| 109 |
+
ngram_positions.sort()
|
| 110 |
+
|
| 111 |
+
# Find first eligible word
|
| 112 |
+
for idx, word in enumerate(words):
|
| 113 |
+
if not self.is_stopword(word) and not self.in_any_ngram(idx, ngram_positions):
|
| 114 |
+
masked_words = words.copy()
|
| 115 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 116 |
+
tqdm.write(f"[INFO] Fallback mask created at position {idx}: '{word}'")
|
| 117 |
+
return " ".join(masked_words), [idx]
|
| 118 |
+
|
| 119 |
+
# If no eligible word found, just mask the first non-stop word
|
| 120 |
+
for idx, word in enumerate(words):
|
| 121 |
+
if not self.is_stopword(word):
|
| 122 |
+
masked_words = words.copy()
|
| 123 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 124 |
+
tqdm.write(f"[INFO] Last resort fallback mask created at position {idx}: '{word}'")
|
| 125 |
+
return " ".join(masked_words), [idx]
|
| 126 |
+
|
| 127 |
+
# If still nothing, mask the first word
|
| 128 |
+
if words:
|
| 129 |
+
masked_words = words.copy()
|
| 130 |
+
masked_words[0] = self.tokenizer.mask_token
|
| 131 |
+
return " ".join(masked_words), [0]
|
| 132 |
+
|
| 133 |
+
return None
|
| 134 |
+
except Exception as e:
|
| 135 |
+
tqdm.write(f"[ERROR] Error creating fallback mask: {e}")
|
| 136 |
+
return None
|
| 137 |
+
|
| 138 |
+
def mask_sentence_random(self, sentence, common_ngrams):
|
| 139 |
+
"""Mask random non-stopwords that are not part of common ngrams with controlled positioning."""
|
| 140 |
+
common_ngrams = self.verify_and_correct_ngram_positions(sentence, common_ngrams)
|
| 141 |
+
tqdm.write(f"[MaskingProcessor] Masking (random) sentence: {sentence}")
|
| 142 |
+
original_words = sentence.split()
|
| 143 |
+
|
| 144 |
+
# Handle punctuation
|
| 145 |
+
has_punctuation = False
|
| 146 |
+
punctuation = ''
|
| 147 |
+
if original_words and original_words[-1][-1] in ['.', ',', '!', '?', ';', ':', '"', "'"]:
|
| 148 |
+
has_punctuation = True
|
| 149 |
+
punctuation = original_words[-1][-1]
|
| 150 |
+
original_words[-1] = original_words[-1][:-1]
|
| 151 |
+
if not original_words[-1]: # If the word was just punctuation
|
| 152 |
+
original_words.pop()
|
| 153 |
+
|
| 154 |
+
# Get flattened ngram positions
|
| 155 |
+
ngram_positions = []
|
| 156 |
+
for positions in common_ngrams.values():
|
| 157 |
+
for start, end in positions:
|
| 158 |
+
ngram_positions.append((start, end))
|
| 159 |
+
ngram_positions.sort()
|
| 160 |
+
|
| 161 |
+
# Find all candidate indices (non-stopwords not in ngrams)
|
| 162 |
+
candidate_indices = []
|
| 163 |
+
for idx, word in enumerate(original_words):
|
| 164 |
+
if not self.is_stopword(word) and not self.in_any_ngram(idx, ngram_positions):
|
| 165 |
+
candidate_indices.append(idx)
|
| 166 |
+
|
| 167 |
+
# Debug print candidate words
|
| 168 |
+
print("Candidate words for masking:")
|
| 169 |
+
for idx in candidate_indices:
|
| 170 |
+
print(f" Position {idx}: '{original_words[idx]}'")
|
| 171 |
+
|
| 172 |
+
selected_indices = []
|
| 173 |
+
if ngram_positions:
|
| 174 |
+
# Before first ngram
|
| 175 |
+
before_first = [idx for idx in candidate_indices if idx < ngram_positions[0][0]]
|
| 176 |
+
if before_first:
|
| 177 |
+
num_to_select = min(1, len(before_first)) # Select 1 word
|
| 178 |
+
if num_to_select > 0:
|
| 179 |
+
selected = random.sample(before_first, num_to_select)
|
| 180 |
+
selected_indices.extend(selected)
|
| 181 |
+
|
| 182 |
+
# Between ngrams
|
| 183 |
+
for i in range(len(ngram_positions) - 1):
|
| 184 |
+
between = [idx for idx in candidate_indices
|
| 185 |
+
if ngram_positions[i][1] < idx < ngram_positions[i+1][0]]
|
| 186 |
+
if between:
|
| 187 |
+
num_to_select = min(2, len(between)) # Select between 1-2 words
|
| 188 |
+
if num_to_select > 0:
|
| 189 |
+
selected = random.sample(between, num_to_select)
|
| 190 |
+
selected_indices.extend(selected)
|
| 191 |
+
|
| 192 |
+
# After last ngram
|
| 193 |
+
after_last = [idx for idx in candidate_indices if idx > ngram_positions[-1][1]]
|
| 194 |
+
if after_last:
|
| 195 |
+
num_to_select = min(1, len(after_last)) # Select 1 word
|
| 196 |
+
if num_to_select > 0:
|
| 197 |
+
selected = random.sample(after_last, num_to_select)
|
| 198 |
+
selected_indices.extend(selected)
|
| 199 |
+
else:
|
| 200 |
+
# If no ngrams, pick up to 6 random candidates
|
| 201 |
+
if candidate_indices:
|
| 202 |
+
selected_indices = random.sample(candidate_indices,
|
| 203 |
+
min(6, len(candidate_indices)))
|
| 204 |
+
|
| 205 |
+
masked_words = original_words.copy()
|
| 206 |
+
for idx in selected_indices:
|
| 207 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 208 |
+
|
| 209 |
+
if has_punctuation:
|
| 210 |
+
masked_words.append(punctuation)
|
| 211 |
+
|
| 212 |
+
# Debug prints
|
| 213 |
+
print("Original sentence:", sentence)
|
| 214 |
+
print("Common ngrams:", common_ngrams)
|
| 215 |
+
print("Common ngram positions:", ngram_positions)
|
| 216 |
+
print("Candidate indices for masking:", candidate_indices)
|
| 217 |
+
print("Selected for masking:", selected_indices)
|
| 218 |
+
print("Masked sentence:", " ".join(masked_words))
|
| 219 |
+
|
| 220 |
+
return " ".join(masked_words), selected_indices
|
| 221 |
+
|
| 222 |
+
def mask_sentence_pseudorandom(self, sentence, common_ngrams):
|
| 223 |
+
"""Mask specific non-stopwords based on their position relative to ngrams."""
|
| 224 |
+
common_ngrams = self.verify_and_correct_ngram_positions(sentence, common_ngrams)
|
| 225 |
+
tqdm.write(f"[MaskingProcessor] Masking (pseudorandom) sentence: {sentence}")
|
| 226 |
+
random.seed(3) # Fixed seed for pseudorandom behavior
|
| 227 |
+
original_words = sentence.split()
|
| 228 |
+
|
| 229 |
+
# Handle punctuation
|
| 230 |
+
has_punctuation = False
|
| 231 |
+
punctuation = ''
|
| 232 |
+
if original_words and original_words[-1][-1] in ['.', ',', '!', '?', ';', ':', '"', "'"]:
|
| 233 |
+
has_punctuation = True
|
| 234 |
+
punctuation = original_words[-1][-1]
|
| 235 |
+
original_words[-1] = original_words[-1][:-1]
|
| 236 |
+
if not original_words[-1]: # If the word was just punctuation
|
| 237 |
+
original_words.pop()
|
| 238 |
+
|
| 239 |
+
# Get flattened ngram positions
|
| 240 |
+
ngram_positions = []
|
| 241 |
+
for positions in common_ngrams.values():
|
| 242 |
+
for start, end in positions:
|
| 243 |
+
ngram_positions.append((start, end))
|
| 244 |
+
ngram_positions.sort()
|
| 245 |
+
|
| 246 |
+
# Find all candidate indices (non-stopwords not in ngrams)
|
| 247 |
+
candidate_indices = []
|
| 248 |
+
for idx, word in enumerate(original_words):
|
| 249 |
+
if not self.is_stopword(word) and not self.in_any_ngram(idx, ngram_positions):
|
| 250 |
+
candidate_indices.append(idx)
|
| 251 |
+
|
| 252 |
+
# Debug print candidate words
|
| 253 |
+
print("Candidate words for masking:")
|
| 254 |
+
for idx in candidate_indices:
|
| 255 |
+
print(f" Position {idx}: '{original_words[idx]}'")
|
| 256 |
+
|
| 257 |
+
# PSEUDORANDOM SPECIFIC LOGIC:
|
| 258 |
+
selected_indices = []
|
| 259 |
+
if ngram_positions:
|
| 260 |
+
# Before first ngram
|
| 261 |
+
before_first = [idx for idx in candidate_indices if idx < ngram_positions[0][0]]
|
| 262 |
+
if before_first:
|
| 263 |
+
num_to_select = min(1, len(before_first)) # Select 1 word
|
| 264 |
+
if num_to_select > 0:
|
| 265 |
+
selected = random.sample(before_first, num_to_select)
|
| 266 |
+
selected_indices.extend(selected)
|
| 267 |
+
|
| 268 |
+
# Between ngrams
|
| 269 |
+
for i in range(len(ngram_positions) - 1):
|
| 270 |
+
between = [idx for idx in candidate_indices
|
| 271 |
+
if ngram_positions[i][1] < idx < ngram_positions[i+1][0]]
|
| 272 |
+
if between:
|
| 273 |
+
num_to_select = min(2, len(between)) # Select between 1-2 words
|
| 274 |
+
if num_to_select > 0:
|
| 275 |
+
selected = random.sample(between, num_to_select)
|
| 276 |
+
selected_indices.extend(selected)
|
| 277 |
+
|
| 278 |
+
# After last ngram
|
| 279 |
+
after_last = [idx for idx in candidate_indices if idx > ngram_positions[-1][1]]
|
| 280 |
+
if after_last:
|
| 281 |
+
num_to_select = min(1, len(after_last)) # Select 1 word
|
| 282 |
+
if num_to_select > 0:
|
| 283 |
+
selected = random.sample(after_last, num_to_select)
|
| 284 |
+
selected_indices.extend(selected)
|
| 285 |
+
else:
|
| 286 |
+
# If no ngrams, pick up to 6 random candidates
|
| 287 |
+
if candidate_indices:
|
| 288 |
+
selected_indices = random.sample(candidate_indices,
|
| 289 |
+
min(6, len(candidate_indices)))
|
| 290 |
+
|
| 291 |
+
masked_words = original_words.copy()
|
| 292 |
+
for idx in selected_indices:
|
| 293 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 294 |
+
|
| 295 |
+
if has_punctuation:
|
| 296 |
+
masked_words.append(punctuation)
|
| 297 |
+
|
| 298 |
+
# Debug prints
|
| 299 |
+
print("Original sentence:", sentence)
|
| 300 |
+
print("Common ngrams:", common_ngrams)
|
| 301 |
+
print("Common ngram positions:", ngram_positions)
|
| 302 |
+
print("Candidate indices for masking:", candidate_indices)
|
| 303 |
+
print("Selected for masking:", selected_indices)
|
| 304 |
+
print("Masked sentence:", " ".join(masked_words))
|
| 305 |
+
|
| 306 |
+
return " ".join(masked_words), selected_indices
|
| 307 |
+
|
| 308 |
+
def mask_sentence_entropy(self, sentence, common_ngrams):
|
| 309 |
+
"""Mask words with highest entropy that are not part of common ngrams."""
|
| 310 |
+
common_ngrams = self.verify_and_correct_ngram_positions(sentence, common_ngrams)
|
| 311 |
+
tqdm.write(f"[MaskingProcessor] Masking (entropy) sentence: {sentence}")
|
| 312 |
+
original_words = sentence.split()
|
| 313 |
+
|
| 314 |
+
# Handle punctuation
|
| 315 |
+
has_punctuation = False
|
| 316 |
+
punctuation = ''
|
| 317 |
+
if original_words and original_words[-1][-1] in ['.', ',', '!', '?', ';', ':', '"', "'"]:
|
| 318 |
+
has_punctuation = True
|
| 319 |
+
punctuation = original_words[-1][-1]
|
| 320 |
+
original_words[-1] = original_words[-1][:-1]
|
| 321 |
+
if not original_words[-1]: # If the word was just punctuation
|
| 322 |
+
original_words.pop()
|
| 323 |
+
|
| 324 |
+
# Get flattened ngram positions
|
| 325 |
+
ngram_positions = []
|
| 326 |
+
for positions in common_ngrams.values():
|
| 327 |
+
for start, end in positions:
|
| 328 |
+
ngram_positions.append((start, end))
|
| 329 |
+
ngram_positions.sort()
|
| 330 |
+
|
| 331 |
+
# Find all candidate indices (non-stopwords not in ngrams)
|
| 332 |
+
candidate_indices = []
|
| 333 |
+
for idx, word in enumerate(original_words):
|
| 334 |
+
if not self.is_stopword(word) and not self.in_any_ngram(idx, ngram_positions):
|
| 335 |
+
candidate_indices.append(idx)
|
| 336 |
+
|
| 337 |
+
# Debug print candidate words
|
| 338 |
+
print("Candidate words for masking:")
|
| 339 |
+
for idx in candidate_indices:
|
| 340 |
+
print(f" Position {idx}: '{original_words[idx]}'")
|
| 341 |
+
|
| 342 |
+
# ENTROPY SPECIFIC LOGIC:
|
| 343 |
+
# Calculate entropy for each candidate word
|
| 344 |
+
selected_indices = []
|
| 345 |
+
if candidate_indices:
|
| 346 |
+
# Organize candidates by position relative to ngrams
|
| 347 |
+
if ngram_positions:
|
| 348 |
+
# Group candidates by position
|
| 349 |
+
before_first = []
|
| 350 |
+
between_ngrams = {}
|
| 351 |
+
after_last = []
|
| 352 |
+
|
| 353 |
+
for idx in candidate_indices:
|
| 354 |
+
if idx < ngram_positions[0][0]:
|
| 355 |
+
before_first.append(idx)
|
| 356 |
+
elif idx > ngram_positions[-1][1]:
|
| 357 |
+
after_last.append(idx)
|
| 358 |
+
else:
|
| 359 |
+
# Find which ngram gap this belongs to
|
| 360 |
+
for i in range(len(ngram_positions) - 1):
|
| 361 |
+
if ngram_positions[i][1] < idx < ngram_positions[i+1][0]:
|
| 362 |
+
if i not in between_ngrams:
|
| 363 |
+
between_ngrams[i] = []
|
| 364 |
+
between_ngrams[i].append(idx)
|
| 365 |
+
|
| 366 |
+
# Before first ngram: select 1 highest entropy words
|
| 367 |
+
if before_first:
|
| 368 |
+
entropies = [(idx, self.calculate_word_entropy(sentence, idx)) for idx in before_first]
|
| 369 |
+
entropies.sort(key=lambda x: x[1], reverse=True) # Sort by entropy (highest first)
|
| 370 |
+
num_to_select = min(1, len(entropies)) # Select 1 word
|
| 371 |
+
selected_indices.extend([idx for idx, _ in entropies[:num_to_select]])
|
| 372 |
+
|
| 373 |
+
# For each gap between ngrams: select 1-2 highest entropy words
|
| 374 |
+
for group, indices in between_ngrams.items():
|
| 375 |
+
if indices:
|
| 376 |
+
entropies = [(idx, self.calculate_word_entropy(sentence, idx)) for idx in indices]
|
| 377 |
+
entropies.sort(key=lambda x: x[1], reverse=True) # Sort by entropy (highest first)
|
| 378 |
+
num_to_select = min(2, len(entropies)) # Select between 1-2 words
|
| 379 |
+
selected_indices.extend([idx for idx, _ in entropies[:num_to_select]])
|
| 380 |
+
|
| 381 |
+
# After last ngram: select 1 highest entropy words
|
| 382 |
+
if after_last:
|
| 383 |
+
entropies = [(idx, self.calculate_word_entropy(sentence, idx)) for idx in after_last]
|
| 384 |
+
entropies.sort(key=lambda x: x[1], reverse=True) # Sort by entropy (highest first)
|
| 385 |
+
num_to_select = min(1, len(entropies)) # Select 1 word
|
| 386 |
+
selected_indices.extend([idx for idx, _ in entropies[:num_to_select]])
|
| 387 |
+
else:
|
| 388 |
+
# If no ngrams, calculate entropy for all candidates
|
| 389 |
+
entropies = [(idx, self.calculate_word_entropy(sentence, idx)) for idx in candidate_indices]
|
| 390 |
+
# Sort by entropy (highest first)
|
| 391 |
+
entropies.sort(key=lambda x: x[1], reverse=True)
|
| 392 |
+
# Take top 6 or all if fewer
|
| 393 |
+
selected_indices = [idx for idx, _ in entropies[:min(6, len(entropies))]]
|
| 394 |
+
|
| 395 |
+
masked_words = original_words.copy()
|
| 396 |
+
for idx in selected_indices:
|
| 397 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 398 |
+
|
| 399 |
+
if has_punctuation:
|
| 400 |
+
masked_words.append(punctuation)
|
| 401 |
+
|
| 402 |
+
# Debug prints
|
| 403 |
+
print("Original sentence:", sentence)
|
| 404 |
+
print("Common ngrams:", common_ngrams)
|
| 405 |
+
print("Common ngram positions:", ngram_positions)
|
| 406 |
+
print("Candidate indices for masking:", candidate_indices)
|
| 407 |
+
print("Selected for masking:", selected_indices)
|
| 408 |
+
print("Masked sentence:", " ".join(masked_words))
|
| 409 |
+
|
| 410 |
+
return " ".join(masked_words), selected_indices
|
| 411 |
+
|
| 412 |
+
def calculate_mask_logits(self, original_sentence, original_mask_indices):
|
| 413 |
+
"""Calculate logits for masked positions."""
|
| 414 |
+
logger.info(f"Calculating mask logits for sentence: {original_sentence}")
|
| 415 |
+
words = original_sentence.split()
|
| 416 |
+
mask_logits = {}
|
| 417 |
+
for idx in original_mask_indices:
|
| 418 |
+
masked_words = words.copy()
|
| 419 |
+
masked_words[idx] = self.tokenizer.mask_token
|
| 420 |
+
masked_sentence = " ".join(masked_words)
|
| 421 |
+
input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
|
| 422 |
+
mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
|
| 423 |
+
with torch.no_grad():
|
| 424 |
+
outputs = self.model(input_ids)
|
| 425 |
+
logits = outputs.logits
|
| 426 |
+
mask_logits_tensor = logits[0, mask_token_index, :]
|
| 427 |
+
top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1)
|
| 428 |
+
top_tokens = []
|
| 429 |
+
top_logits = []
|
| 430 |
+
seen_words = set()
|
| 431 |
+
for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
|
| 432 |
+
token = self.tokenizer.convert_ids_to_tokens(token_id.item())
|
| 433 |
+
if token.startswith('##'):
|
| 434 |
+
continue
|
| 435 |
+
word = self.tokenizer.convert_tokens_to_string([token]).strip()
|
| 436 |
+
if word and word not in seen_words:
|
| 437 |
+
seen_words.add(word)
|
| 438 |
+
top_tokens.append(word)
|
| 439 |
+
top_logits.append(logit.item())
|
| 440 |
+
if len(top_tokens) == 50:
|
| 441 |
+
break
|
| 442 |
+
mask_logits[idx] = {
|
| 443 |
+
"tokens": top_tokens,
|
| 444 |
+
"logits": top_logits
|
| 445 |
+
}
|
| 446 |
+
logger.info("Completed calculating mask logits.")
|
| 447 |
+
return mask_logits
|
| 448 |
+
|
| 449 |
+
def calculate_word_entropy(self, sentence, word_position):
|
| 450 |
+
"""Calculate entropy for a word at a specific position."""
|
| 451 |
+
logger.info(f"Calculating word entropy for position {word_position} in sentence: {sentence}")
|
| 452 |
+
words = sentence.split()
|
| 453 |
+
masked_words = words.copy()
|
| 454 |
+
masked_words[word_position] = self.tokenizer.mask_token
|
| 455 |
+
masked_sentence = " ".join(masked_words)
|
| 456 |
+
input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"].to(self.device)
|
| 457 |
+
mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
|
| 458 |
+
with torch.no_grad():
|
| 459 |
+
outputs = self.model(input_ids)
|
| 460 |
+
logits = outputs.logits
|
| 461 |
+
probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
|
| 462 |
+
entropy = -torch.sum(probs * torch.log(probs + 1e-9))
|
| 463 |
+
logger.info(f"Computed entropy: {entropy.item()}")
|
| 464 |
+
return entropy.item()
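The value returned here is the Shannon entropy of the model's distribution over the masked position, H = -sum_i p_i * log(p_i), with the 1e-9 term only guarding against log(0). A toy standalone illustration with a made-up distribution:

```python
import torch

# Toy distribution over 4 candidate tokens (values are illustrative).
probs = torch.tensor([0.7, 0.2, 0.05, 0.05])
entropy = -torch.sum(probs * torch.log(probs + 1e-9))
print(entropy.item())  # about 0.87 nats; a flatter distribution scores higher
```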
|
| 465 |
+
|
| 466 |
+
def process_sentences(self, sentences_list, common_grams, method="random"):
|
| 467 |
+
"""Process multiple sentences with the specified masking method."""
|
| 468 |
+
tqdm.write(f"[MaskingProcessor] Processing sentences using method: {method}")
|
| 469 |
+
results = {}
|
| 470 |
+
for sentence in tqdm(sentences_list, desc="Masking Sentences"):
|
| 471 |
+
try:
|
| 472 |
+
ngrams = common_grams.get(sentence, {})
|
| 473 |
+
|
| 474 |
+
if method == "random":
|
| 475 |
+
masked_sentence, original_mask_indices = self.mask_sentence_random(sentence, ngrams)
|
| 476 |
+
elif method == "pseudorandom":
|
| 477 |
+
masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(sentence, ngrams)
|
| 478 |
+
else: # entropy
|
| 479 |
+
masked_sentence, original_mask_indices = self.mask_sentence_entropy(sentence, ngrams)
|
| 480 |
+
|
| 481 |
+
# Skip if no masks were applied
|
| 482 |
+
if not original_mask_indices:
|
| 483 |
+
tqdm.write(f"[WARNING] No mask indices found for sentence with method {method}: {sentence[:50]}...")
|
| 484 |
+
# Create a fallback masked sentence with at least one mask
|
| 485 |
+
fallback_result = self.create_fallback_mask(sentence, ngrams)
|
| 486 |
+
if fallback_result:
|
| 487 |
+
masked_sentence, original_mask_indices = fallback_result
|
| 488 |
+
tqdm.write(f"[INFO] Created fallback mask for sentence")
|
| 489 |
+
else:
|
| 490 |
+
tqdm.write(f"[WARNING] Could not create fallback mask, skipping sentence")
|
| 491 |
+
continue
|
| 492 |
+
|
| 493 |
+
logits = self.calculate_mask_logits(sentence, original_mask_indices)
|
| 494 |
+
results[sentence] = {
|
| 495 |
+
"masked_sentence": masked_sentence,
|
| 496 |
+
"mask_logits": logits
|
| 497 |
+
}
|
| 498 |
+
logger.info(f"Processed sentence: {sentence}")
|
| 499 |
+
except Exception as e:
|
| 500 |
+
tqdm.write(f"[ERROR] Failed to process sentence with method {method}: {e}")
|
| 501 |
+
tqdm.write(f"Sentence: {sentence[:100]}...")
|
| 502 |
+
import traceback
|
| 503 |
+
tqdm.write(traceback.format_exc())
|
| 504 |
+
tqdm.write("[MaskingProcessor] Completed processing sentences.")
|
| 505 |
+
return results
|
| 506 |
+
|
| 507 |
+
@staticmethod
|
| 508 |
+
def identify_common_ngrams(sentences, entities):
|
| 509 |
+
"""Enhanced to handle possessive forms better"""
|
| 510 |
+
common_grams = {}
|
| 511 |
+
|
| 512 |
+
# Pre-process entities to handle variations
|
| 513 |
+
processed_entities = []
|
| 514 |
+
for entity in entities:
|
| 515 |
+
processed_entities.append(entity)
|
| 516 |
+
# Add possessive form if not already there
|
| 517 |
+
if not entity.endswith("'s") and not entity.endswith("s"):
|
| 518 |
+
processed_entities.append(f"{entity}'s")
|
| 519 |
+
|
| 520 |
+
for sentence in sentences:
|
| 521 |
+
words = sentence.split()
|
| 522 |
+
common_grams[sentence] = {}
|
| 523 |
+
|
| 524 |
+
# Look for each entity in the sentence
|
| 525 |
+
for entity in processed_entities:
|
| 526 |
+
entity_words = entity.split()
|
| 527 |
+
entity_len = len(entity_words)
|
| 528 |
+
|
| 529 |
+
# Convert entity words for matching
|
| 530 |
+
clean_entity_words = [clean_word(word) for word in entity_words]
|
| 531 |
+
|
| 532 |
+
# Find all occurrences
|
| 533 |
+
for i in range(len(words) - entity_len + 1):
|
| 534 |
+
is_match = True
|
| 535 |
+
for j, entity_word in enumerate(clean_entity_words):
|
| 536 |
+
if clean_word(words[i + j]) != entity_word:
|
| 537 |
+
is_match = False
|
| 538 |
+
break
|
| 539 |
+
|
| 540 |
+
if is_match:
|
| 541 |
+
# Use canonical form from entity list for consistency
|
| 542 |
+
base_entity = entity
|
| 543 |
+
if entity.endswith("'s") and any(e == entity[:-2] for e in processed_entities):
|
| 544 |
+
base_entity = entity[:-2]
|
| 545 |
+
|
| 546 |
+
if base_entity not in common_grams[sentence]:
|
| 547 |
+
common_grams[sentence][base_entity] = []
|
| 548 |
+
common_grams[sentence][base_entity].append((i, i + entity_len - 1))
|
| 549 |
+
|
| 550 |
+
return common_grams
|
| 551 |
+
if __name__ == "__main__":
    # Example test sentence
    test_sentence = "Kevin De Bruyne scored for Manchester City as they won the 2019-20 Premier League title."
    # Entities to preserve
    entities = ["Kevin De Bruyne", "Manchester City", "Premier League"]

    # Identify common n-grams
    common_grams = MaskingProcessor.identify_common_ngrams([test_sentence], entities)

    # Print detected n-grams
    print(f"Detected common n-grams: {common_grams[test_sentence]}")

    # Initialize the processor
    processor = MaskingProcessor(
        BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking"),
        BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
    )

    # Test all three masking methods
    print("\nTesting Random Masking:")
    masked_random, indices_random = processor.mask_sentence_random(test_sentence, common_grams[test_sentence])

    print("\nTesting Pseudorandom Masking:")
    masked_pseudorandom, indices_pseudorandom = processor.mask_sentence_pseudorandom(test_sentence, common_grams[test_sentence])

    print("\nTesting Entropy Masking:")
    masked_entropy, indices_entropy = processor.mask_sentence_entropy(test_sentence, common_grams[test_sentence])

    # Print results
    print("\nResults:")
    print(f"Original: {test_sentence}")
    print(f"Random Masked: {masked_random}")
    print(f"Pseudorandom Masked: {masked_pseudorandom}")
    print(f"Entropy Masked: {masked_entropy}")
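Beyond the per-method test above, the batch entry point is process_sentences, which masks each sentence and also returns the mask logits per masked position. A hedged sketch of that flow (loading the BERT weights requires a download; the smaller bert-base-uncased is used here for illustration, matching the defaults in utils/config.yaml):

```python
from transformers import BertTokenizer, BertForMaskedLM
from utils.masking_methods import MaskingProcessor

sentences = ["Kevin De Bruyne scored for Manchester City as they won the 2019-20 Premier League title."]
entities = ["Kevin De Bruyne", "Manchester City", "Premier League"]

# Map each sentence to the n-gram spans that must not be masked.
common_grams = MaskingProcessor.identify_common_ngrams(sentences, entities)

processor = MaskingProcessor(
    BertTokenizer.from_pretrained("bert-base-uncased"),
    BertForMaskedLM.from_pretrained("bert-base-uncased"),
)

results = processor.process_sentences(sentences, common_grams, method="entropy")
for sentence, data in results.items():
    print(data["masked_sentence"])
    print(sorted(data["mask_logits"].keys()))  # word positions that were masked
```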
utils/non_melting_point.py
ADDED
|
@@ -0,0 +1,590 @@
| 1 |
+
import nltk
|
| 2 |
+
import logging
|
| 3 |
+
import spacy
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
from nltk.util import ngrams
|
| 6 |
+
from collections import Counter
|
| 7 |
+
import re
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
# Logging setup
|
| 11 |
+
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
class NgramProcessor:
|
| 15 |
+
def __init__(self, models=None):
|
| 16 |
+
try:
|
| 17 |
+
nltk.data.find('corpora/stopwords')
|
| 18 |
+
except LookupError:
|
| 19 |
+
nltk.download('stopwords')
|
| 20 |
+
self.stop_words = set(stopwords.words('english'))
|
| 21 |
+
|
| 22 |
+
# Default to standard model if none specified
|
| 23 |
+
if models is None:
|
| 24 |
+
models = ["en_core_web_trf"]
|
| 25 |
+
|
| 26 |
+
# Load specified model
|
| 27 |
+
self.models = {}
|
| 28 |
+
for model_name in models:
|
| 29 |
+
try:
|
| 30 |
+
self.models[model_name] = spacy.load(model_name)
|
| 31 |
+
tqdm.write(f"[NgramProcessor] Loaded model: {model_name}")
|
| 32 |
+
except IOError:
|
| 33 |
+
tqdm.write(f"[NgramProcessor] Error: Model '{model_name}' not found. Please install it with:")
|
| 34 |
+
tqdm.write(f"python -m spacy download {model_name}")
|
| 35 |
+
except Exception as e:
|
| 36 |
+
tqdm.write(f"[NgramProcessor] Error loading model '{model_name}': {str(e)}")
|
| 37 |
+
|
| 38 |
+
# Set primary NLP model for other processes
|
| 39 |
+
if "en_core_web_trf" in self.models:
|
| 40 |
+
self.nlp = self.models["en_core_web_trf"]
|
| 41 |
+
elif len(self.models) > 0:
|
| 42 |
+
# Use first available model as primary if preferred one isn't available
|
| 43 |
+
self.nlp = next(iter(self.models.values()))
|
| 44 |
+
else:
|
| 45 |
+
raise ValueError("No spaCy model was successfully loaded")
|
| 46 |
+
|
| 47 |
+
# Add custom entity patterns for numerical ranges to primary model
|
| 48 |
+
if "entity_ruler" not in self.nlp.pipe_names:
|
| 49 |
+
ruler = self.nlp.add_pipe("entity_ruler", before="ner")
|
| 50 |
+
patterns = [
|
| 51 |
+
{"label": "CARDINAL", "pattern": [{"TEXT": {"REGEX": "\\d+-\\d+"}}]}, # Pattern for ranges like "7-10"
|
| 52 |
+
{"label": "PERCENT", "pattern": [{"TEXT": {"REGEX": "\\d+%"}}]} # Pattern for percentages
|
| 53 |
+
]
|
| 54 |
+
ruler.add_patterns(patterns)
|
| 55 |
+
|
| 56 |
+
# Create special pattern for numerical ranges
|
| 57 |
+
self.number_range_pattern = re.compile(r'\b(\d+(?:-\d+)+)\b')
|
| 58 |
+
|
| 59 |
+
tqdm.write("[NgramProcessor] Initialized with stopwords, spaCy NLP model, and numerical range detection.")
|
| 60 |
+
|
| 61 |
+
def remove_stopwords(self, text):
|
| 62 |
+
words = re.findall(r'\w+', text.lower())
|
| 63 |
+
filtered_words = [word for word in words if word not in self.stop_words]
|
| 64 |
+
return ' '.join(filtered_words)
|
| 65 |
+
|
| 66 |
+
def extract_number_ranges(self, sentences):
|
| 67 |
+
"""Extract numerical ranges like '7-10' from sentences"""
|
| 68 |
+
tqdm.write("[NgramProcessor] Extracting numerical ranges...")
|
| 69 |
+
|
| 70 |
+
number_ranges = []
|
| 71 |
+
range_counts = Counter()
|
| 72 |
+
|
| 73 |
+
for sentence in sentences:
|
| 74 |
+
# Find all numerical ranges in the sentence
|
| 75 |
+
matches = self.number_range_pattern.findall(sentence)
|
| 76 |
+
for match in matches:
|
| 77 |
+
range_counts[match] += 1
|
| 78 |
+
|
| 79 |
+
# Add all ranges that appear in all sentences (threshold for ranges)
|
| 80 |
+
for range_text, count in range_counts.items():
|
| 81 |
+
if count >= 1:
|
| 82 |
+
number_ranges.append(range_text)
|
| 83 |
+
|
| 84 |
+
tqdm.write(f"[NgramProcessor] Found {len(number_ranges)} numerical ranges: {number_ranges}")
|
| 85 |
+
|
| 86 |
+
return number_ranges
|
| 87 |
+
def extract_standalone_numbers(self, sentences):
|
| 88 |
+
"""Extract standalone numerical values from sentences"""
|
| 89 |
+
tqdm.write("[NgramProcessor] Extracting standalone numbers...")
|
| 90 |
+
|
| 91 |
+
# Two patterns: one for percentages, one for regular numbers
|
| 92 |
+
percentage_pattern = re.compile(r'\b\d+%\b') # Only matches numbers with % sign
|
| 93 |
+
number_pattern = re.compile(r'\b\d+\b') # Only matches standalone numbers
|
| 94 |
+
|
| 95 |
+
percentage_counts = Counter()
|
| 96 |
+
number_counts = Counter()
|
| 97 |
+
percentage_values = set() # Store the numeric part of percentages for cross-reference
|
| 98 |
+
|
| 99 |
+
# First pass: Find all percentages
|
| 100 |
+
for sentence in sentences:
|
| 101 |
+
# Extract all percentages first
|
| 102 |
+
percentage_matches = percentage_pattern.findall(sentence)
|
| 103 |
+
for match in percentage_matches:
|
| 104 |
+
percentage_counts[match] += 1
|
| 105 |
+
# Store the numeric part for later comparison
|
| 106 |
+
numeric_part = match.rstrip('%')
|
| 107 |
+
percentage_values.add(numeric_part)
|
| 108 |
+
|
| 109 |
+
# Second pass: Find standalone numbers
|
| 110 |
+
for sentence in sentences:
|
| 111 |
+
# Only look for standalone numbers now
|
| 112 |
+
number_matches = number_pattern.findall(sentence)
|
| 113 |
+
for match in number_matches:
|
| 114 |
+
# Avoid double counting numbers that we already counted as percentages
|
| 115 |
+
if match not in percentage_values:
|
| 116 |
+
number_counts[match] += 1
|
| 117 |
+
|
| 118 |
+
# Process percentages first (they have priority)
|
| 119 |
+
threshold = max(1, int(len(sentences) * 1.0))
|
| 120 |
+
standalone_numbers = []
|
| 121 |
+
|
| 122 |
+
# Add percentages that meet the threshold
|
| 123 |
+
for num, count in percentage_counts.items():
|
| 124 |
+
if count >= threshold:
|
| 125 |
+
standalone_numbers.append(num) # Already has % sign
|
| 126 |
+
|
| 127 |
+
# Then add standalone numbers, converting to percentage format if needed
|
| 128 |
+
for num, count in number_counts.items():
|
| 129 |
+
if count >= threshold:
|
| 130 |
+
# If this number also appeared as part of a percentage, use the percentage format
|
| 131 |
+
if num in percentage_values:
|
| 132 |
+
standalone_numbers.append(f"{num}%")
|
| 133 |
+
else:
|
| 134 |
+
standalone_numbers.append(num)
|
| 135 |
+
|
| 136 |
+
tqdm.write(f"[NgramProcessor] Found {len(standalone_numbers)} standalone numbers: {standalone_numbers}")
|
| 137 |
+
return standalone_numbers
|
| 138 |
+
|
| 139 |
+
def extract_regex_subsequences(self, sentences):
|
| 140 |
+
"""Extract potential subsequences using regex patterns before applying NLP"""
|
| 141 |
+
tqdm.write("[NgramProcessor] Extracting regex subsequences...")
|
| 142 |
+
|
| 143 |
+
# Find potential multi-word subsequences (2-5 words) that occur across sentences
|
| 144 |
+
potential_subsequences = set()
|
| 145 |
+
|
| 146 |
+
# Process each sentence to find multi-word phrases
|
| 147 |
+
for sentence in sentences:
|
| 148 |
+
# First, clean the sentence by removing punctuation and converting to lowercase
|
| 149 |
+
clean_sentence = re.sub(r'[^\w\s&-./\'()[\]$€£¥+%]', ' ', sentence.lower())
|
| 150 |
+
|
| 151 |
+
# Extract sequences of 2-6 words
|
| 152 |
+
for i in range(2, 7): # Try sequences of length 2-6 words
|
| 153 |
+
pattern = r'\b(\w+(?:[-&\s./\'()[\]$€£¥+%]+\w+){' + str(i-1) + r'})\b'
|
| 154 |
+
matches = re.findall(pattern, clean_sentence)
|
| 155 |
+
potential_subsequences.update(matches)
|
| 156 |
+
|
| 157 |
+
# Filter out sequences that consist only of stopwords (but preserve numbers)
|
| 158 |
+
filtered_subsequences = []
|
| 159 |
+
for subseq in potential_subsequences:
|
| 160 |
+
words = re.split(r'[\s-]+', subseq) # Split on spaces or hyphens
|
| 161 |
+
|
| 162 |
+
# Function to check if a word is a number or percentage
|
| 163 |
+
def is_numeric(word):
|
| 164 |
+
return bool(re.match(r'^\d+(\.\d+)?%?$|^\d+-\d+$', word))
|
| 165 |
+
|
| 166 |
+
# Skip if ALL words are stopwords and none are numeric
|
| 167 |
+
if all((word in self.stop_words and not is_numeric(word)) for word in words):
|
| 168 |
+
tqdm.write(f"[NgramProcessor] Skipping all-stopword phrase: {subseq}")
|
| 169 |
+
continue
|
| 170 |
+
|
| 171 |
+
# Keep if sequence has significant words (not just stopwords)
|
| 172 |
+
# OR if it contains numbers/percentages
|
| 173 |
+
if len(words) > 1 and (
|
| 174 |
+
any(word not in self.stop_words and (len(word) > 2 or is_numeric(word)) for word in words)
|
| 175 |
+
):
|
| 176 |
+
# Additional check to reject if standalone "the" or other common stopwords
|
| 177 |
+
if not (len(words) == 1 and words[0] in self.stop_words and not is_numeric(words[0])):
|
| 178 |
+
filtered_subsequences.append(subseq)
|
| 179 |
+
|
| 180 |
+
# Count occurrences across all sentences
|
| 181 |
+
subseq_counts = Counter()
|
| 182 |
+
for subseq in filtered_subsequences:
|
| 183 |
+
for sentence in sentences:
|
| 184 |
+
if re.search(r'\b' + re.escape(subseq) + r'\b', sentence.lower()):
|
| 185 |
+
subseq_counts[subseq] += 1
|
| 186 |
+
|
| 187 |
+
# Keep only subsequences that appear in multiple sentences
|
| 188 |
+
threshold = max(2, int(len(sentences) * 1.0)) # threshold to catch all patterns
|
| 189 |
+
regex_candidates = [subseq for subseq, count in subseq_counts.items()
|
| 190 |
+
if count >= threshold]
|
| 191 |
+
|
| 192 |
+
tqdm.write(f"[NgramProcessor] Found {len(regex_candidates)} regex subsequences")
|
| 193 |
+
return regex_candidates
|
| 194 |
+
|
| 195 |
+
def filter_standalone_stopwords(self, ngrams_dict):
|
| 196 |
+
"""Remove standalone stopwords and very short terms from the ngrams dictionary"""
|
| 197 |
+
filtered_dict = {}
|
| 198 |
+
for sentence, ngrams in ngrams_dict.items():
|
| 199 |
+
filtered_dict[sentence] = {}
|
| 200 |
+
for ngram, indices in ngrams.items():
|
| 201 |
+
words = ngram.split()
|
| 202 |
+
# Skip single stopwords and very short terms UNLESS they are numbers
|
| 203 |
+
if (len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3)):
|
| 204 |
+
# Exception for numbers
|
| 205 |
+
if len(words) == 1 and re.match(r'^\d+$', words[0]):
|
| 206 |
+
filtered_dict[sentence][ngram] = indices
|
| 207 |
+
continue
|
| 208 |
+
else:
|
| 209 |
+
continue
|
| 210 |
+
# Skip if ALL words are stopwords
|
| 211 |
+
if all(word in self.stop_words for word in words):
|
| 212 |
+
continue
|
| 213 |
+
filtered_dict[sentence][ngram] = indices
|
| 214 |
+
return filtered_dict
|
| 215 |
+
|
| 216 |
+
def extract_named_entities(self, sentences):
|
| 217 |
+
entity_counter = Counter()
|
| 218 |
+
|
| 219 |
+
# Process each sentence with each model
|
| 220 |
+
for model_name, nlp_model in self.models.items():
|
| 221 |
+
tqdm.write(f"[NgramProcessor] Extracting entities with model: {model_name}")
|
| 222 |
+
docs = list(nlp_model.pipe(sentences))
|
| 223 |
+
|
| 224 |
+
# Process each sentence
|
| 225 |
+
for doc in docs:
|
| 226 |
+
for ent in doc.ents:
|
| 227 |
+
# Include entity types relevant to this model
|
| 228 |
+
# This is a comprehensive list - some models may not use all these types
|
| 229 |
+
if ent.label_ in {
|
| 230 |
+
# People, organizations, locations
|
| 231 |
+
"PERSON", "ORG", "GPE", "LOC", "NORP",
|
| 232 |
+
|
| 233 |
+
# Facilities and products
|
| 234 |
+
"FAC", "PRODUCT", "WORK_OF_ART", "EVENT",
|
| 235 |
+
|
| 236 |
+
# Numeric entities
|
| 237 |
+
"DATE", "TIME", "MONEY", "QUANTITY", "PERCENT", "CARDINAL", "ORDINAL",
|
| 238 |
+
|
| 239 |
+
# Others
|
| 240 |
+
"LAW", "LANGUAGE",
|
| 241 |
+
|
| 242 |
+
# Scientific entities
|
| 243 |
+
"SCIENTIFIC", "SUBSTANCE", "CHEMICAL", "TECHNOLOGY",
|
| 244 |
+
|
| 245 |
+
# Medical entities
|
| 246 |
+
"DISEASE", "MEDICAL", "CLINICAL", "TREATMENT", "SYMPTOM", "DIAGNOSTIC",
|
| 247 |
+
"ANATOMICAL", "BIOLOGY", "GENE", "PROTEIN", "DRUG",
|
| 248 |
+
|
| 249 |
+
# Legal entities
|
| 250 |
+
"LEGAL", "COURT", "STATUTE", "PROVISION", "CASE_CITATION", "JUDGE",
|
| 251 |
+
"LEGAL_ROLE", "REGULATION", "CONTRACT"
|
| 252 |
+
}:
|
| 253 |
+
# Handle possessive forms by stripping 's
|
| 254 |
+
clean_entity = re.sub(r"'s\b", "", ent.text.lower()).strip()
|
| 255 |
+
# Add model name prefix to distinguish sources
|
| 256 |
+
entity_counter[clean_entity] += 1
|
| 257 |
+
|
| 258 |
+
threshold = max(1, len(sentences) * 1.0) # Adjusted threshold for entities
|
| 259 |
+
return [ent for ent, count in entity_counter.items() if count >= threshold]
|
| 260 |
+
|
| 261 |
+
def extract_domain_specific_entities(self, text):
|
| 262 |
+
"""Extract entities from all models and categorize by domain"""
|
| 263 |
+
domain_entities = {}
|
| 264 |
+
|
| 265 |
+
for model_name, nlp_model in self.models.items():
|
| 266 |
+
doc = nlp_model(text)
|
| 267 |
+
domain_entities[model_name] = [(ent.text, ent.label_) for ent in doc.ents]
|
| 268 |
+
|
| 269 |
+
return domain_entities
|
| 270 |
+
|
| 271 |
+
def is_substring_of_any(self, ngram, common_ngrams):
|
| 272 |
+
for other_ngram in common_ngrams:
|
| 273 |
+
if ngram != other_ngram and ngram in other_ngram:
|
| 274 |
+
return True
|
| 275 |
+
return False
|
| 276 |
+
|
| 277 |
+
def find_filtered_ngrams(self, sentences):
|
| 278 |
+
tqdm.write("[NgramProcessor] Processing...")
|
| 279 |
+
|
| 280 |
+
# Step 1: First extract numerical ranges or standalone numbers (special priority)
|
| 281 |
+
number_ranges = self.extract_number_ranges(sentences)
|
| 282 |
+
standalone_numbers = self.extract_standalone_numbers(sentences)
|
| 283 |
+
|
| 284 |
+
# Step 2: Use regex to find common subsequences
|
| 285 |
+
regex_subsequences = self.extract_regex_subsequences(sentences)
|
| 286 |
+
tqdm.write(f"[NgramProcessor] Regex Subsequences: {regex_subsequences}")
|
| 287 |
+
|
| 288 |
+
# Step 3: Then apply spaCy to detect named entities
|
| 289 |
+
named_entities = self.extract_named_entities(sentences)
|
| 290 |
+
# Make sure percentage values have proper format
|
| 291 |
+
for i, entity in enumerate(named_entities):
|
| 292 |
+
if re.match(r'\d+$', entity) and any(f"{entity}%" in sentence for sentence in sentences):
|
| 293 |
+
# Replace standalone digit with percentage if it appears as percentage in text
|
| 294 |
+
named_entities[i] = f"{entity}%"
|
| 295 |
+
|
| 296 |
+
tqdm.write(f"[NgramProcessor] Named Entities: {named_entities}")
|
| 297 |
+
|
| 298 |
+
# Step 4: Consolidate and filter all detected patterns
|
| 299 |
+
# Collect all patterns in one list
|
| 300 |
+
all_patterns = number_ranges + regex_subsequences + named_entities + standalone_numbers
|
| 301 |
+
|
| 302 |
+
# Sort by length (longer first) to prioritize more specific patterns
|
| 303 |
+
all_patterns.sort(key=len, reverse=True)
|
| 304 |
+
|
| 305 |
+
# Remove duplicates while preserving order
|
| 306 |
+
unique_patterns = []
|
| 307 |
+
seen = set()
|
| 308 |
+
for pattern in all_patterns:
|
| 309 |
+
if pattern not in seen:
|
| 310 |
+
# Check if this pattern is a substring of any already selected pattern
|
| 311 |
+
is_substring = False
|
| 312 |
+
for selected_pattern in unique_patterns:
|
| 313 |
+
if pattern in selected_pattern and pattern != selected_pattern:
|
| 314 |
+
is_substring = True
|
| 315 |
+
break
|
| 316 |
+
if not is_substring:
|
| 317 |
+
unique_patterns.append(pattern)
|
| 318 |
+
seen.add(pattern)
|
| 319 |
+
# Re-index sequentially
|
| 320 |
+
indexed_patterns = [(i+1, pattern) for i, pattern in enumerate(unique_patterns)]
|
| 321 |
+
self.indexed_patterns = indexed_patterns
|
| 322 |
+
non_melting_points = [pattern for _, pattern in indexed_patterns]
|
| 323 |
+
tqdm.write(f"[NgramProcessor] Filtered non_melting_points: {non_melting_points}")
|
| 324 |
+
tqdm.write(f"[NgramProcessor] Filtered non-melting points: {len(non_melting_points)}")
|
| 325 |
+
|
| 326 |
+
# Filter out patterns that are substrings of longer patterns or standalone numbers
|
| 327 |
+
standalone_numbers_set = set(standalone_numbers)
|
| 328 |
+
non_melting_points = []
|
| 329 |
+
for pattern in unique_patterns:
|
| 330 |
+
is_substring = False
|
| 331 |
+
for longer_pattern in non_melting_points:
|
| 332 |
+
# Check if pattern is contained within a longer pattern
|
| 333 |
+
if pattern in longer_pattern:
|
| 334 |
+
is_substring = True
|
| 335 |
+
break
|
| 336 |
+
if not is_substring or pattern in standalone_numbers_set:
|
| 337 |
+
non_melting_points.append(pattern)
|
| 338 |
+
|
| 339 |
+
# For remaining cases that might have been missed, apply NLTK n-gram extraction
|
| 340 |
+
# Only on cleaned sentences (less computationally expensive now)
|
| 341 |
+
clean_to_original = {}
|
| 342 |
+
sentences_cleaned = []
|
| 343 |
+
|
| 344 |
+
# Process sentences with spaCy to preserve entity information
|
| 345 |
+
docs = list(self.nlp.pipe(sentences))
|
| 346 |
+
|
| 347 |
+
for i, doc in enumerate(docs):
|
| 348 |
+
original_sentence = sentences[i]
|
| 349 |
+
entity_texts = {ent.text.lower() for ent in doc.ents if len(ent.text.split()) > 1}
|
| 350 |
+
|
| 351 |
+
# Tokenize while preserving entities and numerical ranges
|
| 352 |
+
tokens = []
|
| 353 |
+
j = 0
|
| 354 |
+
words = [token.text for token in doc]
|
| 355 |
+
|
| 356 |
+
while j < len(words):
|
| 357 |
+
# First check for numerical ranges
|
| 358 |
+
current_word = words[j].lower()
|
| 359 |
+
if self.number_range_pattern.match(current_word):
|
| 360 |
+
tokens.append(current_word)
|
| 361 |
+
j += 1
|
| 362 |
+
continue
|
| 363 |
+
|
| 364 |
+
# Then check for entities
|
| 365 |
+
matched_entity = None
|
| 366 |
+
for ent in sorted(entity_texts, key=len, reverse=True):
|
| 367 |
+
ent_words = ent.split()
|
| 368 |
+
if j + len(ent_words) <= len(words) and [w.lower() for w in words[j:j+len(ent_words)]] == ent_words:
|
| 369 |
+
matched_entity = " ".join(words[j:j+len(ent_words)])
|
| 370 |
+
tokens.append(matched_entity.lower()) # preserve full entity
|
| 371 |
+
j += len(ent_words)
|
| 372 |
+
break
|
| 373 |
+
|
| 374 |
+
if not matched_entity:
|
| 375 |
+
word = words[j].lower()
|
| 376 |
+
if word not in self.stop_words and re.match(r'\w+', word):
|
| 377 |
+
tokens.append(word)
|
| 378 |
+
j += 1
|
| 379 |
+
|
| 380 |
+
cleaned = " ".join(tokens)
|
| 381 |
+
sentences_cleaned.append(cleaned)
|
| 382 |
+
clean_to_original[cleaned] = original_sentence
|
| 383 |
+
|
| 384 |
+
# Step 5: Only run n-gram extraction on gaps not covered by regex and named entities
|
| 385 |
+
ngram_lengths = [4, 3, 2, 1] # Consider shorter n-grams now since we already have longer phrases
|
| 386 |
+
all_ngrams_by_length = {}
|
| 387 |
+
|
| 388 |
+
for n in ngram_lengths:
|
| 389 |
+
all_ngrams = []
|
| 390 |
+
for sentence in sentences_cleaned:
|
| 391 |
+
tokens = sentence.split()
|
| 392 |
+
if len(tokens) >= n:
|
| 393 |
+
sent_ngrams = list(ngrams(tokens, n))
|
| 394 |
+
all_ngrams.extend(sent_ngrams)
|
| 395 |
+
all_ngrams_by_length[n] = Counter(all_ngrams)
|
| 396 |
+
|
| 397 |
+
# Step 6: Add additional n-grams that are frequent but weren't caught by regex or named entities
|
| 398 |
+
threshold_factor = 1.0 # threshold since we're focusing on gaps
|
| 399 |
+
|
| 400 |
+
for n_size in sorted(ngram_lengths, reverse=True):
|
| 401 |
+
ngram_counts = all_ngrams_by_length[n_size]
|
| 402 |
+
threshold = max(2, int(len(sentences) * threshold_factor))
|
| 403 |
+
|
| 404 |
+
# Sort by count for efficiency
|
| 405 |
+
for ngram, count in ngram_counts.most_common():
|
| 406 |
+
if count >= threshold:
|
| 407 |
+
ngram_str = ' '.join(ngram)
|
| 408 |
+
# Skip if is a substring of existing n-grams or already in our collection
|
| 409 |
+
if ngram_str not in non_melting_points and not self.is_substring_of_any(ngram_str, non_melting_points):
|
| 410 |
+
non_melting_points.append(ngram_str)
|
| 411 |
+
|
| 412 |
+
# Create sorted version for efficient lookup
|
| 413 |
+
final_non_melting_points = non_melting_points.copy()
|
| 414 |
+
sorted_non_melting_points = sorted(final_non_melting_points, key=len, reverse=True)
|
| 415 |
+
final_indexed_patterns = [(i+1, pattern) for i, pattern in enumerate(sorted_non_melting_points)]
|
| 416 |
+
|
| 417 |
+
#Filter out n-grams that consist entirely of stop words
|
| 418 |
+
filtered_patterns = []
|
| 419 |
+
for idx, pattern in final_indexed_patterns:
|
| 420 |
+
words = pattern.lower().split()
|
| 421 |
+
|
| 422 |
+
# Check if the pattern is a number or contains a number
|
| 423 |
+
has_number = any(re.match(r'.*\d+.*', word) for word in words)
|
| 424 |
+
|
| 425 |
+
# If the pattern has a number OR has any non-stop word, keep it
|
| 426 |
+
if has_number or any(word not in self.stop_words for word in words):
|
| 427 |
+
filtered_patterns.append((idx, pattern))
|
| 428 |
+
else:
|
| 429 |
+
tqdm.write(f"[NgramProcessor] Removing n-gram with all stop words: {pattern}")
|
| 430 |
+
|
| 431 |
+
# Reassign filtered patterns with reindexed values
|
| 432 |
+
self.indexed_patterns = [(i+1, pattern) for i, (_, pattern) in enumerate(filtered_patterns)]
|
| 433 |
+
|
| 434 |
+
# Generate the results with more efficient regex matching
|
| 435 |
+
result = {}
|
| 436 |
+
for sentence in sentences:
|
| 437 |
+
sentence_result = {}
|
| 438 |
+
for _,ngram in self.indexed_patterns: # Use the filtered patterns
|
| 439 |
+
# Skip single word stopwords and short terms
|
| 440 |
+
words = ngram.split()
|
| 441 |
+
if len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3):
|
| 442 |
+
continue
|
| 443 |
+
|
| 444 |
+
# Handle numerical ranges differently - need exact matching
|
| 445 |
+
if self.number_range_pattern.match(ngram):
|
| 446 |
+
pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
|
| 447 |
+
else:
|
| 448 |
+
# Compile the regex pattern once per n-gram - modified to handle special characters
|
| 449 |
+
pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r'(?!\w)', re.IGNORECASE)
|
| 450 |
+
|
| 451 |
+
matches = list(pattern.finditer(sentence))
|
| 452 |
+
|
| 453 |
+
if matches:
|
| 454 |
+
indices = []
|
| 455 |
+
for match in matches:
|
| 456 |
+
# Calculate word indices with improved handling for hyphenated terms
|
| 457 |
+
start_pos = match.start()
|
| 458 |
+
text_before = sentence[:start_pos]
|
| 459 |
+
|
| 460 |
+
# More accurate word counting that handles hyphenated terms
|
| 461 |
+
start_idx = len(re.findall(r'\s+', text_before)) + (0 if text_before.strip() == "" else 1)
|
| 462 |
+
|
| 463 |
+
# Count words in the matched n-gram (handling hyphens as single terms)
|
| 464 |
+
if self.number_range_pattern.match(ngram):
|
| 465 |
+
# Numerical ranges count as one term
|
| 466 |
+
ngram_word_count = 1
|
| 467 |
+
else:
|
| 468 |
+
ngram_word_count = len(re.findall(r'\S+', ngram))
|
| 469 |
+
|
| 470 |
+
end_idx = start_idx + ngram_word_count - 1
|
| 471 |
+
|
| 472 |
+
indices.append((start_idx, end_idx))
|
| 473 |
+
|
| 474 |
+
if indices: # Only add if we found valid indices
|
| 475 |
+
sentence_result[ngram] = indices
|
| 476 |
+
|
| 477 |
+
result[sentence] = sentence_result
|
| 478 |
+
|
| 479 |
+
# Apply the stopword filter before returning
|
| 480 |
+
result = self.filter_standalone_stopwords(result)
|
| 481 |
+
return result, dict(self.indexed_patterns)
|
| 482 |
+
|
| 483 |
+
def find_relative_order(self, sentence, common_ngrams):
|
| 484 |
+
# First, identify all possible matches without modifying the sentence
|
| 485 |
+
all_matches = []
|
| 486 |
+
|
| 487 |
+
for ngram in common_ngrams:
|
| 488 |
+
# Special handling for percentages
|
| 489 |
+
if any(char in ngram for char in '&-/.\'()[]$€£¥+%'):
|
| 490 |
+
pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
|
| 491 |
+
# Handle numerical ranges
|
| 492 |
+
elif self.number_range_pattern.match(ngram):
|
| 493 |
+
pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
|
| 494 |
+
else:
|
| 495 |
+
pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r"(?:'s)?(?!\w)", re.IGNORECASE)
|
| 496 |
+
|
| 497 |
+
for match in pattern.finditer(sentence):
|
| 498 |
+
start, end = match.span()
|
| 499 |
+
#store character position range, ngram text, and token count
|
| 500 |
+
all_matches.append((start, end, ngram, len(ngram.split())))
|
| 501 |
+
|
| 502 |
+
# Pre-process: identify all word spans in the original sentence
|
| 503 |
+
words = []
|
| 504 |
+
word_spans = []
|
| 505 |
+
for match in re.finditer(r'\S+', sentence):
|
| 506 |
+
words.append(match.group())
|
| 507 |
+
word_spans.append((match.start(), match.end()))
|
| 508 |
+
|
| 509 |
+
# Create a mapping from character positions to word indices
|
| 510 |
+
char_to_word_idx = {}
|
| 511 |
+
for i, (start, end) in enumerate(word_spans):
|
| 512 |
+
for pos in range(start, end + 1):
|
| 513 |
+
char_to_word_idx[pos] = i
|
| 514 |
+
|
| 515 |
+
# Sort by length in characters first, then by word count
|
| 516 |
+
all_matches.sort(key=lambda x: (-len(x[2]), -x[3], x[0]))
|
| 517 |
+
|
| 518 |
+
# Filter out ngrams that overlap with already claimed ranges
|
| 519 |
+
filtered_matches = []
|
| 520 |
+
claimed_ranges = []
|
| 521 |
+
|
| 522 |
+
for start, end, ngram, length in all_matches:
|
| 523 |
+
# Check if this match overlaps with any existing claimed range
|
| 524 |
+
is_overlapping = False
|
| 525 |
+
for c_start, c_end in claimed_ranges:
|
| 526 |
+
# Check for any overlap
|
| 527 |
+
if max(start, c_start) < min(end, c_end):
|
| 528 |
+
is_overlapping = True
|
| 529 |
+
break
|
| 530 |
+
|
| 531 |
+
if not is_overlapping:
|
| 532 |
+
# Add this ngram to our filtered list
|
| 533 |
+
filtered_matches.append((start, end, ngram, length))
|
| 534 |
+
# Claim its range
|
| 535 |
+
claimed_ranges.append((start, end))
|
| 536 |
+
|
| 537 |
+
# Sort filtered matches by position for final ordering
|
| 538 |
+
filtered_matches.sort(key=lambda x: x[0])
|
| 539 |
+
|
| 540 |
+
# Create word-level indices for the final matches
|
| 541 |
+
word_level_matches = []
|
| 542 |
+
for start, end, ngram, _ in filtered_matches:
|
| 543 |
+
# Find the word index for the start and end positions
|
| 544 |
+
try:
|
| 545 |
+
start_word_idx = char_to_word_idx.get(start, char_to_word_idx.get(start+1))
|
| 546 |
+
end_word_idx = char_to_word_idx.get(end-1, char_to_word_idx.get(end-2))
|
| 547 |
+
|
| 548 |
+
if start_word_idx is not None and end_word_idx is not None:
|
| 549 |
+
word_level_matches.append((start_word_idx, end_word_idx, ngram))
|
| 550 |
+
except (KeyError, IndexError):
|
| 551 |
+
# Skip this match if we can't determine word indices
|
| 552 |
+
continue
|
| 553 |
+
|
| 554 |
+
# Create the final order with 1-based indexing
|
| 555 |
+
ngram_to_index = {pattern: idx for idx, pattern in self.indexed_patterns}
|
| 556 |
+
relative_order = [(ngram_to_index.get(ngram, i+1), ngram) for i, (_, _, ngram) in enumerate(word_level_matches)]
|
| 557 |
+
|
| 558 |
+
return relative_order, sentence
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
# Example usage
|
| 562 |
+
if __name__ == "__main__":
|
| 563 |
+
# Test with NBA Play-In Tournament example
|
| 564 |
+
sentences = [
|
| 565 |
+
"The NBA Play-In Tournament tips off tonight as the No. 7-10 teams in each conference battle for a spot in the playoffs. Here's everything you need to know as the action unfolds.",
|
| 566 |
+
"Tonight the NBA Play-In Tournament begins with No. 7-10 teams from each conference competing for playoff spots. Here's your guide to following all the action.",
|
| 567 |
+
"The NBA Play-In Tournament kicks off this evening featuring the No. 7-10 teams across both conferences fighting for playoff positions. Here's what you should know about the upcoming games.",
|
| 568 |
+
"Starting tonight, the NBA Play-In Tournament will showcase the No. 7-10 teams from each conference as they compete for remaining playoff berths. Here's your complete guide to the action.",
|
| 569 |
+
"The NBA Play-In Tournament begins tonight with the No. 7-10 teams in both conferences battling for playoff spots. Here's everything you need to know about the upcoming games.",
|
| 570 |
+
"Tonight marks the start of the NBA Play-In Tournament where No. 7-10 teams in each conference compete for playoff positions. Here's your essential guide to following the action.",
|
| 571 |
+
"The NBA Play-In Tournament tips off tonight, featuring No. 7-10 teams from both conferences fighting for playoff berths. Here's what you need to know about the tournament.",
|
| 572 |
+
"Beginning tonight, the NBA Play-In Tournament will pit the No. 7-10 teams in each conference against each other for playoff spots. Here's everything you should know about the games.",
|
| 573 |
+
"The NBA Play-In Tournament starts tonight with No. 7-10 teams across both conferences competing for playoff positions. Here's your complete guide to all the action.",
|
| 574 |
+
"Tonight is the tip-off of the NBA Play-In Tournament where the No. 7-10 teams from each conference battle for remaining playoff spots. Here's what you need to know as the games unfold."
|
| 575 |
+
]
|
| 576 |
+
|
| 577 |
+
# Initialize with multiple models
|
| 578 |
+
processor = NgramProcessor(models=["en_core_web_trf"])
|
| 579 |
+
|
| 580 |
+
# Process with all models combined
|
| 581 |
+
common_ngrams,indexed_ngrams = processor.find_filtered_ngrams(sentences)
|
| 582 |
+
|
| 583 |
+
|
| 584 |
+
# Print results
|
| 585 |
+
print("Common n-grams with indices per sentence:")
|
| 586 |
+
for sentence in sentences:
|
| 587 |
+
order, updated_sentence = processor.find_relative_order(sentence, common_ngrams[sentence])
|
| 588 |
+
print(f"Sentence: {sentence}")
|
| 589 |
+
print(f"Order: {order}")
|
| 590 |
+
print()
|
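For orientation, here is a minimal, hypothetical driver sketch (not one of the uploaded files) showing the shape of what find_filtered_ngrams and find_relative_order return; it assumes the class above is importable as utils.non_melting_point and that the chosen spaCy model is installed locally:

# Hypothetical usage sketch; the import path and model name are assumptions.
from utils.non_melting_point import NgramProcessor

proc = NgramProcessor(models=["en_core_web_trf"])
sents = [
    "The quick brown fox jumps over the lazy dog.",
    "A speedy brown fox jumps over a lazy dog.",
]
per_sentence, indexed = proc.find_filtered_ngrams(sents)
# per_sentence: {sentence: {ngram: [(start_word_idx, end_word_idx), ...]}}
# indexed:      {1: "longest pattern", 2: "next pattern", ...}
for s in sents:
    order, _ = proc.find_relative_order(s, per_sentence[s])
    print(order)  # e.g. [(1, 'brown fox'), (2, 'dog')], depending on the extracted patterns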
utils/old/masking/masking_methods.py
ADDED
|
@@ -0,0 +1,355 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
        """
        Adjust indices of common n-grams after removing stop words.

        Args:
            words (list): List of words in the original sentence.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams and their indices.
        """
        if not remove_stopwords:
            return common_ngrams

        non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        adjusted_ngrams = {}

        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = non_stop_word_indices.index(start)
                    new_end = non_stop_word_indices.index(end)
                    adjusted_positions.append((new_start, new_end))
                except ValueError:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    # def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
    #     """
    #     Mask one word before the first common n-gram, one between two n-grams,
    #     and one after the last common n-gram (random selection).
    #
    #     Args:
    #         original_sentence (str): Original sentence
    #         common_ngrams (dict): Common n-grams and their indices
    #
    #     Returns:
    #         str: Masked sentence with original stop words retained
    #     """
    #     words = original_sentence.split()
    #     if remove_stopwords:
    #         non_stop_words = [word for word in words if word.lower() not in self.stop_words]
    #         non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
    #     else:
    #         non_stop_words = words
    #         non_stop_word_indices = list(range(len(words)))
    #     # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
    #     adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
    #
    #     mask_indices = []
    #     # Handle before the first common n-gram
    #     if adjusted_ngrams:
    #         first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
    #         if first_ngram_start > 0:
    #             mask_indices.append(random.randint(0, first_ngram_start - 1))
    #
    #         # Handle between common n-grams
    #         ngram_positions = list(adjusted_ngrams.values())
    #         for i in range(len(ngram_positions) - 1):
    #             end_prev = ngram_positions[i][-1][1]
    #             start_next = ngram_positions[i + 1][0][0]
    #             if start_next > end_prev + 1:
    #                 mask_indices.append(random.randint(end_prev + 1, start_next - 1))
    #
    #         # Handle after the last common n-gram
    #         last_ngram_end = ngram_positions[-1][-1][1]
    #         if last_ngram_end < len(non_stop_words) - 1:
    #             mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))
    #
    #     # Mask the chosen indices
    #     original_masked_sentence = words[:]
    #     # for idx in mask_indices:
    #     #     if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
    #     #         non_stop_words[idx] = self.tokenizer.mask_token
    #     #         original_masked_sentence[idx] = self.tokenizer.mask_token
    #     for idx in mask_indices:
    #         if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
    #             continue  # Skip if index belongs to common n-grams
    #         if remove_stopwords:
    #             original_idx = non_stop_word_indices[idx]  # Map back to original indices
    #             original_masked_sentence[original_idx] = self.tokenizer.mask_token
    #         else:
    #             original_masked_sentence[idx] = self.tokenizer.mask_token
    #
    #     return " ".join(original_masked_sentence)

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices
            remove_stopwords (bool): Whether to remove stop words

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        if remove_stopwords:
            non_stop_words = [word for word in words if word.lower() not in self.stop_words]
            non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        else:
            non_stop_words = words
            non_stop_word_indices = list(range(len(words)))

        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)

        # Collect all indices corresponding to common n-grams
        common_ngram_indices = {
            idx for ngram_positions in adjusted_ngrams.values()
            for start, end in ngram_positions
            for idx in range(start, end + 1)
        }

        mask_indices = []
        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                potential_indices = [i for i in range(first_ngram_start) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                potential_indices = [i for i in range(end_prev + 1, start_next) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                potential_indices = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i not in common_ngram_indices]
                if potential_indices:
                    mask_indices.append(random.choice(potential_indices))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            if remove_stopwords:
                original_idx = non_stop_word_indices[idx]  # Map back to original indices
                original_masked_sentence[original_idx] = self.tokenizer.mask_token
            else:
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        # non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        if remove_stopwords:
            non_stop_words = [word for word in words if word.lower() not in self.stop_words]
            non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        else:
            non_stop_words = words
            non_stop_word_indices = list(range(len(words)))
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
        entropy_scores = {}

        for idx, word in enumerate(non_stop_words):
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        # for idx in mask_indices:
        #     non_stop_words[idx] = self.tokenizer.mask_token
        #     original_masked_sentence[idx] = self.tokenizer.mask_token

        for idx in mask_indices:
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip if index belongs to common n-grams
            if remove_stopwords:
                original_idx = non_stop_word_indices[idx]  # Map back to original indices
                original_masked_sentence[original_idx] = self.tokenizer.mask_token
            else:
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
            elif method == "entropy":
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
            else:
                raise ValueError("Invalid method. Choose 'random' or 'entropy'.")

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results

# Example usage
if __name__ == "__main__":
    # !!! Working both the cases regardless if the stopword is removed or not
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    processor = MaskingProcessor()
    results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=True)
    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_random.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # # print(f"Mask Logits (Random): {output['mask_logits']}")
        # print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        # print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        # print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        print('--------------------------------')
        # for mask_idx, logits in output["mask_logits"].items():
        #     print(f"Logits for [MASK] at position {mask_idx}:")
        #     print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens

    # result_dict = {
    #     "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
    # }

    # print('--------------------------------')
    # for sentence, output in results_entropy.items():
    #     print(f"Original Sentence (Entropy): {sentence}")
    #     print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
    #     # print(f"Mask Logits (Entropy): {output['mask_logits']}")
    #     print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
    #     print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
    #     print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
utils/old/masking/masking_methods_new_work.py
ADDED
|
@@ -0,0 +1,447 @@
| 1 |
+
import random
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import BertTokenizer, BertForMaskedLM
|
| 4 |
+
from nltk.corpus import stopwords
|
| 5 |
+
import nltk
|
| 6 |
+
|
| 7 |
+
# Ensure stopwords are downloaded
|
| 8 |
+
try:
|
| 9 |
+
nltk.data.find('corpora/stopwords')
|
| 10 |
+
except LookupError:
|
| 11 |
+
nltk.download('stopwords')
|
| 12 |
+
|
| 13 |
+
class MaskingProcessor:
|
| 14 |
+
def __init__(self):
|
| 15 |
+
|
| 16 |
+
self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
|
| 17 |
+
self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
|
| 18 |
+
self.stop_words = set(stopwords.words('english'))
|
| 19 |
+
|
| 20 |
+
def remove_stopwords(self, words):
|
| 21 |
+
"""
|
| 22 |
+
Remove stopwords from the given list of words.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
words (list): List of words.
|
| 26 |
+
|
| 27 |
+
Returns:
|
| 28 |
+
list: List of non-stop words.
|
| 29 |
+
"""
|
| 30 |
+
return [word for word in words if word.lower() not in self.stop_words]
|
| 31 |
+
|
| 32 |
+
def adjust_ngram_indices(self, original_words, common_ngrams):
|
| 33 |
+
"""
|
| 34 |
+
Adjust indices of common n-grams after removing stopwords.
|
| 35 |
+
|
| 36 |
+
Args:
|
| 37 |
+
original_words (list): Original list of words.
|
| 38 |
+
common_ngrams (dict): Common n-grams and their indices.
|
| 39 |
+
|
| 40 |
+
Returns:
|
| 41 |
+
dict: Adjusted common n-grams with updated indices.
|
| 42 |
+
"""
|
| 43 |
+
non_stop_words = self.remove_stopwords(original_words)
|
| 44 |
+
original_to_non_stop = []
|
| 45 |
+
non_stop_idx = 0
|
| 46 |
+
|
| 47 |
+
for original_idx, word in enumerate(original_words):
|
| 48 |
+
if word.lower() not in self.stop_words:
|
| 49 |
+
original_to_non_stop.append((original_idx, non_stop_idx))
|
| 50 |
+
non_stop_idx += 1
|
| 51 |
+
|
| 52 |
+
adjusted_ngrams = {}
|
| 53 |
+
for ngram, positions in common_ngrams.items():
|
| 54 |
+
adjusted_positions = []
|
| 55 |
+
for start, end in positions:
|
| 56 |
+
try:
|
| 57 |
+
new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
|
| 58 |
+
new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
|
| 59 |
+
adjusted_positions.append((new_start, new_end))
|
| 60 |
+
except StopIteration:
|
| 61 |
+
continue # Skip if indices cannot be mapped
|
| 62 |
+
adjusted_ngrams[ngram] = adjusted_positions
|
| 63 |
+
|
| 64 |
+
return adjusted_ngrams
|
| 65 |
+
|
| 66 |
+
def mask_sentence_random(self, sentence, common_ngrams):
|
| 67 |
+
"""
|
| 68 |
+
Mask words in the sentence based on the specified rules after removing stopwords.
|
| 69 |
+
"""
|
| 70 |
+
original_words = sentence.split()
|
| 71 |
+
print(f' ---- original_words : {original_words} ----- ')
|
| 72 |
+
non_stop_words = self.remove_stopwords(original_words)
|
| 73 |
+
print(f' ---- non_stop_words : {non_stop_words} ----- ')
|
| 74 |
+
adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
|
| 75 |
+
print(f' ---- common_ngrams : {common_ngrams} ----- ')
|
| 76 |
+
print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
|
| 77 |
+
|
| 78 |
+
mask_indices = []
|
| 79 |
+
|
| 80 |
+
# Extract n-gram positions in non-stop words
|
| 81 |
+
ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
|
| 82 |
+
|
| 83 |
+
# Mask a word before the first common n-gram
|
| 84 |
+
if ngram_positions:
|
| 85 |
+
print(f' ---- ngram_positions : {ngram_positions} ----- ')
|
| 86 |
+
first_ngram_start = ngram_positions[0][0]
|
| 87 |
+
print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
|
| 88 |
+
if first_ngram_start > 0:
|
| 89 |
+
mask_index_before_ngram = random.randint(0, first_ngram_start-1)
|
| 90 |
+
print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
|
| 91 |
+
mask_indices.append(mask_index_before_ngram)
|
| 92 |
+
|
| 93 |
+
# Mask words between common n-grams
|
| 94 |
+
for i in range(len(ngram_positions) - 1):
|
| 95 |
+
end_prev = ngram_positions[i][1]
|
| 96 |
+
print(f' ---- end_prev : {end_prev} ----- ') # END INDICE FROM PREV LOOP FUNKNLKNLKNLKNLKNLKNLSKDNFLKSDHJFLSDJKFH:KLSDHF:LHKSDF:HJKLDFS:HJKLDFSHJK:
                start_next = ngram_positions[i + 1][0]
                print(f' ---- start_next : {start_next} ----- ')
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices from non-stop word positions to original positions
        print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        print(f' ---- original_mask_indices : {original_mask_indices} ----- ')

        # Apply masks to the original sentence
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)

    def mask_sentence_pseudorandom(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        random.seed(42)
        original_words = sentence.split()
        print(f' ---- original_words : {original_words} ----- ')
        non_stop_words = self.remove_stopwords(original_words)
        print(f' ---- non_stop_words : {non_stop_words} ----- ')
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
        print(f' ---- common_ngrams : {common_ngrams} ----- ')
        print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')

        mask_indices = []

        # Extract n-gram positions in non-stop words
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        # Mask a word before the first common n-gram
        if ngram_positions:
            print(f' ---- ngram_positions : {ngram_positions} ----- ')
            first_ngram_start = ngram_positions[0][0]
            print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start-1)
                print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                print(f' ---- end_prev : {end_prev} ----- ')
                start_next = ngram_positions[i + 1][0]
                print(f' ---- start_next : {start_next} ----- ')
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices from non-stop word positions to original positions
        print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        print(f' ---- original_mask_indices : {original_mask_indices} ----- ')

        # Apply masks to the original sentence
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)


    def calculate_word_entropy(self, sentence, word_position):
        """
        Calculate entropy for a specific word position in the sentence.

        Args:
            sentence (str): The input sentence
            word_position (int): Position of the word to calculate entropy for

        Returns:
            float: Entropy value for the word
        """
        words = sentence.split()
        masked_words = words.copy()
        masked_words[word_position] = self.tokenizer.mask_token
        masked_sentence = " ".join(masked_words)

        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        # Get probabilities for the masked position
        probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
        # Calculate entropy: -sum(p * log(p))
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))

        return entropy.item()

    def mask_sentence_entropy(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on entropy, following n-gram positioning rules.

        Args:
            sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        original_words = sentence.split()
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        original_to_non_stop = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                original_to_non_stop[orig_idx] = non_stop_idx
                non_stop_idx += 1

        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
        mask_indices = []

        if ngram_positions:
            # Handle words before first n-gram
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                # Calculate entropy for all candidate positions
                candidate_positions = range(0, first_ngram_start)
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                # Select position with highest entropy
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words between n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    candidate_positions = range(end_prev + 1, start_next)
                    entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                                 for pos in candidate_positions]
                    mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words after last n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

        # Map mask indices to original sentence positions and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        return " ".join(masked_words)


    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.

        Returns:
            dict: Masked token indices and their logits.
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, sentences, result_dict, method="random"):
        """
        Process sentences and calculate logits for masked tokens.

        Args:
            sentences (list): List of sentences
            result_dict (dict): Dictionary of common n-grams
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams)
            elif method == "pseudorandom":
                masked_sentence = self.mask_sentence_pseudorandom(sentence, ngrams)
            else:  # entropy
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams)

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results


if __name__ == "__main__":
    # !!! Working both the cases regardless if the stopword is removed or not
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
        # "A speedy brown fox jumps over a lazy dog.",
        # "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        # 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        # 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    processor = MaskingProcessor()
    # results_random = processor.process_sentences(sentences, result_dict)
    results_entropy = processor.process_sentences(sentences, result_dict, method="random")

    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_entropy.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        print('--------------------------------')
        for mask_idx, logits in output["mask_logits"].items():
            print(f"Logits for [MASK] at position {mask_idx}:")
            print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens
            print(f' len(logits) : {len(logits)}')


# -------------------------------------------------------------------------------------------
# def mask_sentence(self, sentence, common_ngrams):
#     """
#     Mask words in the sentence based on the specified rules after removing stopwords.

#     Args:
#         sentence (str): Original sentence.
#         common_ngrams (dict): Common n-grams and their indices.

#     Returns:
#         str: Masked sentence.
#     """
#     original_words = sentence.split()
#     print(f' ---- original_words : {original_words} ----- ')
#     non_stop_words = self.remove_stopwords(original_words)
#     print(f' ---- non_stop_words : {non_stop_words} ----- ')
#     adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
#     print(f' ---- common_ngrams : {common_ngrams} ----- ')
#     print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')

#     mask_indices = []

#     # Extract n-gram positions in non-stop words
#     ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
#     print(f' ---- ngram_positions : {ngram_positions} ----- ')
#     # Mask a word before the first common n-gram
#     if ngram_positions:
#         first_ngram_start = ngram_positions[0][0]
#         print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
#         if first_ngram_start > 0:
#             mask_index_before_ngram = random.randint(0, first_ngram_start-1)
#             print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
#             mask_indices.append(mask_index_before_ngram)

#         # Mask words between common n-grams
#         for i in range(len(ngram_positions) - 1):
#             end_prev = ngram_positions[i][1]
#             print(f' ---- end_prev : {end_prev} ----- ')
#             start_next = ngram_positions[i + 1][0]
#             print(f' ---- start_next : {start_next} ----- ')
#             if start_next > end_prev + 1:
#                 mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
#                 print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
#                 mask_indices.append(mask_index_between_ngrams)

#         # Mask a word after the last common n-gram
#         last_ngram_end = ngram_positions[-1][1]
#         print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
#         if last_ngram_end < len(non_stop_words) - 1:
#             mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
#             print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
#             mask_indices.append(mask_index_after_ngram)

#     # Map mask indices back to original sentence
#     adjusted_indices = [
#         orig for orig, non_stop in enumerate(original_words)
#         if non_stop in mask_indices
#     ]

#     # Apply masks to the original sentence
#     for idx in adjusted_indices:
#         original_words[idx] = self.tokenizer.mask_token

#     return " ".join(original_words)
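Note (added for illustration; not part of the uploaded files): the entropy score used by calculate_word_entropy and mask_sentence_entropy above is the Shannon entropy of the model's distribution over the masked position. A minimal worked sketch, with a made-up five-token distribution standing in for the real vocabulary-sized softmax output:

    import torch

    # Toy probability distribution over a 5-token vocabulary for one [MASK] position.
    probs = torch.tensor([0.50, 0.20, 0.15, 0.10, 0.05])

    # Shannon entropy H = -sum(p * log p); the small epsilon mirrors the
    # log(probs + 1e-9) guard used in the files above to avoid log(0).
    entropy = -torch.sum(probs * torch.log(probs + 1e-9))
    print(entropy.item())  # ~1.33 nats; flatter distributions give higher entropy

Positions whose distribution is flatter (higher entropy, i.e. the model is less certain) are the ones selected for masking.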
utils/old/masking/masking_methods_ok_working.py
ADDED
@@ -0,0 +1,257 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self, ):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def adjust_ngram_indices(self, words, common_ngrams, remove_stopwords):
        """
        Adjust indices of common n-grams after removing stop words.

        Args:
            words (list): List of words in the original sentence.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams and their indices.
        """
        if not remove_stopwords:
            return common_ngrams

        non_stop_word_indices = [i for i, word in enumerate(words) if word.lower() not in self.stop_words]
        adjusted_ngrams = {}

        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = non_stop_word_indices.index(start)
                    new_end = non_stop_word_indices.index(end)
                    adjusted_positions.append((new_start, new_end))
                except ValueError:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)

        mask_indices = []
        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                mask_indices.append(random.randint(0, first_ngram_start - 1))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                if start_next > end_prev + 1:
                    mask_indices.append(random.randint(end_prev + 1, start_next - 1))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_indices.append(random.randint(last_ngram_end + 1, len(non_stop_words) - 1))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            if idx not in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                non_stop_words[idx] = self.tokenizer.mask_token
                original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence with original stop words retained
        """
        words = original_sentence.split()
        non_stop_words = [word for word in words if word.lower() not in self.stop_words] if remove_stopwords else words
        adjusted_ngrams = self.adjust_ngram_indices(words, common_ngrams, remove_stopwords)
        entropy_scores = {}

        for idx, word in enumerate(non_stop_words):
            if idx in [index for ngram_indices in adjusted_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = non_stop_words[:idx] + [self.tokenizer.mask_token] + non_stop_words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if adjusted_ngrams:
            first_ngram_start = list(adjusted_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(adjusted_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(non_stop_words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        original_masked_sentence = words[:]
        for idx in mask_indices:
            non_stop_words[idx] = self.tokenizer.mask_token
            original_masked_sentence[idx] = self.tokenizer.mask_token

        return " ".join(original_masked_sentence)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, method="random", remove_stopwords=False):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams, remove_stopwords)
            elif method == "entropy":
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams, remove_stopwords)
            else:
                raise ValueError("Invalid method. Choose 'random' or 'entropy'.")

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results

# Example usage
if __name__ == "__main__":
    # !!! Working both the cases regardless if the stopword is removed or not
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]

    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
        "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
        "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
    }

    # result_dict = {
    #     "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
    # }

    processor = MaskingProcessor()
    results_random = processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
    # results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)

    for sentence, output in results_random.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
        print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
        print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
        print('--------------------------------')
        for mask_idx, logits in output["mask_logits"].items():
            print(f"Logits for [MASK] at position {mask_idx}:")
            print(f' logits : {logits[:5]}')  # List of logits for all vocabulary tokens

    # print('--------------------------------')
    # for sentence, output in results_entropy.items():
    #     print(f"Original Sentence (Entropy): {sentence}")
    #     print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
    #     # print(f"Mask Logits (Entropy): {output['mask_logits']}")
    #     print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
    #     print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
    #     print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
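Note (added for illustration; not part of the uploaded files): the adjust_ngram_indices step above remaps n-gram spans from original-sentence word positions to positions in the stopword-free word list. A small worked sketch of that remapping, assuming the NLTK stopwords corpus is already downloaded (the files above download it on first use); the sentence and span values are illustrative only:

    from nltk.corpus import stopwords

    stop_words = set(stopwords.words('english'))
    words = "The quick brown fox jumps over the lazy dog".split()

    # Positions of non-stopword tokens in the original sentence.
    non_stop_word_indices = [i for i, w in enumerate(words) if w.lower() not in stop_words]
    # -> [1, 2, 3, 4, 7, 8]  (quick, brown, fox, jumps, lazy, dog)

    # "brown fox" spans original indices (2, 3); in the stopword-free list that becomes (1, 2).
    new_start = non_stop_word_indices.index(2)
    new_end = non_stop_word_indices.index(3)
    print(new_start, new_end)  # 1 2

This is why the example result_dict entries shift, e.g. "quick brown" at (1, 2) in the full sentence versus (0, 1) once stopwords are removed.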
utils/old/masking/masking_methods_v1_working.py
ADDED
@@ -0,0 +1,233 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.stop_words = set(stopwords.words('english'))

    def mask_sentence_random(self, original_sentence, common_ngrams, remove_stopwords=False):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (random selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        if remove_stopwords:
            words = original_sentence.split()
            words = [word for word in words if word not in self.stop_words]
        else:
            words = original_sentence.split()

        mask_indices = []
        # Handle before the first common n-gram
        if common_ngrams:
            first_ngram_start = list(common_ngrams.values())[0][0][0]
            if first_ngram_start > 0:
                mask_indices.append(random.randint(0, first_ngram_start - 1))

            # Handle between common n-grams
            ngram_positions = list(common_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                if start_next > end_prev + 1:
                    mask_indices.append(random.randint(end_prev + 1, start_next - 1))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            if last_ngram_end < len(words) - 1:
                mask_indices.append(random.randint(last_ngram_end + 1, len(words) - 1))

        # Mask the chosen indices
        for idx in mask_indices:
            if idx not in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                words[idx] = self.tokenizer.mask_token

        return " ".join(words)

    def mask_sentence_entropy(self, original_sentence, common_ngrams, remove_stopwords=False):
        """
        Mask one word before the first common n-gram, one between two n-grams,
        and one after the last common n-gram (highest entropy selection).

        Args:
            original_sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        if remove_stopwords:
            words = original_sentence.split()
            words = [word for word in words if word not in self.stop_words]
        else:
            words = original_sentence.split()
        entropy_scores = {}

        for idx, word in enumerate(words):
            if idx in [index for ngram_indices in common_ngrams.values() for start, end in ngram_indices for index in range(start, end + 1)]:
                continue  # Skip words in common n-grams

            masked_sentence = words[:idx] + [self.tokenizer.mask_token] + words[idx + 1:]
            masked_sentence = " ".join(masked_sentence)
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            filtered_logits = logits[0, mask_token_index, :]
            probs = torch.softmax(filtered_logits, dim=-1)
            entropy = -torch.sum(probs * torch.log(probs + 1e-10)).item()  # Add epsilon to prevent log(0)
            entropy_scores[idx] = entropy

        mask_indices = []

        # Handle before the first common n-gram
        if common_ngrams:
            first_ngram_start = list(common_ngrams.values())[0][0][0]
            candidates = [i for i in range(first_ngram_start) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle between common n-grams
            ngram_positions = list(common_ngrams.values())
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][-1][1]
                start_next = ngram_positions[i + 1][0][0]
                candidates = [i for i in range(end_prev + 1, start_next) if i in entropy_scores]
                if candidates:
                    mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

            # Handle after the last common n-gram
            last_ngram_end = ngram_positions[-1][-1][1]
            candidates = [i for i in range(last_ngram_end + 1, len(words)) if i in entropy_scores]
            if candidates:
                mask_indices.append(max(candidates, key=lambda x: entropy_scores[x]))

        # Mask the chosen indices
        for idx in mask_indices:
            words[idx] = self.tokenizer.mask_token

        return " ".join(words)

    def calculate_mask_logits(self, masked_sentence):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens

        Returns:
            dict: Masked token indices and their logits
        """
        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        mask_logits = {idx.item(): logits[0, idx].tolist() for idx in mask_token_index}
        return mask_logits

    def process_sentences(self, original_sentences, result_dict, remove_stopwords=False, method="random"):
        """
        Process a list of sentences and calculate logits for masked tokens using the specified method.

        Args:
            original_sentences (list): List of original sentences
            result_dict (dict): Common n-grams and their indices for each sentence
            method (str): Masking method ("random" or "entropy")

        Returns:
            dict: Masked sentences and their logits for each sentence
        """
        results = {}

        for sentence, ngrams in result_dict.items():
            if method == "random":
                masked_sentence = self.mask_sentence_random(sentence, ngrams)
            elif method == "entropy":
                masked_sentence = self.mask_sentence_entropy(sentence, ngrams)
            else:
                raise ValueError("Invalid method. Choose 'random' or 'entropy'.")

            logits = self.calculate_mask_logits(masked_sentence)
            results[sentence] = {
                "masked_sentence": masked_sentence,
                "mask_logits": logits
            }

        return results

# Example usage
if __name__ == "__main__":
    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # THIS IS WORKING WHEN THE COORDINATES ARE WITHOUT REMOVING STOPWORDS

    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]

    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
        "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
        "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
    }

    # result_dict = {
    #     "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    #     "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
    # }

    processor = MaskingProcessor()
    results_random = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="random")
    results_entropy = processor.process_sentences(sentences, result_dict, remove_stopwords=True, method="entropy")

    for sentence, output in results_random.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {output['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")

    for sentence, output in results_entropy.items():
        print(f"Original Sentence (Entropy): {sentence}")
        print(f"Masked Sentence (Entropy): {output['masked_sentence']}")
        # print(f"Mask Logits (Entropy): {output['mask_logits']}")




'''
result_dict = {
    "The quick brown fox jumps over the lazy dog.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    "A quick brown dog outpaces a lazy fox.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]},
    "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(4, 4)]}
}

'''
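Note (added for illustration; not part of the uploaded files): all of these variants share the same position-selection rule, namely at most one mask before the first common n-gram, one in each gap between consecutive n-grams, and one after the last n-gram. A compact sketch of that rule with hard-coded, made-up n-gram spans (original-word indices), independent of any model or tokenizer:

    import random

    random.seed(0)
    words = "A swift brown fox leaps over the lethargic dog".split()
    ngram_spans = [(2, 3), (8, 8)]  # e.g. "brown fox" and "dog" (illustrative values)

    mask_positions = []
    if ngram_spans[0][0] > 0:                                    # before the first n-gram
        mask_positions.append(random.randint(0, ngram_spans[0][0] - 1))
    for (s, e), (s2, _) in zip(ngram_spans, ngram_spans[1:]):    # between consecutive n-grams
        if s2 > e + 1:
            mask_positions.append(random.randint(e + 1, s2 - 1))
    if ngram_spans[-1][1] < len(words) - 1:                      # after the last n-gram
        mask_positions.append(random.randint(ngram_spans[-1][1] + 1, len(words) - 1))

    masked = [("[MASK]" if i in mask_positions else w) for i, w in enumerate(words)]
    print(" ".join(masked))

The files differ only in how the position within each region is chosen (uniformly at random, with a fixed seed, or by highest masked-token entropy) and in whether indices refer to the full or stopword-free sentence.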
utils/old/masking_methods_final_copy.py
ADDED
@@ -0,0 +1,619 @@
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM
from nltk.corpus import stopwords
import nltk
from transformers import RobertaTokenizer, RobertaForMaskedLM


# Ensure stopwords are downloaded
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class MaskingProcessor:
    # def __init__(self, tokenizer, model):
    def __init__(self):
        # self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        # self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")

        # self.tokenizer = tokenizer
        # self.model = model

        self.tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
        self.model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")

        # self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        # self.model = RobertaForMaskedLM.from_pretrained("roberta-base")

        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, words):
        """
        Remove stopwords from the given list of words.

        Args:
            words (list): List of words.

        Returns:
            list: List of non-stop words.
        """
        return [word for word in words if word.lower() not in self.stop_words]

    def adjust_ngram_indices(self, original_words, common_ngrams):
        """
        Adjust indices of common n-grams after removing stopwords.

        Args:
            original_words (list): Original list of words.
            common_ngrams (dict): Common n-grams and their indices.

        Returns:
            dict: Adjusted common n-grams with updated indices.
        """
        non_stop_words = self.remove_stopwords(original_words)
        original_to_non_stop = []
        non_stop_idx = 0

        for original_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                original_to_non_stop.append((original_idx, non_stop_idx))
                non_stop_idx += 1

        adjusted_ngrams = {}
        for ngram, positions in common_ngrams.items():
            adjusted_positions = []
            for start, end in positions:
                try:
                    new_start = next(non_stop for orig, non_stop in original_to_non_stop if orig == start)
                    new_end = next(non_stop for orig, non_stop in original_to_non_stop if orig == end)
                    adjusted_positions.append((new_start, new_end))
                except StopIteration:
                    continue  # Skip if indices cannot be mapped
            adjusted_ngrams[ngram] = adjusted_positions

        return adjusted_ngrams

    def mask_sentence_random(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        # Split sentence into words
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        print(f' ---- original_words : {original_words} ----- ')

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Rest of the existing function code...
        mask_indices = []
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        if ngram_positions:
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start-1)
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token
            # masked_words[idx] = '<mask>'  # for roberta

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        print(f' ***** masked_words at end : {masked_words} ***** ')
        print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
        print(f' ***** TESTING : {" ".join(masked_words)} ***** ')

        return " ".join(masked_words), original_mask_indices

    def mask_sentence_pseudorandom(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on the specified rules after removing stopwords.
        """
        # Split sentence into words
        random.seed(3)
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        print(f' ---- original_words : {original_words} ----- ')

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Rest of the existing function code...
        mask_indices = []
        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]

        if ngram_positions:
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                mask_index_before_ngram = random.randint(0, first_ngram_start-1)
                mask_indices.append(mask_index_before_ngram)

            # Mask words between common n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
                    mask_indices.append(mask_index_between_ngrams)

            # Mask a word after the last common n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
                mask_indices.append(mask_index_after_ngram)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                non_stop_idx += 1

        # Map mask indices and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token
            # masked_words[idx] = '<mask>'  # for roberta

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        print(f' ***** masked_words at end : {masked_words} ***** ')
        print(f' ***** original_mask_indices : {original_mask_indices} ***** ')
        print(f' ***** TESTING : {" ".join(masked_words)} ***** ')

        return " ".join(masked_words), original_mask_indices


    def calculate_word_entropy(self, sentence, word_position):
        """
        Calculate entropy for a specific word position in the sentence.

        Args:
            sentence (str): The input sentence
            word_position (int): Position of the word to calculate entropy for

        Returns:
            float: Entropy value for the word
        """
        words = sentence.split()
        masked_words = words.copy()
        masked_words[word_position] = self.tokenizer.mask_token
        masked_sentence = " ".join(masked_words)

        input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
        mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits

        # Get probabilities for the masked position
        probs = torch.nn.functional.softmax(logits[0, mask_token_index], dim=-1)
        # Calculate entropy: -sum(p * log(p))
        entropy = -torch.sum(probs * torch.log(probs + 1e-9))

        return entropy.item()

    def mask_sentence_entropy(self, sentence, common_ngrams):
        """
        Mask words in the sentence based on entropy, following n-gram positioning rules.

        Args:
            sentence (str): Original sentence
            common_ngrams (dict): Common n-grams and their indices

        Returns:
            str: Masked sentence
        """
        # Split sentence into words
        original_words = sentence.split()

        # Handle punctuation at the end
        has_punctuation = False
        punctuation = None
        if original_words and any(original_words[-1].endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
            has_punctuation = True
            punctuation = original_words[-1][-1]
            original_words = original_words[:-1]

        # Process words without punctuation
        non_stop_words = self.remove_stopwords(original_words)
        adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)

        # Create mapping from non-stop words to original indices
        non_stop_to_original = {}
        original_to_non_stop = {}
        non_stop_idx = 0
        for orig_idx, word in enumerate(original_words):
            if word.lower() not in self.stop_words:
                non_stop_to_original[non_stop_idx] = orig_idx
                original_to_non_stop[orig_idx] = non_stop_idx
                non_stop_idx += 1

        ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
        mask_indices = []

        if ngram_positions:
            # Handle words before first n-gram
            first_ngram_start = ngram_positions[0][0]
            if first_ngram_start > 0:
                candidate_positions = range(0, first_ngram_start)
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words between n-grams
            for i in range(len(ngram_positions) - 1):
                end_prev = ngram_positions[i][1]
                start_next = ngram_positions[i + 1][0]
                if start_next > end_prev + 1:
                    candidate_positions = range(end_prev + 1, start_next)
                    entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                                 for pos in candidate_positions]
                    mask_indices.append(max(entropies, key=lambda x: x[1])[0])

            # Handle words after last n-gram
            last_ngram_end = ngram_positions[-1][1]
            if last_ngram_end < len(non_stop_words) - 1:
                candidate_positions = range(last_ngram_end + 1, len(non_stop_words))
                entropies = [(pos, self.calculate_word_entropy(sentence, non_stop_to_original[pos]))
                             for pos in candidate_positions]
                mask_indices.append(max(entropies, key=lambda x: x[1])[0])

        # Map mask indices to original sentence positions and apply masks
        original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
        masked_words = original_words.copy()
        for idx in original_mask_indices:
            masked_words[idx] = self.tokenizer.mask_token

        # Add back punctuation if it existed
        if has_punctuation:
            masked_words.append(punctuation)

        return " ".join(masked_words), original_mask_indices

    def calculate_mask_logits(self, original_sentence, original_mask_indices):
        """
        Calculate logits for masked tokens in the sentence using BERT.

        Args:
            original_sentence (str): Original sentence without masks
            original_mask_indices (list): List of indices to mask

        Returns:
            dict: Masked token indices and their logits
        """
        print('==========================================================================================================')
        words = original_sentence.split()
        print(f' ##### calculate_mask_logits >> words : {words} ##### ')
        mask_logits = {}

        for idx in original_mask_indices:
            # Create a copy of words and mask the current position
            print(f' ---- idx : {idx} ----- ')
            masked_words = words.copy()
            masked_words[idx] = '[MASK]'
            # masked_words[idx] = '<mask>'  # for roberta
            masked_sentence = " ".join(masked_words)
            print(f' ---- masked_sentence : {masked_sentence} ----- ')

            # Calculate logits for the current mask
            input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
            mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            # Extract logits for the masked position
            mask_logits_tensor = logits[0, mask_token_index, :]

            # Get top logits and corresponding tokens
            top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 100, dim=-1)  # Get more candidates

            # Convert token IDs to words and filter out subword tokens
# Convert token IDs to words and filter out subword tokens
|
| 368 |
+
top_tokens = []
|
| 369 |
+
top_logits = []
|
| 370 |
+
seen_words = set() # To keep track of unique words
|
| 371 |
+
|
| 372 |
+
for token_id, logit in zip(top_mask_indices[0], top_mask_logits[0]):
|
| 373 |
+
token = self.tokenizer.convert_ids_to_tokens(token_id.item())
|
| 374 |
+
|
| 375 |
+
# Skip if it's a subword token (starts with ##)
|
| 376 |
+
if token.startswith('##'):
|
| 377 |
+
continue
|
| 378 |
+
|
| 379 |
+
# Convert token to proper word
|
| 380 |
+
word = self.tokenizer.convert_tokens_to_string([token]).strip()
|
| 381 |
+
|
| 382 |
+
# Only add if it's a new word and not empty
|
| 383 |
+
if word and word not in seen_words:
|
| 384 |
+
seen_words.add(word)
|
| 385 |
+
top_tokens.append(word)
|
| 386 |
+
top_logits.append(logit.item())
|
| 387 |
+
|
| 388 |
+
# Break if we have 50 unique complete words
|
| 389 |
+
if len(top_tokens) == 50:
|
| 390 |
+
break
|
| 391 |
+
|
| 392 |
+
# print(f' ---- top_tokens : {top_tokens} ----- ')
|
| 393 |
+
|
| 394 |
+
# Store results
|
| 395 |
+
mask_logits[idx] = {
|
| 396 |
+
"tokens": top_tokens,
|
| 397 |
+
"logits": top_logits
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
return mask_logits
|
| 401 |
+
|
| 402 |
+
# def calculate_mask_logits(self, original_sentence, original_mask_indices):
|
| 403 |
+
# """
|
| 404 |
+
# Calculate logits for masked tokens in the sentence using BERT.
|
| 405 |
+
|
| 406 |
+
# Args:
|
| 407 |
+
# original_sentence (str): Original sentence without masks
|
| 408 |
+
# original_mask_indices (list): List of indices to mask
|
| 409 |
+
|
| 410 |
+
# Returns:
|
| 411 |
+
# dict: Masked token indices and their logits
|
| 412 |
+
# """
|
| 413 |
+
# words = original_sentence.split()
|
| 414 |
+
# print(f' ##### calculate_mask_logits >> words : {words} ##### ')
|
| 415 |
+
# mask_logits = {}
|
| 416 |
+
|
| 417 |
+
# for idx in original_mask_indices:
|
| 418 |
+
# # Create a copy of words and mask the current position
|
| 419 |
+
# print(f' ---- idx : {idx} ----- ')
|
| 420 |
+
# masked_words = words.copy()
|
| 421 |
+
# print(f' ---- words : {masked_words} ----- ')
|
| 422 |
+
# # masked_words[idx] = self.tokenizer.mask_token
|
| 423 |
+
# masked_words[idx] = '[MASK]'
|
| 424 |
+
# print(f' ---- masked_words : {masked_words} ----- ')
|
| 425 |
+
# masked_sentence = " ".join(masked_words)
|
| 426 |
+
# print(f' ---- masked_sentence : {masked_sentence} ----- ')
|
| 427 |
+
|
| 428 |
+
# # Calculate logits for the current mask
|
| 429 |
+
# input_ids = self.tokenizer(masked_sentence, return_tensors="pt")["input_ids"]
|
| 430 |
+
# mask_token_index = torch.where(input_ids == self.tokenizer.mask_token_id)[1]
|
| 431 |
+
|
| 432 |
+
# with torch.no_grad():
|
| 433 |
+
# outputs = self.model(input_ids)
|
| 434 |
+
# logits = outputs.logits
|
| 435 |
+
|
| 436 |
+
# # Extract logits for the masked position
|
| 437 |
+
# mask_logits_tensor = logits[0, mask_token_index, :]
|
| 438 |
+
|
| 439 |
+
# # Get top 50 logits and corresponding tokens
|
| 440 |
+
# top_mask_logits, top_mask_indices = torch.topk(mask_logits_tensor, 50, dim=-1)
|
| 441 |
+
|
| 442 |
+
# # Convert token IDs to words
|
| 443 |
+
# top_tokens = [self.tokenizer.convert_ids_to_tokens(token_id.item()) for token_id in top_mask_indices[0]]
|
| 444 |
+
# print(f' ---- top_tokens : {top_tokens} ----- ')
|
| 445 |
+
|
| 446 |
+
# # Store results
|
| 447 |
+
# mask_logits[idx] = {
|
| 448 |
+
# "tokens": top_tokens,
|
| 449 |
+
# "logits": top_mask_logits.tolist()
|
| 450 |
+
# }
|
| 451 |
+
|
| 452 |
+
# return mask_logits
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def process_sentences(self, sentences, result_dict, method="random"):
|
| 456 |
+
"""
|
| 457 |
+
Process sentences and calculate logits for masked tokens.
|
| 458 |
+
"""
|
| 459 |
+
results = {}
|
| 460 |
+
|
| 461 |
+
for sentence, ngrams in result_dict.items():
|
| 462 |
+
# Split punctuation from the last word before processing
|
| 463 |
+
words = sentence.split()
|
| 464 |
+
last_word = words[-1]
|
| 465 |
+
if any(last_word.endswith(p) for p in ['.', ',', '!', '?', ';', ':']):
|
| 466 |
+
# Split the last word and punctuation
|
| 467 |
+
words[-1] = last_word[:-1]
|
| 468 |
+
punctuation = last_word[-1]
|
| 469 |
+
# Rejoin with space before punctuation to treat it as separate token
|
| 470 |
+
processed_sentence = " ".join(words) + " " + punctuation
|
| 471 |
+
else:
|
| 472 |
+
processed_sentence = sentence
|
| 473 |
+
|
| 474 |
+
if method == "random":
|
| 475 |
+
masked_sentence, original_mask_indices = self.mask_sentence_random(processed_sentence, ngrams)
|
| 476 |
+
elif method == "pseudorandom":
|
| 477 |
+
masked_sentence, original_mask_indices = self.mask_sentence_pseudorandom(processed_sentence, ngrams)
|
| 478 |
+
else: # entropy
|
| 479 |
+
masked_sentence, original_mask_indices = self.mask_sentence_entropy(processed_sentence, ngrams)
|
| 480 |
+
|
| 481 |
+
logits = self.calculate_mask_logits(processed_sentence, original_mask_indices)
|
| 482 |
+
results[sentence] = {
|
| 483 |
+
"masked_sentence": masked_sentence,
|
| 484 |
+
"mask_logits": logits
|
| 485 |
+
}
|
| 486 |
+
|
| 487 |
+
return results
|
| 488 |
+
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
if __name__ == "__main__":
|
| 492 |
+
# !!! Working both the cases regardless if the stopword is removed or not
|
| 493 |
+
sentences = [
|
| 494 |
+
"The quick brown fox jumps over small cat the lazy dog everyday again and again .",
|
| 495 |
+
# "A speedy brown fox jumps over a lazy dog.",
|
| 496 |
+
# "A swift brown fox leaps over the lethargic dog."
|
| 497 |
+
|
| 498 |
+
]
|
| 499 |
+
result_dict ={
|
| 500 |
+
'The quick brown fox jumps over small cat the lazy dog everyday again and again .': {'brown fox': [(2, 3)],'cat': [(7, 7)], 'dog': [(10, 10)]},
|
| 501 |
+
# 'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
|
| 502 |
+
# 'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
|
| 503 |
+
}
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
processor = MaskingProcessor()
|
| 507 |
+
# results_random = processor.process_sentences(sentences, result_dict)
|
| 508 |
+
results_entropy = processor.process_sentences(sentences, result_dict, method="random")
|
| 509 |
+
|
| 510 |
+
'''
|
| 511 |
+
results structure :
|
| 512 |
+
results = {
|
| 513 |
+
"The quick brown fox jumps over the lazy dog everyday.":
|
| 514 |
+
{ # Original sentence as key
|
| 515 |
+
"masked_sentence": str, # The sentence with [MASK] tokens
|
| 516 |
+
"mask_logits":
|
| 517 |
+
{ # Dictionary of mask positions and their predictions
|
| 518 |
+
1:
|
| 519 |
+
{ # Position of mask in sentence
|
| 520 |
+
"tokens" (words) : list, # List of top 50 predicted tokens
|
| 521 |
+
"logits" (probabilities) : list # Corresponding logits for those tokens
|
| 522 |
+
},
|
| 523 |
+
7:
|
| 524 |
+
{
|
| 525 |
+
"tokens" (words) : list,
|
| 526 |
+
"logits" (probabilities) : list
|
| 527 |
+
},
|
| 528 |
+
10:
|
| 529 |
+
{
|
| 530 |
+
"tokens (words)": list,
|
| 531 |
+
"logits (probabilities)": list
|
| 532 |
+
}
|
| 533 |
+
}
|
| 534 |
+
}
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
'''
|
| 538 |
+
# results_entropy = processor.process_sentences(sentences, result_dict, method="entropy", remove_stopwords=False)
|
| 539 |
+
|
| 540 |
+
for sentence, output in results_entropy.items():
|
| 541 |
+
print(f"Original Sentence (Random): {sentence}")
|
| 542 |
+
print(f"Masked Sentence (Random): {output['masked_sentence']}")
|
| 543 |
+
# print(f"Mask Logits (Random): {output['mask_logits']}")
|
| 544 |
+
# print(f' type(output["mask_logits"]) : {type(output["mask_logits"])}')
|
| 545 |
+
# print(f' length of output["mask_logits"] : {len(output["mask_logits"])}')
|
| 546 |
+
# print(f' output["mask_logits"].keys() : {output["mask_logits"].keys()}')
|
| 547 |
+
# print('--------------------------------')
|
| 548 |
+
# for mask_idx, logits in output["mask_logits"].items():
|
| 549 |
+
# print(f"Logits for [MASK] at position {mask_idx}:")
|
| 550 |
+
# print(f' logits : {logits[:5]}') # List of logits for all vocabulary tokens
|
| 551 |
+
# print(f' len(logits) : {len(logits)}')
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
# ------------------------------------------------------------------------------------------------
|
| 555 |
+
# def mask_sentence_random(self, sentence, common_ngrams):
|
| 556 |
+
# """
|
| 557 |
+
# Mask words in the sentence based on the specified rules after removing stopwords.
|
| 558 |
+
# """
|
| 559 |
+
# original_words = sentence.split()
|
| 560 |
+
# # print(f' ---- original_words : {original_words} ----- ')
|
| 561 |
+
# non_stop_words = self.remove_stopwords(original_words)
|
| 562 |
+
# # print(f' ---- non_stop_words : {non_stop_words} ----- ')
|
| 563 |
+
# adjusted_ngrams = self.adjust_ngram_indices(original_words, common_ngrams)
|
| 564 |
+
# # print(f' ---- common_ngrams : {common_ngrams} ----- ')
|
| 565 |
+
# # print(f' ---- adjusted_ngrams : {adjusted_ngrams} ----- ')
|
| 566 |
+
|
| 567 |
+
# mask_indices = []
|
| 568 |
+
|
| 569 |
+
# # Extract n-gram positions in non-stop words
|
| 570 |
+
# ngram_positions = [pos for positions in adjusted_ngrams.values() for pos in positions]
|
| 571 |
+
|
| 572 |
+
# # Mask a word before the first common n-gram
|
| 573 |
+
# if ngram_positions:
|
| 574 |
+
# # print(f' ---- ngram_positions : {ngram_positions} ----- ')
|
| 575 |
+
# first_ngram_start = ngram_positions[0][0]
|
| 576 |
+
# # print(f' ---- first_ngram_start : {first_ngram_start} ----- ')
|
| 577 |
+
# if first_ngram_start > 0:
|
| 578 |
+
# mask_index_before_ngram = random.randint(0, first_ngram_start-1)
|
| 579 |
+
# # print(f' ---- mask_index_before_ngram : {mask_index_before_ngram} ----- ')
|
| 580 |
+
# mask_indices.append(mask_index_before_ngram)
|
| 581 |
+
|
| 582 |
+
# # Mask words between common n-grams
|
| 583 |
+
# for i in range(len(ngram_positions) - 1):
|
| 584 |
+
# end_prev = ngram_positions[i][1]
|
| 585 |
+
# # print(f' ---- end_prev : {end_prev} ----- ')
|
| 586 |
+
# start_next = ngram_positions[i + 1][0]
|
| 587 |
+
# # print(f' ---- start_next : {start_next} ----- ')
|
| 588 |
+
# if start_next > end_prev + 1:
|
| 589 |
+
# mask_index_between_ngrams = random.randint(end_prev + 1, start_next - 1)
|
| 590 |
+
# # print(f' ---- mask_index_between_ngrams : {mask_index_between_ngrams} ----- ')
|
| 591 |
+
# mask_indices.append(mask_index_between_ngrams)
|
| 592 |
+
|
| 593 |
+
# # Mask a word after the last common n-gram
|
| 594 |
+
# last_ngram_end = ngram_positions[-1][1]
|
| 595 |
+
# if last_ngram_end < len(non_stop_words) - 1:
|
| 596 |
+
# # print(f' ---- last_ngram_end : {last_ngram_end} ----- ')
|
| 597 |
+
# mask_index_after_ngram = random.randint(last_ngram_end + 1, len(non_stop_words) - 1)
|
| 598 |
+
# # print(f' ---- mask_index_after_ngram : {mask_index_after_ngram} ----- ')
|
| 599 |
+
# mask_indices.append(mask_index_after_ngram)
|
| 600 |
+
|
| 601 |
+
# # Create mapping from non-stop words to original indices
|
| 602 |
+
# non_stop_to_original = {}
|
| 603 |
+
# non_stop_idx = 0
|
| 604 |
+
# for orig_idx, word in enumerate(original_words):
|
| 605 |
+
# if word.lower() not in self.stop_words:
|
| 606 |
+
# non_stop_to_original[non_stop_idx] = orig_idx
|
| 607 |
+
# non_stop_idx += 1
|
| 608 |
+
|
| 609 |
+
# # Map mask indices from non-stop word positions to original positions
|
| 610 |
+
# # print(f' ---- non_stop_to_original : {non_stop_to_original} ----- ')
|
| 611 |
+
# original_mask_indices = [non_stop_to_original[idx] for idx in mask_indices]
|
| 612 |
+
# # print(f' ---- original_mask_indices : {original_mask_indices} ----- ')
|
| 613 |
+
|
| 614 |
+
# # Apply masks to the original sentence
|
| 615 |
+
# masked_words = original_words.copy()
|
| 616 |
+
# for idx in original_mask_indices:
|
| 617 |
+
# masked_words[idx] = self.tokenizer.mask_token
|
| 618 |
+
|
| 619 |
+
# return " ".join(masked_words), original_mask_indices
|
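Note on the entropy criterion used above (a minimal illustrative sketch, not part of the uploaded file): the score is the Shannon entropy of the model's softmax distribution at the masked position, entropy = -sum(p * log(p)), so flatter (more uncertain) positions are preferred for masking.

import torch

def entropy_from_logits(logits: torch.Tensor) -> float:
    # Softmax over the candidate distribution, then Shannon entropy with a small
    # epsilon for numerical stability, mirroring the formula used above.
    probs = torch.softmax(logits, dim=-1)
    return float(-(probs * torch.log(probs + 1e-9)).sum())

# A flat distribution scores higher than a peaked one:
print(entropy_from_logits(torch.tensor([0.0, 0.0, 0.0])))   # ~1.10 (uniform over 3 options)
print(entropy_from_logits(torch.tensor([10.0, 0.0, 0.0])))  # ~0.00 (near-certain prediction)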
utils/old/non_melting_points_v1.py
ADDED
|
@@ -0,0 +1,244 @@
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re

class NgramProcessor:
    def __init__(self):
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')

        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's stopword list

        Args:
            text (str): Input text

        Returns:
            str: Cleaned text with stopwords removed
        """
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def is_exact_match(self, ngram, sentences):
        """
        Check if the given n-gram has an exact match in all sentences

        Args:
            ngram (str): The n-gram to search for
            sentences (list): List of sentences to search in

        Returns:
            bool: True if n-gram has exact match in all sentences, False otherwise
        """
        return all(ngram in sentence for sentence in sentences)

    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is an exact substring of any previously found common n-grams

        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams

        Returns:
            bool: True if ngram is a substring of any common_ngrams, False otherwise
        """
        return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)

    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams that have exact matches across all sentences,
        excluding those that are part of larger common n-grams

        Args:
            sentences (list): List of sentences to analyze

        Returns:
            list: List of tuples where each tuple contains the n-gram and its indices in each sentence
        """
        original_sentences = sentences[:]
        sentences = [self.remove_stopwords(sentence) for sentence in sentences]
        ngram_lengths = [4, 3, 2, 1]  # Quadgram, trigram, bigram, unigram
        common_ngrams = []

        for n in ngram_lengths:
            ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
            ngrams_counter = Counter(ngrams_list[0])

            for ngram in ngrams_counter:
                ngram_str = ' '.join(ngram)
                if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, [ng[0] for ng in common_ngrams]):
                    indices = []
                    for original_sentence in original_sentences:
                        words = original_sentence.split()
                        ngram_indices = [
                            (i, i + n - 1) for i in range(len(words) - n + 1)
                            if ' '.join(words[i:i + n]).lower() == ngram_str
                        ]
                        indices.append(ngram_indices)
                    common_ngrams.append((ngram_str, indices))

        return common_ngrams

    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams in the sentence

        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of common n-grams

        Returns:
            list: List of tuples with the relative position and the n-gram
        """
        relative_order = []
        for ngram, _ in common_ngrams:
            index = sentence.find(ngram)
            if index != -1:
                relative_order.append((index, ngram))

        return sorted(relative_order)

# Example usage
if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles."
    ]

    processor = NgramProcessor()
    common_ngrams = processor.find_filtered_ngrams(sentences)
    print("Common n-grams and their indices:")
    for ngram, indices in common_ngrams:
        print(f"{ngram}: {indices}")

    for sentence in sentences:
        relative_order = processor.find_relative_order(sentence, common_ngrams)
        print(f"Relative order in sentence '{sentence}':", relative_order)

# import nltk
|
| 131 |
+
# from nltk.corpus import stopwords
|
| 132 |
+
# from nltk.util import ngrams
|
| 133 |
+
# from collections import Counter
|
| 134 |
+
# import re
|
| 135 |
+
|
| 136 |
+
# class NgramProcessor:
|
| 137 |
+
# def __init__(self):
|
| 138 |
+
# try:
|
| 139 |
+
# nltk.data.find('corpora/stopwords')
|
| 140 |
+
# except LookupError:
|
| 141 |
+
# nltk.download('stopwords')
|
| 142 |
+
|
| 143 |
+
# self.stop_words = set(stopwords.words('english'))
|
| 144 |
+
|
| 145 |
+
# def remove_stopwords(self, text):
|
| 146 |
+
# """
|
| 147 |
+
# Remove stopwords using NLTK's stopword list
|
| 148 |
+
|
| 149 |
+
# Args:
|
| 150 |
+
# text (str): Input text
|
| 151 |
+
|
| 152 |
+
# Returns:
|
| 153 |
+
# str: Cleaned text with stopwords removed
|
| 154 |
+
# """
|
| 155 |
+
# words = re.findall(r'\w+', text.lower())
|
| 156 |
+
# filtered_words = [word for word in words if word not in self.stop_words]
|
| 157 |
+
# return ' '.join(filtered_words)
|
| 158 |
+
|
| 159 |
+
# def is_exact_match(self, ngram, sentences):
|
| 160 |
+
# """
|
| 161 |
+
# Check if the given n-gram has an exact match in all sentences
|
| 162 |
+
|
| 163 |
+
# Args:
|
| 164 |
+
# ngram (str): The n-gram to search for
|
| 165 |
+
# sentences (list): List of sentences to search in
|
| 166 |
+
|
| 167 |
+
# Returns:
|
| 168 |
+
# bool: True if n-gram has exact match in all sentences, False otherwise
|
| 169 |
+
# """
|
| 170 |
+
# return all(ngram in sentence for sentence in sentences)
|
| 171 |
+
|
| 172 |
+
# def is_substring_of_any(self, ngram, common_ngrams):
|
| 173 |
+
# """
|
| 174 |
+
# Check if the given n-gram is an exact substring of any previously found common n-grams
|
| 175 |
+
|
| 176 |
+
# Args:
|
| 177 |
+
# ngram (str): The n-gram to check
|
| 178 |
+
# common_ngrams (list): List of previously found common n-grams
|
| 179 |
+
|
| 180 |
+
# Returns:
|
| 181 |
+
# bool: True if ngram is a substring of any common_ngrams, False otherwise
|
| 182 |
+
# """
|
| 183 |
+
# return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)
|
| 184 |
+
|
| 185 |
+
# def find_filtered_ngrams(self, sentences):
|
| 186 |
+
# """
|
| 187 |
+
# Find all n-grams that have exact matches across all sentences,
|
| 188 |
+
# excluding those that are part of larger common n-grams
|
| 189 |
+
|
| 190 |
+
# Args:
|
| 191 |
+
# sentences (list): List of sentences to analyze
|
| 192 |
+
|
| 193 |
+
# Returns:
|
| 194 |
+
# list: List of all common n-grams in order of their appearance in the first sentence
|
| 195 |
+
# """
|
| 196 |
+
# sentences = [self.remove_stopwords(sentence) for sentence in sentences]
|
| 197 |
+
# ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram
|
| 198 |
+
# common_ngrams = []
|
| 199 |
+
|
| 200 |
+
# for n in ngram_lengths:
|
| 201 |
+
# ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
|
| 202 |
+
# ngrams_counter = Counter(ngrams_list[0])
|
| 203 |
+
|
| 204 |
+
# for ngram in ngrams_counter:
|
| 205 |
+
# ngram_str = ' '.join(ngram)
|
| 206 |
+
# if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams):
|
| 207 |
+
# common_ngrams.append(ngram_str)
|
| 208 |
+
|
| 209 |
+
# return common_ngrams
|
| 210 |
+
|
| 211 |
+
# def find_relative_order(self, sentence, common_ngrams):
|
| 212 |
+
# """
|
| 213 |
+
# Find the relative order of the common n-grams in the sentence
|
| 214 |
+
|
| 215 |
+
# Args:
|
| 216 |
+
# sentence (str): Sentence in which to find the relative order
|
| 217 |
+
# common_ngrams (list): List of common n-grams
|
| 218 |
+
|
| 219 |
+
# Returns:
|
| 220 |
+
# list: List of tuples with the relative position and the n-gram
|
| 221 |
+
# """
|
| 222 |
+
# relative_order = []
|
| 223 |
+
# for ngram in common_ngrams:
|
| 224 |
+
# index = sentence.find(ngram)
|
| 225 |
+
# if index != -1:
|
| 226 |
+
# relative_order.append((index, ngram))
|
| 227 |
+
|
| 228 |
+
# return sorted(relative_order)
|
| 229 |
+
|
| 230 |
+
# # Example usage
|
| 231 |
+
# if __name__ == "__main__":
|
| 232 |
+
# sentences = [
|
| 233 |
+
# "The quick brown fox jumps over the lazy dog.",
|
| 234 |
+
# "A quick brown dog outpaces a lazy fox.",
|
| 235 |
+
# "Quick brown animals leap over lazy obstacles."
|
| 236 |
+
# ]
|
| 237 |
+
|
| 238 |
+
# processor = NgramProcessor()
|
| 239 |
+
# common_ngrams = processor.find_filtered_ngrams(sentences)
|
| 240 |
+
# print("Common n-grams:", common_ngrams)
|
| 241 |
+
|
| 242 |
+
# for sentence in sentences:
|
| 243 |
+
# relative_order = processor.find_relative_order(sentence, common_ngrams)
|
| 244 |
+
# print(f"Relative order in sentence '{sentence}':", relative_order)
|
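Note on the longest-match-first filter above (a minimal illustrative sketch, not part of the uploaded file): because n-gram lengths are scanned from 4 down to 1, a shorter n-gram is kept only if it is not contained in a longer common n-gram that was already accepted.

def keep_ngram(candidate: str, accepted: list) -> bool:
    # Same idea as is_substring_of_any, inverted: reject candidates that are
    # substrings of an already-accepted longer n-gram.
    return not any(candidate in longer for longer in accepted if candidate != longer)

accepted = ["quick brown"]
print(keep_ngram("brown", accepted))  # False: already covered by "quick brown"
print(keep_ngram("lazy", accepted))   # True: not part of any accepted n-gram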
utils/old/sampling/sampling.py
ADDED
|
@@ -0,0 +1,330 @@
import torch
import random
from masking_methods import MaskingProcessor
import nltk
from nltk.corpus import words
import torch.nn.functional as F


class SamplingProcessor:
    def __init__(self, tokenizer):
        """
        Initialize the SamplingProcessor.

        Args:
            tokenizer: BERT tokenizer instance
        """
        self.tokenizer = tokenizer
        self.subtoken_prefix = self._get_subtoken_prefix()
        self.subtoken_ids = self._get_subtoken_ids()
        try:
            nltk.data.find('corpora/words')
        except LookupError:
            nltk.download('words')
        self.english_words = set(words.words())

    # def _get_subtoken_prefix(self):
    #     """
    #     Identify the subtoken prefix based on the tokenizer.
    #
    #     Returns:
    #         str: The prefix used for subtokens (e.g., "##" for BERT).
    #     """
    #     # This method assumes that the tokenizer uses a consistent subtoken prefix.
    #     # Adjust accordingly if using different tokenizers.
    #     # For BERT's WordPiece tokenizer:
    #     if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs:
    #         return self.tokenizer.init_kwargs["wordpiece_prefix"]
    #     elif hasattr(self.tokenizer, "prefix_tokens"):
    #         return self.tokenizer.prefix_tokens
    #     else:
    #         # Default to BERT's subtoken prefix
    #         return "##"

    def _get_subtoken_prefix(self):
        """
        Identify the subtoken prefix based on the tokenizer.

        Returns:
            str: The prefix used for subtokens (e.g., "##" for BERT).
        """
        # This method assumes that the tokenizer uses a consistent subtoken prefix.
        # Adjust accordingly if using different tokenizers.
        # For BERT's WordPiece tokenizer:
        if hasattr(self.tokenizer, "init_kwargs") and "wordpiece_prefix" in self.tokenizer.init_kwargs:
            return self.tokenizer.init_kwargs["wordpiece_prefix"]
        elif hasattr(self.tokenizer, "prefix_tokens"):
            return self.tokenizer.prefix_tokens
        else:
            # Default to BERT's subtoken prefix
            return "##"

    # def _get_subtoken_ids(self):
    #     """
    #     Retrieve all token IDs that correspond to subtokens.
    #
    #     Returns:
    #         set: A set of subtoken IDs.
    #     """
    #     vocab = self.tokenizer.get_vocab()
    #     subtoken_ids = set()
    #     for token, idx in vocab.items():
    #         if token.startswith(self.subtoken_prefix):
    #             subtoken_ids.add(idx)
    #     return subtoken_ids

    def _get_subtoken_ids(self):
        """
        Retrieve all token IDs that correspond to subtokens.

        Returns:
            list: A list of subtoken IDs.
        """
        vocab = self.tokenizer.get_vocab()
        subtoken_ids = []
        for token, idx in vocab.items():
            if token.startswith(self.subtoken_prefix):
                subtoken_ids.append(idx)
        return subtoken_ids  # Changed from set to list


    def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
        tokens = self.tokenizer.tokenize(masked_sentence)

        for mask_pos in sorted(mask_logits_dict.keys()):
            try:
                # Get logits and squeeze extra dimension
                mask_logits = torch.tensor(mask_logits_dict[mask_pos]).squeeze(0)  # Remove the extra dimension

                # Create a mask for valid tokens (no special tokens, no subwords)
                valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
                for idx in range(len(mask_logits)):
                    token = self.tokenizer.convert_ids_to_tokens([idx])[0]
                    # Only allow regular words (no special tokens, no subwords)
                    if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
                        valid_mask[idx] = True

                # Get valid logits
                valid_logits = mask_logits[valid_mask]
                valid_indices = torch.where(valid_mask)[0]

                if len(valid_logits) == 0:
                    print(f"Warning: No valid tokens found for position {mask_pos}")
                    continue

                if sampling_technique == "inverse_transform":
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    cumulative_probs = torch.cumsum(probs, dim=-1)
                    random_prob = random.random()
                    sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == "exponential_minimum":
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    exp_probs = torch.exp(-torch.log(probs))
                    random_probs = torch.rand_like(exp_probs)
                    sampled_idx = torch.argmax(random_probs * exp_probs).item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == "temperature":
                    valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
                    probs = torch.softmax(valid_logits / temperature, dim=-1)
                    if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                        raise ValueError("The computed probabilities contain NaN or inf values.")
                    probs = torch.max(probs, torch.tensor(1e-8))
                    probs = probs / torch.sum(probs)
                    sampled_idx = torch.multinomial(probs, 1)[0].item()
                    sampled_index = valid_indices[sampled_idx].item()

                elif sampling_technique == 'greedy':
                    sampled_idx = torch.argmax(valid_logits).item()
                    sampled_index = valid_indices[sampled_idx].item()

                # Replace mask with sampled token
                sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
                tokens[mask_pos] = sampled_token

            except Exception as e:
                print(f"Error sampling for position {mask_pos}: {str(e)}")
                continue

        return self.tokenizer.convert_tokens_to_string(tokens)


    def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
        """
        Process all masked sentences in the results dictionary.

        Args:
            results_dict (dict): Dictionary containing masked sentences and their logits
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            dict: Dictionary containing original, masked, and sampled sentences
        """
        processed_results = {}

        for original_sentence, data in results_dict.items():
            masked_sentence = data["masked_sentence"]
            mask_logits = data["mask_logits"]

            sampled_sentence = self.sample_tokens(
                mask_logits,
                masked_sentence,
                sampling_technique,
                temperature
            )

            processed_results[original_sentence] = {
                "masked_sentence": masked_sentence,
                "sampled_sentence": sampled_sentence
            }

        return processed_results

if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
    }

    # First, mask the sentences
    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict)

    # Then, sample replacements for the masks
    sampling_processor = SamplingProcessor(masking_processor.tokenizer)

    # Try different sampling techniques
    sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]

    for technique in sampling_techniques:
        print(f"\nSampling using {technique}:")
        sampled_results = sampling_processor.process_masked_sentences(
            masking_results,
            sampling_technique=technique,
            temperature=1.0
        )

        for original_sentence, result in sampled_results.items():
            print(f"Original: {original_sentence}")
            print(f"Masked: {result['masked_sentence']}")
            print(f"Sampled: {result['sampled_sentence']}")
            print("---")

# def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
|
| 222 |
+
# words = masked_sentence.split()
|
| 223 |
+
# mask_positions = sorted(mask_logits_dict.keys())
|
| 224 |
+
|
| 225 |
+
# for mask_pos in mask_positions:
|
| 226 |
+
# mask_logits = torch.tensor(mask_logits_dict[mask_pos])
|
| 227 |
+
|
| 228 |
+
# try:
|
| 229 |
+
# if sampling_technique == "inverse_transform":
|
| 230 |
+
# probs = torch.softmax(mask_logits / temperature, dim=-1)
|
| 231 |
+
# cumulative_probs = torch.cumsum(probs, dim=-1)
|
| 232 |
+
# random_prob = random.random()
|
| 233 |
+
# sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
|
| 234 |
+
|
| 235 |
+
# elif sampling_technique == "exponential_minimum":
|
| 236 |
+
# probs = torch.softmax(mask_logits / temperature, dim=-1)
|
| 237 |
+
# exp_probs = torch.exp(-torch.log(probs))
|
| 238 |
+
# random_probs = torch.rand_like(exp_probs)
|
| 239 |
+
# sampled_index = torch.argmax(random_probs * exp_probs).item()
|
| 240 |
+
|
| 241 |
+
# elif sampling_technique == "temperature":
|
| 242 |
+
# mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
|
| 243 |
+
# probs = torch.softmax(mask_logits / temperature, dim=-1)
|
| 244 |
+
# if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
|
| 245 |
+
# raise ValueError("The computed probabilities contain NaN or inf values.")
|
| 246 |
+
# probs = torch.max(probs, torch.tensor(1e-8))
|
| 247 |
+
# probs = probs / torch.sum(probs)
|
| 248 |
+
# sampled_index = torch.multinomial(probs, 1)[0].item()
|
| 249 |
+
|
| 250 |
+
# elif sampling_technique == 'greedy':
|
| 251 |
+
# sampled_index = torch.argmax(mask_logits).item()
|
| 252 |
+
|
| 253 |
+
# else:
|
| 254 |
+
# raise ValueError(f"Unknown sampling technique: {sampling_technique}")
|
| 255 |
+
|
| 256 |
+
# # Replace mask with sampled token
|
| 257 |
+
# sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
|
| 258 |
+
# words[mask_pos] = sampled_token
|
| 259 |
+
|
| 260 |
+
# except Exception as e:
|
| 261 |
+
# print(f"Error sampling for position {mask_pos}: {str(e)}")
|
| 262 |
+
# continue
|
| 263 |
+
|
| 264 |
+
# return " ".join(words)
|
| 265 |
+
|
| 266 |
+
## MORE WEIRD RESULTS
|
| 267 |
+
# def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0, top_k=100):
|
| 268 |
+
# words = masked_sentence.split()
|
| 269 |
+
# mask_positions = sorted(mask_logits_dict.keys())
|
| 270 |
+
|
| 271 |
+
# for mask_pos in mask_positions:
|
| 272 |
+
# mask_logits = torch.tensor(mask_logits_dict[mask_pos])
|
| 273 |
+
|
| 274 |
+
# try:
|
| 275 |
+
# # Create a mask for valid tokens (no special tokens, no subwords)
|
| 276 |
+
# valid_mask = torch.zeros_like(mask_logits, dtype=torch.bool)
|
| 277 |
+
# for idx in range(len(mask_logits)):
|
| 278 |
+
# token = self.tokenizer.convert_ids_to_tokens([idx])[0]
|
| 279 |
+
# # Only allow regular words (no special tokens, no subwords)
|
| 280 |
+
# if token.isalpha() and not token.startswith('[') and not token.startswith('##'):
|
| 281 |
+
# valid_mask[idx] = True
|
| 282 |
+
|
| 283 |
+
# # Get valid logits
|
| 284 |
+
# valid_logits = mask_logits[valid_mask]
|
| 285 |
+
# valid_indices = torch.where(valid_mask)[0]
|
| 286 |
+
|
| 287 |
+
# if len(valid_logits) == 0:
|
| 288 |
+
# print(f"Warning: No valid tokens found for position {mask_pos}")
|
| 289 |
+
# continue
|
| 290 |
+
|
| 291 |
+
# if sampling_technique == "inverse_transform":
|
| 292 |
+
# probs = torch.softmax(valid_logits / temperature, dim=-1)
|
| 293 |
+
# cumulative_probs = torch.cumsum(probs, dim=-1)
|
| 294 |
+
# random_prob = random.random()
|
| 295 |
+
# sampled_idx = torch.where(cumulative_probs >= random_prob)[0][0].item()
|
| 296 |
+
# sampled_index = valid_indices[sampled_idx].item()
|
| 297 |
+
|
| 298 |
+
# elif sampling_technique == "exponential_minimum":
|
| 299 |
+
# probs = torch.softmax(valid_logits / temperature, dim=-1)
|
| 300 |
+
# exp_probs = torch.exp(-torch.log(probs))
|
| 301 |
+
# random_probs = torch.rand_like(exp_probs)
|
| 302 |
+
# sampled_idx = torch.argmax(random_probs * exp_probs).item()
|
| 303 |
+
# sampled_index = valid_indices[sampled_idx].item()
|
| 304 |
+
|
| 305 |
+
# elif sampling_technique == "temperature":
|
| 306 |
+
# valid_logits = torch.clamp(valid_logits, min=-1e8, max=1e8)
|
| 307 |
+
# probs = torch.softmax(valid_logits / temperature, dim=-1)
|
| 308 |
+
# if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
|
| 309 |
+
# raise ValueError("The computed probabilities contain NaN or inf values.")
|
| 310 |
+
# probs = torch.max(probs, torch.tensor(1e-8))
|
| 311 |
+
# probs = probs / torch.sum(probs)
|
| 312 |
+
# sampled_idx = torch.multinomial(probs, 1)[0].item()
|
| 313 |
+
# sampled_index = valid_indices[sampled_idx].item()
|
| 314 |
+
|
| 315 |
+
# elif sampling_technique == 'greedy':
|
| 316 |
+
# sampled_idx = torch.argmax(valid_logits).item()
|
| 317 |
+
# sampled_index = valid_indices[sampled_idx].item()
|
| 318 |
+
|
| 319 |
+
# else:
|
| 320 |
+
# raise ValueError(f"Unknown sampling technique: {sampling_technique}")
|
| 321 |
+
|
| 322 |
+
# # Replace mask with sampled token
|
| 323 |
+
# sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
|
| 324 |
+
# words[mask_pos] = sampled_token
|
| 325 |
+
|
| 326 |
+
# except Exception as e:
|
| 327 |
+
# print(f"Error sampling for position {mask_pos}: {str(e)}")
|
| 328 |
+
# continue
|
| 329 |
+
|
| 330 |
+
# return " ".join(words)
|
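Note on the sampling strategies above (a minimal illustrative sketch on a toy logits vector, not part of the uploaded file): greedy always takes the top candidate, temperature sampling draws from the softmax distribution, and inverse-transform sampling draws a uniform random number and walks the cumulative distribution; torch.searchsorted is used here as a compact equivalent of the torch.where(cdf >= r) lookup in the file.

import torch

logits = torch.tensor([2.0, 1.0, 0.5])           # scores for three candidate tokens
temperature = 1.0
probs = torch.softmax(logits / temperature, dim=-1)

greedy_idx = torch.argmax(logits).item()                     # deterministic: index 0
temperature_idx = torch.multinomial(probs, 1).item()         # random, weighted by probs
cdf = torch.cumsum(probs, dim=-1)
inverse_idx = torch.searchsorted(cdf, torch.rand(1)).item()  # inverse-transform sampling
print(greedy_idx, temperature_idx, inverse_idx)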
utils/old/sampling/sampling_methods.py
ADDED
|
@@ -0,0 +1,291 @@
from transformers import BertTokenizer, BertForMaskedLM
import torch
import random
from masking_methods import MaskingProcessor
from transformers import pipeline

class SamplingProcessorWithModel:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.eval()  # Set the model to evaluation mode

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        input_ids = self.tokenizer.encode(masked_sentence, return_tensors="pt")

        while self.tokenizer.mask_token_id in input_ids[0]:
            # Find indices of all [MASK] tokens
            mask_indices = torch.where(input_ids == self.tokenizer.mask_token_id)[1]

            # Process the first [MASK] token in the sequence
            mask_index = mask_indices[0].item()

            # Get logits from the model
            with torch.no_grad():
                outputs = self.model(input_ids)
                logits = outputs.logits

            # Extract logits for the [MASK] token
            mask_logits = logits[0, mask_index]

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
                probs = torch.softmax(mask_logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=mask_logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(mask_logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected token
            input_ids[0, mask_index] = sampled_index

        return self.tokenizer.decode(input_ids[0], skip_special_tokens=True)

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        while '[MASK]' in masked_sentence:
            # Get predictions for the first [MASK]
            predictions = self.unmasker(masked_sentence)

            # Ensure predictions is a list of dictionaries
            if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
                raise ValueError("Unexpected structure in predictions from the pipeline.")

            # Extract logits (scores) from the predictions
            logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                logits = torch.clamp(logits, min=-1e8, max=1e8)
                probs = torch.softmax(logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected word
            sampled_token = predictions[sampled_index]['token_str']
            masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)

        return masked_sentence


# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # Define sentences and result_dict
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)

    # Use SamplingProcessor
    sampling_processor = SamplingProcessorWithModel()

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        masked_sentence = result["masked_sentence"]

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")
            filled_sentence = sampling_processor.fill_masked_sentence(
                masked_sentence=masked_sentence,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )
            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')

# from transformers import pipeline
|
| 181 |
+
# import torch
|
| 182 |
+
# import random
|
| 183 |
+
# from masking_methods import MaskingProcessor
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# class SamplingProcessorWithPipeline:
|
| 187 |
+
# def __init__(self, model_name='bert-base-uncased'):
|
| 188 |
+
# self.unmasker = pipeline('fill-mask', model=model_name)
|
| 189 |
+
# self.tokenizer = self.unmasker.tokenizer
|
| 190 |
+
|
| 191 |
+
# def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
|
| 192 |
+
# """
|
| 193 |
+
# Fills each mask in the masked sentence using the specified sampling technique.
|
| 194 |
+
|
| 195 |
+
# Args:
|
| 196 |
+
# masked_sentence (str): Sentence with [MASK] tokens.
|
| 197 |
+
# sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
|
| 198 |
+
# temperature (float): Temperature parameter for sampling methods.
|
| 199 |
+
|
| 200 |
+
# Returns:
|
| 201 |
+
# str: Sentence with the masks filled.
|
| 202 |
+
# """
|
| 203 |
+
# while '[MASK]' in masked_sentence:
|
| 204 |
+
# # Get predictions for the first [MASK]
|
| 205 |
+
# predictions = self.unmasker(masked_sentence)
|
| 206 |
+
# print(f' predictions : {predictions}')
|
| 207 |
+
# print(f' type of predictions : {type(predictions)}')
|
| 208 |
+
|
| 209 |
+
# # Ensure predictions is a list of dictionaries for the first [MASK]
|
| 210 |
+
# if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
|
| 211 |
+
# raise ValueError("Unexpected structure in predictions from the pipeline.")
|
| 212 |
+
|
| 213 |
+
# # Extract logits (scores) from the predictions
|
| 214 |
+
# logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)
|
| 215 |
+
|
| 216 |
+
# if sampling_technique == "inverse_transform":
|
| 217 |
+
# probs = torch.softmax(logits / temperature, dim=-1)
|
| 218 |
+
# cumulative_probs = torch.cumsum(probs, dim=-1)
|
| 219 |
+
# random_prob = random.random()
|
| 220 |
+
# sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()
|
| 221 |
+
|
| 222 |
+
# elif sampling_technique == "exponential_minimum":
|
| 223 |
+
# probs = torch.softmax(logits / temperature, dim=-1)
|
| 224 |
+
# exp_probs = torch.exp(-torch.log(probs))
|
| 225 |
+
# random_probs = torch.rand_like(exp_probs)
|
| 226 |
+
# sampled_index = torch.argmax(random_probs * exp_probs).item()
|
| 227 |
+
|
| 228 |
+
# elif sampling_technique == "temperature":
|
| 229 |
+
# logits = torch.clamp(logits, min=-1e8, max=1e8)
|
| 230 |
+
# probs = torch.softmax(logits / temperature, dim=-1)
|
| 231 |
+
# if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
|
| 232 |
+
# raise ValueError("The computed probabilities contain NaN or inf values.")
|
| 233 |
+
# probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
|
| 234 |
+
# probs = probs / torch.sum(probs)
|
| 235 |
+
# probs = probs.flatten()
|
| 236 |
+
# if probs.size(0) > 1:
|
| 237 |
+
# sampled_index = torch.multinomial(probs, 1).item()
|
| 238 |
+
# else:
|
| 239 |
+
# sampled_index = torch.argmax(probs).item()
|
| 240 |
+
|
| 241 |
+
# elif sampling_technique == 'greedy':
|
| 242 |
+
# sampled_index = torch.argmax(logits).item()
|
| 243 |
+
|
| 244 |
+
# else:
|
| 245 |
+
# raise ValueError(f"Unknown sampling technique: {sampling_technique}")
|
| 246 |
+
|
| 247 |
+
# # Replace the first [MASK] with the selected word
|
| 248 |
+
# sampled_token = predictions[sampled_index]['token_str']
|
| 249 |
+
# masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)
|
| 250 |
+
|
| 251 |
+
# return masked_sentence
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# # Example usage
|
| 255 |
+
# if __name__ == "__main__":
|
| 256 |
+
# from transformers import BertTokenizer
|
| 257 |
+
|
| 258 |
+
# # Define sentences and result_dict
|
| 259 |
+
# sentences = [
|
| 260 |
+
# "The quick brown fox jumps over the lazy dog.",
|
| 261 |
+
# "A quick brown dog outpaces a lazy fox.",
|
| 262 |
+
# "Quick brown animals leap over lazy obstacles."
|
| 263 |
+
# ]
|
| 264 |
+
# result_dict = {
|
| 265 |
+
# "The quick brown fox jumps over the lazy dog.": {"quick brown": [(1, 2)], "lazy": [(7, 7)]},
|
| 266 |
+
# "A quick brown dog outpaces a lazy fox.": {"quick brown": [(1, 2)], "lazy": [(6, 6)]},
|
| 267 |
+
# "Quick brown animals leap over lazy obstacles.": {"quick brown": [(0, 1)], "lazy": [(5, 5)]}
|
| 268 |
+
# }
|
| 269 |
+
|
| 270 |
+
# masking_processor = MaskingProcessor()
|
| 271 |
+
# masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
|
| 272 |
+
|
| 273 |
+
# # Use SamplingProcessor
|
| 274 |
+
# sampling_processor = SamplingProcessorWithPipeline()
|
| 275 |
+
|
| 276 |
+
# # Iterate through masking results to apply sampling
|
| 277 |
+
# for sentence, result in masking_results.items():
|
| 278 |
+
# print(f"Original Sentence (Random): {sentence}")
|
| 279 |
+
# print(f"Masked Sentence (Random): {result['masked_sentence']}")
|
| 280 |
+
# masked_sentence = result["masked_sentence"]
|
| 281 |
+
|
| 282 |
+
# # Apply different sampling techniques
|
| 283 |
+
# for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
|
| 284 |
+
# print(f"Sampling Technique: {technique}")
|
| 285 |
+
# filled_sentence = sampling_processor.fill_masked_sentence(
|
| 286 |
+
# masked_sentence=masked_sentence,
|
| 287 |
+
# sampling_technique=technique,
|
| 288 |
+
# temperature=1.0 # Adjust temperature as needed
|
| 289 |
+
# )
|
| 290 |
+
# print(f"Filled Sentence: {filled_sentence}\n")
|
| 291 |
+
# print('--------------------------------')
|
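Note: the same four selection rules recur in every sampler version in this upload. The sketch below is illustrative only (the helper name `sample_index` and the toy logits are not part of the uploaded files); under that assumption it shows what each rule does to a single 1-D logits tensor.

import torch
import random

def sample_index(logits, technique, temperature=1.0):
    """Pick one index from a 1-D logits tensor using the named technique."""
    if technique == "greedy":
        return torch.argmax(logits).item()
    probs = torch.softmax(logits / temperature, dim=-1)
    if technique == "inverse_transform":
        # Invert the cumulative distribution at a uniform random point.
        cumulative = torch.cumsum(probs, dim=-1)
        return torch.where(cumulative >= random.random())[0][0].item()
    if technique == "exponential_minimum":
        # Mirrors the perturb-and-argmax rule used above: uniform noise scaled by 1/p.
        return torch.argmax(torch.rand_like(probs) / probs).item()
    if technique == "temperature":
        # Ordinary multinomial draw from the temperature-scaled softmax.
        return torch.multinomial(probs, 1).item()
    raise ValueError(f"Unknown sampling technique: {technique}")

toy_logits = torch.tensor([2.0, 1.0, 0.5, -1.0])
for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
    print(technique, sample_index(toy_logits, technique))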
utils/old/sampling/sampling_methods_v1.py
ADDED
@@ -0,0 +1,146 @@
import torch
import random
from masking_methods import MaskingProcessor

class SamplingProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fill_masked_sentence(self, original_sentence, mask_logits, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            original_sentence (str): The original masked sentence.
            mask_logits (dict): Logits for each [MASK] token.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        sentence_tokens = self.tokenizer.tokenize(original_sentence)
        mask_token_indices = [i for i, token in enumerate(sentence_tokens) if token == self.tokenizer.mask_token]

        if len(mask_token_indices) != len(mask_logits):
            raise ValueError("Mismatch between number of [MASK] tokens and logits provided.")

        for mask_idx, filtered_logits in zip(mask_token_indices, mask_logits.values()):
            # Convert logits to a tensor
            filtered_logits = torch.tensor(filtered_logits)
            # filtered_logits, _ = torch.sort(filtered_logits, descending=True)
            # print(f' type of filtered_logits : {type(filtered_logits)}')
            # filtered_logits = filtered_logits[:5]

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                filtered_logits = torch.clamp(filtered_logits, min=-1e8, max=1e8)
                probs = torch.softmax(filtered_logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=filtered_logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(filtered_logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            sampled_token = self.tokenizer.convert_ids_to_tokens([sampled_index])[0]
            sentence_tokens[mask_idx] = sampled_token

        return self.tokenizer.convert_tokens_to_string(sentence_tokens)

    def process_samples(self, masked_sentences, mask_logits, sampling_technique, temperature=1.0):
        """
        Process multiple masked sentences and fill their masks using the specified sampling technique.

        Args:
            masked_sentences (list): List of masked sentences.
            mask_logits (dict): Logits for each [MASK] token in each sentence.
            sampling_technique (str): Sampling technique to use.
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            list: List of sentences with masks filled.
        """
        filled_sentences = []
        for sentence, logits in zip(masked_sentences, mask_logits):
            filled_sentence = self.fill_masked_sentence(sentence, logits, sampling_technique, temperature)
            filled_sentences.append(filled_sentence)
        return filled_sentences

# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # tokenizer = BertTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    processor = SamplingProcessor(tokenizer)

    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)
    # masked_sentence = "The [MASK] brown fox jumps [MASK] the lazy dog."
    # mask_logits = {
    #     1: torch.randn(len(tokenizer)),  # Example logits for first [MASK]
    #     5: torch.randn(len(tokenizer)),  # Example logits for second [MASK]
    # }

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        # print(f"Mask Logits (Random): {output['mask_logits']}")
        print(f' type(result["mask_logits"]) : {type(result["mask_logits"])}')
        print(f' length of result["mask_logits"] : {len(result["mask_logits"])}')
        print(f' result["mask_logits"].keys() : {result["mask_logits"].keys()}')
        masked_sentence = result["masked_sentence"]
        mask_logits = result["mask_logits"]

        print(f"Original Masked Sentence: {masked_sentence}")

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")

            # Fill the masks using the sampling processor
            filled_sentence = processor.fill_masked_sentence(
                original_sentence=masked_sentence,
                mask_logits=mask_logits,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )

            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')
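Note: fill_masked_sentence in this version expects one full-vocabulary logits entry per [MASK] token. A hypothetical smoke test in that shape (the sentence, the random logits, and the dictionary keys below are invented for illustration, and MaskingProcessor is not needed for it) might look like:

import torch
from transformers import BertTokenizer

# Assumed: SamplingProcessor from sampling_methods_v1.py is importable here.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = SamplingProcessor(tokenizer)

masked = "the [MASK] brown fox jumps over the [MASK] dog ."
# One random full-vocabulary logits vector per [MASK], keyed by an arbitrary id.
fake_logits = {0: torch.randn(len(tokenizer)), 1: torch.randn(len(tokenizer))}

print(processor.fill_masked_sentence(masked, fake_logits, sampling_technique="greedy"))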
utils/old/sampling/sampling_methods_v2.py
ADDED
@@ -0,0 +1,112 @@
from transformers import pipeline
import torch
import random
from masking_methods import MaskingProcessor


class SamplingProcessorWithPipeline:
    def __init__(self, model_name='bert-base-uncased'):
        self.unmasker = pipeline('fill-mask', model=model_name)
        self.tokenizer = self.unmasker.tokenizer

    def fill_masked_sentence(self, masked_sentence, sampling_technique, temperature=1.0):
        """
        Fills each mask in the masked sentence using the specified sampling technique.

        Args:
            masked_sentence (str): Sentence with [MASK] tokens.
            sampling_technique (str): Sampling technique to use (e.g., "inverse_transform", "exponential_minimum", "temperature", "greedy").
            temperature (float): Temperature parameter for sampling methods.

        Returns:
            str: Sentence with the masks filled.
        """
        while '[MASK]' in masked_sentence:
            # Get predictions for the first [MASK]
            predictions = self.unmasker(masked_sentence)
            print(f' predictions : {predictions}')
            print(f' type of predictions : {type(predictions)}')

            # Ensure predictions is a list of dictionaries
            if not isinstance(predictions, list) or not all(isinstance(pred, dict) for pred in predictions):
                raise ValueError("Unexpected structure in predictions from the pipeline.")

            # Extract logits (scores) from the predictions
            logits = torch.tensor([pred['score'] for pred in predictions], dtype=torch.float32)

            if sampling_technique == "inverse_transform":
                probs = torch.softmax(logits / temperature, dim=-1)
                cumulative_probs = torch.cumsum(probs, dim=-1)
                random_prob = random.random()
                sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

            elif sampling_technique == "exponential_minimum":
                probs = torch.softmax(logits / temperature, dim=-1)
                exp_probs = torch.exp(-torch.log(probs))
                random_probs = torch.rand_like(exp_probs)
                sampled_index = torch.argmax(random_probs * exp_probs).item()

            elif sampling_technique == "temperature":
                logits = torch.clamp(logits, min=-1e8, max=1e8)
                probs = torch.softmax(logits / temperature, dim=-1)
                if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                    raise ValueError("The computed probabilities contain NaN or inf values.")
                probs = torch.max(probs, torch.tensor(1e-8, device=logits.device))
                probs = probs / torch.sum(probs)
                probs = probs.flatten()
                if probs.size(0) > 1:
                    sampled_index = torch.multinomial(probs, 1).item()
                else:
                    sampled_index = torch.argmax(probs).item()

            elif sampling_technique == 'greedy':
                sampled_index = torch.argmax(logits).item()

            else:
                raise ValueError(f"Unknown sampling technique: {sampling_technique}")

            # Replace the first [MASK] with the selected word
            sampled_token = predictions[sampled_index]['token_str']
            masked_sentence = masked_sentence.replace('[MASK]', sampled_token, 1)

        return masked_sentence


# Example usage
if __name__ == "__main__":
    from transformers import BertTokenizer

    # Define sentences and result_dict
    sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown dog leaps over lazy the fox."
    ]
    result_dict = {
        "The quick brown fox jumps over the lazy dog.": {'quick brown': [(0, 1)], 'fox': [(2, 2)], 'lazy': [(4, 4)], 'dog': [(5, 5)]},
        "A quick brown dog outpaces a lazy fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]},
        "Quick brown dog leaps over lazy the fox.": {'quick brown': [(0, 1)], 'fox': [(5, 5)], 'lazy': [(4, 4)], 'dog': [(2, 2)]}
    }

    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict, method="random", remove_stopwords=False)

    # Use SamplingProcessor
    sampling_processor = SamplingProcessorWithPipeline()

    # Iterate through masking results to apply sampling
    for sentence, result in masking_results.items():
        print(f"Original Sentence (Random): {sentence}")
        print(f"Masked Sentence (Random): {result['masked_sentence']}")
        masked_sentence = result["masked_sentence"]

        # Apply different sampling techniques
        for technique in ["inverse_transform", "exponential_minimum", "temperature", "greedy"]:
            print(f"Sampling Technique: {technique}")
            filled_sentence = sampling_processor.fill_masked_sentence(
                masked_sentence=masked_sentence,
                sampling_technique=technique,
                temperature=1.0  # Adjust temperature as needed
            )
            print(f"Filled Sentence: {filled_sentence}\n")
        print('--------------------------------')
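Note: this variant leans on the Hugging Face fill-mask pipeline, which for a sentence with a single [MASK] returns the top candidates as a list of dicts with 'score', 'token', 'token_str', and 'sequence' keys; the loop above resolves one mask per call by picking among those candidates. A quick probe of that return shape, added here only for illustration:

from transformers import pipeline

unmasker = pipeline('fill-mask', model='bert-base-uncased')
for pred in unmasker("The quick brown [MASK] jumps over the lazy dog."):
    # Each candidate carries a probability-like 'score' and the surface form 'token_str'.
    print(round(pred['score'], 4), pred['token_str'])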
utils/old/sampling_final_copy.py
ADDED
@@ -0,0 +1,168 @@
import torch
import random
from masking_methods import MaskingProcessor

class SamplingProcessor:
    def __init__(self, tokenizer):
        """
        Initialize the SamplingProcessor.

        Args:
            tokenizer: BERT tokenizer instance
        """
        self.tokenizer = tokenizer

    def sample_tokens(self, mask_logits_dict, masked_sentence, sampling_technique="temperature", temperature=1.0):
        """
        Sample tokens for each mask in the sentence using the specified sampling technique.

        Args:
            mask_logits_dict (dict): Dictionary of mask positions and their logits/tokens
            masked_sentence (str): Sentence with [MASK] tokens
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            str: Sentence with sampled tokens replacing masks
        """
        words = masked_sentence.split()

        # Convert positions and logits to sorted list to process masks in order
        mask_positions = sorted(mask_logits_dict.keys())

        for mask_pos in mask_positions:
            mask_data = mask_logits_dict[mask_pos]
            mask_logits = torch.tensor(mask_data['logits'])
            candidate_tokens = mask_data['tokens']

            try:
                if sampling_technique == "inverse_transform":
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    cumulative_probs = torch.cumsum(probs, dim=-1)
                    random_prob = random.random()
                    sampled_index = torch.where(cumulative_probs >= random_prob)[0][0].item()

                elif sampling_technique == "exponential_minimum":
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    exp_probs = torch.exp(-torch.log(probs))
                    random_probs = torch.rand_like(exp_probs)
                    sampled_index = torch.argmax(random_probs * exp_probs).item()

                elif sampling_technique == "temperature":
                    mask_logits = torch.clamp(mask_logits, min=-1e8, max=1e8)
                    probs = torch.softmax(mask_logits / temperature, dim=-1)
                    if torch.any(torch.isnan(probs)) or torch.any(torch.isinf(probs)):
                        raise ValueError("The computed probabilities contain NaN or inf values.")
                    probs = torch.max(probs, torch.tensor(1e-8))
                    probs = probs / torch.sum(probs)
                    probs = probs.flatten()
                    if probs.size(0) > 1:
                        sampled_index = torch.multinomial(probs, 1).item()
                    else:
                        sampled_index = torch.argmax(probs).item()

                elif sampling_technique == 'greedy':
                    sampled_index = torch.argmax(mask_logits).item()

                else:
                    raise ValueError(f"Unknown sampling technique: {sampling_technique}")

                # Use the sampled index to get the corresponding token
                sampled_token = candidate_tokens[sampled_index]
                # Remove ## if it's a subword token
                sampled_token = sampled_token.replace('##', '')
                words[mask_pos] = sampled_token

            except Exception as e:
                print(f"Error sampling for position {mask_pos}: {str(e)}")
                continue

        return " ".join(words)

    def process_masked_sentences(self, results_dict, sampling_technique="temperature", temperature=1.0):
        """
        Process all masked sentences in the results dictionary.

        Args:
            results_dict (dict): Dictionary containing masked sentences and their logits
            sampling_technique (str): Sampling method to use
            temperature (float): Temperature parameter for sampling

        Returns:
            dict: Dictionary containing original, masked, and sampled sentences
        """
        processed_results = {}

        for original_sentence, data in results_dict.items():
            masked_sentence = data["masked_sentence"]
            mask_logits = data["mask_logits"]

            sampled_sentence = self.sample_tokens(
                mask_logits,
                masked_sentence,
                sampling_technique,
                temperature
            )

            processed_results[original_sentence] = {
                "masked_sentence": masked_sentence,
                "sampled_sentence": sampled_sentence
            }

        return processed_results


if __name__ == "__main__":
    sentences = [
        "The quick brown fox jumps over the lazy dog everyday.",
        "A speedy brown fox jumps over a lazy dog.",
        "A swift brown fox leaps over the lethargic dog."
    ]
    result_dict = {
        'The quick brown fox jumps over the lazy dog everyday.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A speedy brown fox jumps over a lazy dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]},
        'A swift brown fox leaps over the lethargic dog.': {'brown fox': [(2, 3)], 'dog': [(8, 8)]}
    }

    # First, mask the sentences
    masking_processor = MaskingProcessor()
    masking_results = masking_processor.process_sentences(sentences, result_dict)

    # Then, sample replacements for the masks
    sampling_processor = SamplingProcessor(masking_processor.tokenizer)

    # Try different sampling techniques
    sampling_techniques = ["temperature", "greedy", "inverse_transform", "exponential_minimum"]

    for technique in sampling_techniques:
        print(f"\nSampling using {technique}:")
        sampled_results = sampling_processor.process_masked_sentences(
            masking_results,
            sampling_technique=technique,
            temperature=1.0
        )

        '''
        Shape of sampled_results:
        {
            "original_sentence_1":
            {
                "masked_sentence": "sentence with [MASK] tokens",
                "sampled_sentence": "sentence with sampled tokens"
            },
            "original_sentence_2":
            {
                "masked_sentence": "sentence with [MASK] tokens",
                "sampled_sentence": "sentence with sampled tokens"
            },
            # ... and so on for each input sentence
        }
        '''

        for original_sentence, result in sampled_results.items():
            print(f"Original: {original_sentence}")
            print(f"Masked: {result['masked_sentence']}")
            print(f"Sampled: {result['sampled_sentence']}")
            print("---")
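Note: the final-copy processor consumes the dictionary produced by MaskingProcessor, with one entry per original sentence containing a 'masked_sentence' string and a 'mask_logits' map from word position to candidate 'tokens' and 'logits'. A hand-built input in that shape (every value below is invented for illustration, and SamplingProcessor from sampling_final_copy.py is assumed to be in scope) exercises the sampling step on its own:

import torch

fake_results = {
    "The quick brown fox jumps over the lazy dog.": {
        "masked_sentence": "The quick brown [MASK] jumps over the lazy [MASK]",
        "mask_logits": {
            3: {"tokens": ["fox", "cat", "wolf"], "logits": [2.1, 0.3, -0.5]},
            8: {"tokens": ["dog", "cat", "hound"], "logits": [1.7, 1.2, 0.1]},
        },
    }
}

# The tokenizer is not used by sample_tokens, so None suffices for this check.
processor = SamplingProcessor(tokenizer=None)
out = processor.process_masked_sentences(fake_results, sampling_technique="greedy")
print(out["The quick brown fox jumps over the lazy dog."]["sampled_sentence"])
# Greedy picks the highest-logit candidate at each position: "fox" and "dog".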