"""Helpers that number sentences and mark common words inside them."""

import re

# Colour palette cycled through as new common words are encountered.
_PASTEL_COLORS = (
    '#E199C6', '#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
    "#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
    "#65CFA5", "#B38FDE", "#E6C97A",
)

# Characters that make a word a "special" phrase matched without boundaries.
_SPECIAL_CHARS = frozenset('&-/.\'()[]$€£¥+ ')

_WORD_CHAR = re.compile(r'\w')


def create_matching_pattern(word):
    """Create the regex pattern used to locate ``word`` in a sentence.

    Args:
        word (str): The word or phrase to search for.

    Returns:
        str: A regular-expression source string (no capturing groups).
            * Words containing a space or one of ``& - / . ' ( ) [ ] $ € £ ¥ +``
              and words mixing digits with ASCII letters are matched as an
              escaped literal, without boundaries.
            * Words ending in ``%`` match the part before the ``%``, optional
              whitespace, then ``%``.
            * Everything else is matched as a whole word.
    """
    escaped_word = re.escape(word)

    if not _SPECIAL_CHARS.isdisjoint(word):
        # Phrases with special characters or spaces: plain literal match.
        return rf'{escaped_word}'

    if word.endswith('%'):
        # Percentages: tolerate whitespace between the number and the sign.
        numeric_part = word[:-1]
        return rf'\b{re.escape(numeric_part)}\s*%'

    if re.search(r'[0-9]', word) and re.search(r'[a-zA-Z]', word):
        # Alphanumeric combinations: plain literal match.
        return rf'{escaped_word}'

    # Whole-word match. ``\b`` only works next to a word character: for a
    # word such as "C#" a trailing ``\b`` would demand a word character right
    # after the "#", so the word would never match in normal text. Use an
    # equivalent lookaround on any side that starts/ends with a non-word
    # character; ordinary words keep the usual ``\b...\b`` form.
    left = r'\b' if not word or _WORD_CHAR.match(word[0]) else r'(?<!\w)'
    right = r'\b' if not word or _WORD_CHAR.match(word[-1]) else r'(?!\w)'
    return rf'{left}{escaped_word}{right}'


def _compile_highlighter(common_words):
    """Build one combined, case-insensitive regex for all common words.

    Returns a ``(regex, entries)`` pair where ``entries[i]`` is the
    ``(index, color, word)`` tuple belonging to capturing group ``i + 1`` of
    ``regex``. ``regex`` is ``None`` when there is nothing to search for.
    """
    color_map = {}
    entries = []
    for index, word in common_words:
        if not word:
            # An empty word would match at every word boundary.
            continue
        if word not in color_map:
            color_map[word] = _PASTEL_COLORS[len(color_map) % len(_PASTEL_COLORS)]
        entries.append((index, color_map[word], word))

    if not entries:
        return None, entries

    # Longest first, so a phrase wins over a shorter word starting at the
    # same position (the sort is stable, ties keep their input order).
    entries.sort(key=lambda entry: len(entry[2]), reverse=True)
    combined = "|".join(
        f"({create_matching_pattern(word)})" for _, _, word in entries
    )
    return re.compile(combined, flags=re.IGNORECASE), entries


def _highlight(sentence, regex, entries):
    """Mark every common-word occurrence in ``sentence`` in a single pass.

    A single pass guarantees that text inserted for one word is never
    re-scanned and matched again by another word.
    """
    if regex is None:
        return sentence

    def _mark(match):
        # Exactly one top-level group takes part in a match; it identifies
        # which common word was found.
        index, _color, _word = entries[match.lastindex - 1]
        # NOTE(review): the colour is tracked per word but the visible
        # replacement template emits only the index and the matched text.
        # It looks like wrapper markup may have been lost - confirm.
        return f'{index}{match.group(0)}'

    return regex.sub(_mark, sentence)


def highlight_common_words(common_words, sentences, title):
    """
    Highlight common words in sentences and number the sentences.

    Args:
        common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
        sentences (list of str): List of sentences to search through.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the title followed by the numbered, highlighted sentences.
    """
    regex, entries = _compile_highlighter(common_words)

    # The sentence text is highlighted *before* the "N. " prefix is added so
    # that numeric common words cannot match the sentence number itself.
    highlighted_html = [
        f"{idx}. \n{_highlight(sentence, regex, entries)}"
        for idx, sentence in enumerate(sentences, start=1)
    ]

    final_html = "\n\n".join(highlighted_html)
    return f"\n\n{title}\n\n{final_html}\n"


def highlight_common_words_dict(common_words, sentences, title):
    """
    Highlight common words in sentences (from a dictionary) and append each sentence's score.

    Args:
        common_words (list of tuples): List of tuples where each tuple contains a word's index and the word.
        sentences (dict): A dictionary of sentences where the key is the sentence and the value is an entailment score.
        title (str): The title for the HTML output.

    Returns:
        str: HTML string with the highlighted sentences and their entailment scores.
    """
    regex, entries = _compile_highlighter(common_words)

    highlighted_html = []
    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
        # Highlight first, then number (see highlight_common_words).
        highlighted_sentence = f"{idx}. {_highlight(sentence, regex, entries)}"
        highlighted_html.append(
            f'\n'
            f'{highlighted_sentence}'
            f'\n'
            f'Entailment Score: {score}\n'
        )

    final_html = "\n".join(highlighted_html)
    return f"\n\n{title}\n\n{final_html}\n"


def reparaphrased_sentences_html(sentences):
    """
    Create an HTML representation of sentences with numbering.

    Args:
        sentences (list of str): List of sentences to format.

    Returns:
        str: HTML string with numbered sentences.
    """
    formatted_sentences = [
        f"{idx}. {sentence}" for idx, sentence in enumerate(sentences, start=1)
    ]
    final_html = "\n\n".join(formatted_sentences)
    return f"\n{final_html}\n"


if __name__ == "__main__":
    # Example usage
    common_words = [(1, "highlight"), (2, "numbering"), (3, "S&P 500")]
    sentences = [
        "This is a test to highlight words.",
        "Numbering is important for clarity.",
        "The S&P 500 index rose 2% today.",
    ]

    # Test highlight_common_words
    highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
    print(highlighted_html)

    # Test highlight_common_words_dict
    sentences_with_scores = {
        "Highlight words in this text.": 0.95,
        "Number sentences for clarity.": 0.8,
        "The S&P 500 index is a market benchmark.": 0.88,
    }
    highlighted_html_dict = highlight_common_words_dict(
        common_words, sentences_with_scores, "Test Dict Highlighting"
    )
    print(highlighted_html_dict)