Spaces:

text-peccavi
/

TEXT_PECCAVI

Sleeping

App Files Files Community

TEXT_PECCAVI / renderers /highlighter.py

text-peccavi

Upload 54 files

c8b9192 verified 7 months ago

raw

history blame contribute delete

8.09 kB

	import re

	def create_matching_pattern(word):
	    r"""Build the regex source used to locate ``word`` inside a sentence.

	    The pattern shape depends on the characters in ``word``:

	    * empty or whitespace-only: a pattern that can never match, so callers
	      do not end up "highlighting" every word boundary or every space;
	    * contains a space or one of ``& - / . ' ( ) [ ] $ € £ ¥ +``: the
	      escaped literal, without word boundaries;
	    * ends with ``%``: ``\b``, the escaped text before the ``%``, optional
	      whitespace, then ``%`` (so "2%" also finds "2 %");
	    * mixes ASCII digits and ASCII letters: the escaped literal, without
	      word boundaries;
	    * anything else: the escaped literal wrapped in ``\b...\b``.

	    Args:
	        word (str): The word or phrase to search for.

	    Returns:
	        str: Regular-expression source (not compiled).
	    """
	    # Guard first: '' would otherwise become r'\b\b', which matches
	    # zero-width at every word boundary, and ' ' would match every space.
	    if not word or word.isspace():
	        return r'(?!)'  # empty negative lookahead: always fails

	    escaped_word = re.escape(word)

	    # Phrases and words with punctuation/currency symbols: \b is unreliable
	    # next to non-word characters, so match the literal text as is.
	    if ' ' in word or any(char in word for char in '&-/.\'()[]$€£¥+'):
	        return escaped_word

	    # Percentages: tolerate whitespace between the number and the sign.
	    if word.endswith('%'):
	        numeric_part = word[:-1]
	        return rf'\b{re.escape(numeric_part)}\s*%'

	    # Alphanumeric mixes are matched literally, without boundaries.
	    if re.search(r'[0-9]', word) and re.search(r'[a-zA-Z]', word):
	        return escaped_word

	    # Plain word: require word boundaries on both sides.
	    return rf'\b{escaped_word}\b'

	def highlight_common_words(common_words, sentences, title):
	    """
	    Render numbered sentences as HTML with every common word highlighted.

	    Each case-insensitive occurrence of a common word is wrapped in a
	    coloured ``<span>`` that starts with a small black badge showing the
	    word's index. Colours come from a fixed 15-entry palette, assigned in
	    order of first appearance in ``common_words`` and reused cyclically.

	    Only real sentence text is searched. The ``"N. "`` numbering prefix and
	    the markup already inserted for earlier words are never matched, so
	    common words such as "color", "span" or "50%" cannot corrupt the
	    generated inline CSS. A later word may still be highlighted inside the
	    text of an earlier match (nested spans).

	    Sentences and ``title`` are inserted verbatim; they are not HTML-escaped.

	    Args:
	        common_words (list of tuples): ``(index, word)`` pairs.
	        sentences (list of str): Sentences to search through.
	        title (str): Heading shown above the sentences.

	    Returns:
	        str: HTML string with the highlighted sentences.
	    """
	    pastel_colors = ['#E199C6', '#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
	                     "#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
	                     "#65CFA5", "#B38FDE", "#E6C97A"]

	    # Resolve colour, opening markup and compiled pattern once per word
	    # instead of once per (sentence, word) pair.
	    color_map = {}
	    matchers = []
	    for index, word in common_words:
	        if word not in color_map:
	            # len(color_map) equals the number of colours handed out so far.
	            color_map[word] = pastel_colors[len(color_map) % len(pastel_colors)]
	        color = color_map[word]
	        opening_markup = (
	            f'<span style="background-color: {color}; font-weight: bold;'
	            f' padding: 2px 4px; border-radius: 2px; position: relative;">'
	            f'<span style="background-color: black; color: white; border-radius: 50%;'
	            f' padding: 2px 5px; margin-right: 5px;">{index}</span>'
	        )
	        regex = re.compile(create_matching_pattern(word), flags=re.IGNORECASE)
	        matchers.append((regex, opening_markup))

	    def _highlight(text):
	        """Wrap all common words in ``text`` without re-scanning inserted markup."""
	        # Each segment is (is_markup, chunk); only non-markup chunks are searched.
	        segments = [(False, text)]
	        for regex, opening_markup in matchers:
	            rescanned = []
	            for is_markup, chunk in segments:
	                if is_markup:
	                    rescanned.append((True, chunk))
	                    continue
	                cursor = 0
	                for match in regex.finditer(chunk):
	                    if match.start() == match.end():
	                        # Never wrap an empty match in a span.
	                        continue
	                    rescanned.append((False, chunk[cursor:match.start()]))
	                    rescanned.append((True, opening_markup))
	                    # The matched text stays searchable so later words can nest.
	                    rescanned.append((False, match.group(0)))
	                    rescanned.append((True, '</span>'))
	                    cursor = match.end()
	                rescanned.append((False, chunk[cursor:]))
	            segments = rescanned
	        return ''.join(chunk for _, chunk in segments)

	    # Number each sentence; the prefix is added after highlighting so it is
	    # never treated as searchable text.
	    highlighted_html = [
	        f"{idx}. {_highlight(f'{sentence}')}"
	        for idx, sentence in enumerate(sentences, start=1)
	    ]

	    # Format the HTML output with the title
	    final_html = "<br><br>".join(highlighted_html)
	    return f'''
	<div style="border: solid 1px #FFFFFF; padding: 16px; background-color: #000000; color: #FFFFFF; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
	<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
	<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
	</div>
	'''

	def highlight_common_words_dict(common_words, sentences, title):
	    """
	    Render scored sentences as HTML with every common word highlighted.

	    Works on a mapping of sentence -> entailment score. Each sentence is
	    numbered in the mapping's iteration order, and its common words are
	    wrapped in a coloured ``<span>`` with a small black badge showing the
	    word's index. The score is appended in an inline "Entailment Score" box.
	    Colours come from a fixed 15-entry palette, assigned in order of first
	    appearance in ``common_words`` and reused cyclically.

	    Only real sentence text is searched. The ``"N. "`` numbering prefix and
	    the markup already inserted for earlier words are never matched, so
	    common words such as "color", "span" or "50%" cannot corrupt the
	    generated inline CSS. A later word may still be highlighted inside the
	    text of an earlier match (nested spans).

	    Sentences, scores and ``title`` are inserted verbatim; they are not
	    HTML-escaped.

	    Args:
	        common_words (list of tuples): ``(index, word)`` pairs.
	        sentences (dict): Maps each sentence to its entailment score.
	        title (str): Heading shown above the sentences.

	    Returns:
	        str: HTML string with the highlighted sentences and their scores.
	    """
	    pastel_colors = ['#E199C6', '#7FB3D5', '#E57373', '#B388EB', '#80D9AA', '#F0B66B',
	                     "#73D9A0", "#9A89EB", "#E38AD4", "#7FAFDB", "#DDAF8C", "#DA7FC1",
	                     "#65CFA5", "#B38FDE", "#E6C97A"]

	    # Resolve colour, opening markup and compiled pattern once per word
	    # instead of once per (sentence, word) pair.
	    color_map = {}
	    word_matchers = []
	    for index, word in common_words:
	        if word not in color_map:
	            # len(color_map) equals the number of colours handed out so far.
	            color_map[word] = pastel_colors[len(color_map) % len(pastel_colors)]
	        color = color_map[word]
	        span_open = (
	            f'<span style="background-color: {color}; font-weight: bold;'
	            f' padding: 1px 2px; border-radius: 2px; position: relative;">'
	            f'<span style="background-color: black; color: white; border-radius: 50%;'
	            f' padding: 1px 3px; margin-right: 3px; font-size: 0.8em;">{index}</span>'
	        )
	        compiled = re.compile(create_matching_pattern(word), flags=re.IGNORECASE)
	        word_matchers.append((compiled, span_open))

	    def _mark_words(text):
	        """Wrap all common words in ``text`` without re-scanning inserted markup."""
	        # Each piece is (is_markup, fragment); only non-markup fragments are searched.
	        pieces = [(False, text)]
	        for compiled, span_open in word_matchers:
	            next_pieces = []
	            for is_markup, fragment in pieces:
	                if is_markup:
	                    next_pieces.append((True, fragment))
	                    continue
	                consumed = 0
	                for hit in compiled.finditer(fragment):
	                    if hit.start() == hit.end():
	                        # Never wrap an empty match in a span.
	                        continue
	                    next_pieces.append((False, fragment[consumed:hit.start()]))
	                    next_pieces.append((True, span_open))
	                    # The matched text stays searchable so later words can nest.
	                    next_pieces.append((False, hit.group(0)))
	                    next_pieces.append((True, '</span>'))
	                    consumed = hit.end()
	                next_pieces.append((False, fragment[consumed:]))
	            pieces = next_pieces
	        return ''.join(fragment for _, fragment in pieces)

	    highlighted_html = []
	    for idx, (sentence, score) in enumerate(sentences.items(), start=1):
	        # The numbering prefix is added after highlighting so it is never
	        # treated as searchable text.
	        highlighted_sentence = f"{idx}. {_mark_words(f'{sentence}')}"

	        # Add the entailment score
	        highlighted_html.append(
	            f'<div style="margin-bottom: 5px;">'
	            f'{highlighted_sentence}'
	            f'<div style="display: inline-block; margin-left: 5px; padding: 3px 5px; border-radius: 3px; '
	            f'background-color: #333333; color: white; font-size: 0.9em;">'
	            f'Entailment Score: {score}</div></div>'
	        )

	    # Format the HTML output with the title
	    final_html = "<br>".join(highlighted_html)
	    return f'''
	<div style="background-color: #000000; color: #FFFFFF;border: solid 1px #FFFFFF; border-radius: 8px;">
	<h3 style="margin-top: 0; font-size: 1em; color: #FFFFFF;">{title}</h3>
	<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px; color: #FFFFFF;">{final_html}</div>
	</div>
	'''

	def reparaphrased_sentences_html(sentences):
	    """
	    Wrap a list of sentences in a dark-themed HTML card, numbering them from 1.

	    Each sentence becomes ``"<n>. <sentence>"``; entries are separated by
	    two ``<br>`` tags. Sentences are inserted verbatim, without HTML escaping.

	    Args:
	        sentences (list of str): Sentences to format.

	    Returns:
	        str: HTML string with the numbered sentences.
	    """
	    # Number every sentence, then glue the entries with blank-line breaks.
	    numbered = [
	        f"{position}. {text}"
	        for position, text in enumerate(sentences, start=1)
	    ]
	    final_html = "<br><br>".join(numbered)

	    # Outer card plus inner grey panel holding the sentences.
	    return f'''
	<div style="border: solid 1px #FFFFFF; background-color: #000000; color: #FFFFFF;
	box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); border-radius: 8px;">
	<div style="background-color: #333333; line-height: 1.6; padding: 15px; border-radius: 8px;">{final_html}</div>
	</div>
	'''

	if __name__ == "__main__":
	    # Small manual demo: render both highlighters and print the HTML.
	    common_words = [
	        (1, "highlight"),
	        (2, "numbering"),
	        (3, "S&P 500"),
	    ]
	    sentences = [
	        "This is a test to highlight words.",
	        "Numbering is important for clarity.",
	        "The S&P 500 index rose 2% today.",
	    ]

	    # List-based variant.
	    highlighted_html = highlight_common_words(common_words, sentences, "Test Highlighting")
	    print(highlighted_html)

	    # Dict-based variant: sentence -> entailment score.
	    sentences_with_scores = {
	        "Highlight words in this text.": 0.95,
	        "Number sentences for clarity.": 0.8,
	        "The S&P 500 index is a market benchmark.": 0.88,
	    }
	    highlighted_html_dict = highlight_common_words_dict(
	        common_words, sentences_with_scores, "Test Dict Highlighting"
	    )
	    print(highlighted_html_dict)