Spaces:
Sleeping
Sleeping
| import os | |
| import sys | |
| from tqdm import tqdm | |
| import numpy as np | |
| import torch | |
| import matplotlib.pyplot as plt | |
| from transformers import GPT2LMHeadModel, GPT2TokenizerFast | |
| from bert_score import BERTScorer | |
| from bert_score.utils import model2layers | |
| from nltk.tokenize import word_tokenize | |
| from Levenshtein import distance as levenshtein_distance | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from scipy.spatial.distance import cdist | |
| from scipy.optimize import linear_sum_assignment | |
# Append this file's parent directory to sys.path before importing the
# project-local `config.config` module below.
_THIS_DIR = os.path.dirname(__file__)
sys.path.append(os.path.abspath(os.path.join(_THIS_DIR, '..')))
from config.config import load_config

# Path to ../config/config.yaml relative to this file; only the
# PECCAVI_TEXT -> Metrics section of the loaded config is kept.
config_path = os.path.join(_THIS_DIR, '..', 'config', 'config.yaml')
config = load_config(config_path)['PECCAVI_TEXT']['Metrics']
class SentenceDistortionCalculator:
    """
    Calculate and analyze distortion metrics between an original sentence and
    a list of modified (paraphrased) sentences.

    Three dissimilarity metrics are computed per modified sentence, keyed as
    "Sentence_1", "Sentence_2", ... in input order:

    * word-level Levenshtein distance divided by the longer token count,
    * 1 - BERTScore F1,
    * 1 - an IDF-weighted, assignment-based embedding similarity
      (referred to as "mover score" in this class).

    Typical call order: calculate_all_metrics() -> normalize_metrics() ->
    calculate_combined_distortion() -> plot_metrics() / getters.
    """

    def __init__(self, config, original_sentence, paraphrased_sentences):
        """
        Initialize the calculator with the original sentence and a list of modified sentences.

        Args:
            config: Mapping whose 'Distortion' entry is passed to
                `from_pretrained` for the GPT-2 tokenizer and model.
            original_sentence: Reference sentence every paraphrase is compared to.
            paraphrased_sentences: Sequence of modified sentences to score.
        """
        self.original_sentence = original_sentence
        self.paraphrased_sentences = paraphrased_sentences

        # Raw metric values, keyed "Sentence_<n>".
        self.levenshtein_distances = {}
        self.bert_scores = {}
        self.mover_scores = {}

        # Min-max normalized counterparts of the raw metrics.
        self.normalized_levenshtein = {}
        self.normalized_bert_scores = {}
        self.normalized_mover_scores = {}

        # Root-mean-square of the three normalized metrics per sentence.
        self.combined_distortions = {}

        # SentenceTransformer instances keyed by model name, filled lazily by
        # _get_sentence_encoder so the model is loaded once, not per sentence.
        self._sentence_encoders = {}

        # NOTE(review): the tokenizer/model are not used by any method visible
        # in this class; they are kept because callers may read the attributes.
        self.tokenizer = GPT2TokenizerFast.from_pretrained(config['Distortion'])
        self.model = GPT2LMHeadModel.from_pretrained(config['Distortion'])
        self.model.eval()

    def calculate_all_metrics(self):
        """
        Calculate all distortion metrics for each modified sentence.

        Fills `levenshtein_distances`, `bert_scores` and `mover_scores`.
        """
        progress = tqdm(self.paraphrased_sentences, desc="Calculating Metrics")
        for position, modified_sentence in enumerate(progress, start=1):
            key = f"Sentence_{position}"
            self.levenshtein_distances[key] = self._calculate_levenshtein_distance(modified_sentence)
            self.bert_scores[key] = self._calculate_bert_score(modified_sentence)
            self.mover_scores[key] = self._calculate_mover_score(modified_sentence)

    def normalize_metrics(self):
        """
        Normalize all metrics to be between 0 and 1 (min-max over the sentences).
        """
        self.normalized_levenshtein = self._normalize_dict(self.levenshtein_distances)
        self.normalized_bert_scores = self._normalize_dict(self.bert_scores)
        self.normalized_mover_scores = self._normalize_dict(self.mover_scores)

    def calculate_combined_distortion(self):
        """
        Calculate the combined distortion using the root mean square of the normalized metrics.

        Requires normalize_metrics() to have been called first.
        """
        for key in self.normalized_levenshtein:
            squared_sum = (
                self.normalized_levenshtein[key] ** 2
                + self.normalized_bert_scores[key] ** 2
                + self.normalized_mover_scores[key] ** 2
            )
            self.combined_distortions[key] = np.sqrt(squared_sum / 3)

    def plot_metrics(self):
        """
        Plot each normalized metric and the combined distortion in separate graphs.
        """
        keys = list(self.normalized_levenshtein.keys())
        indices = np.arange(len(keys))

        # Prepare data for plotting
        metrics = {
            'Levenshtein Distance': [self.normalized_levenshtein[key] for key in keys],
            'BERTScore': [self.normalized_bert_scores[key] for key in keys],
            'MOVERscore': [self.normalized_mover_scores[key] for key in keys],
            'Combined Distortion': [self.combined_distortions[key] for key in keys],
        }

        # Plot each metric separately; each figure gets a random line color.
        for metric_name, values in tqdm(metrics.items(), desc="Plotting Metrics"):
            plt.figure(figsize=(12, 6))
            plt.plot(indices, values, marker='o', color=np.random.rand(3,))
            plt.xlabel('Sentence Index')
            plt.ylabel('Normalized Value (0-1)')
            plt.title(f'Normalized {metric_name}')
            plt.grid(True)
            plt.tight_layout()
            plt.show()

    def _calculate_levenshtein_distance(self, modified_sentence):
        """
        Calculate the word-level Levenshtein distance between the original and modified sentence.

        Returns:
            The edit distance between the two token lists divided by the
            length of the longer list; 0.0 when both token lists are empty.
        """
        words1 = word_tokenize(self.original_sentence)
        words2 = word_tokenize(modified_sentence)
        longest = max(len(words1), len(words2))
        if longest == 0:
            # Two empty token lists are identical; avoid dividing by zero.
            return 0.0
        return levenshtein_distance(words1, words2) / longest

    def _calculate_bert_score(self, modified_sentence):
        """
        Compute the BERTScore similarity between the original and modified sentence.
        Returns 1 - F1 score to represent dissimilarity.

        Raises:
            ValueError: If `original_sentence` is not set on the instance or
                `modified_sentence` is not a string.
        """
        if not hasattr(self, 'original_sentence'):
            raise ValueError("original_sentence is not set. Please set self.original_sentence before calling this function.")
        if not isinstance(modified_sentence, str):
            raise ValueError("modified_sentence must be a string.")

        # Build the scorer once and reuse it for every subsequent call.
        if not hasattr(self, "cached_bertscorer"):
            model_type = "microsoft/deberta-xlarge-mnli"
            self.cached_bertscorer = BERTScorer(
                model_type=model_type,
                num_layers=model2layers[model_type],
                batch_size=1,  # Single sentence comparison
                nthreads=4,
                all_layers=False,
                idf=False,
                device="cuda" if torch.cuda.is_available() else "cpu",
                lang="en"
            )

        # Compute BERTScore
        _, _, F1 = self.cached_bertscorer.score(
            cands=[modified_sentence],
            refs=[self.original_sentence],
            verbose=False,
            batch_size=1
        )
        return 1 - F1.item()  # Return dissimilarity score

    def _get_sentence_encoder(self, model_name):
        """Return a SentenceTransformer for `model_name`, creating it on first use."""
        # getattr fallback keeps this working on instances built without __init__.
        encoders = getattr(self, "_sentence_encoders", None)
        if encoders is None:
            encoders = self._sentence_encoders = {}
        if model_name not in encoders:
            encoders[model_name] = SentenceTransformer(model_name)
        return encoders[model_name]

    def _calculate_mover_score(self, modified_sentence, model_name='all-MiniLM-L6-v2'):
        """
        Compute an IDF-weighted word-alignment dissimilarity between the
        original and modified sentence.

        Whitespace tokens of both sentences are embedded individually, a
        minimum-cost assignment over the cosine-distance matrix pairs them up,
        and the cosine similarities of the assigned pairs are averaged with
        IDF weights.

        Args:
            modified_sentence: Sentence to compare against the original.
            model_name: SentenceTransformer model used to embed the tokens.

        Returns:
            1 - weighted similarity (higher = more dissimilar); 1.0 when the
            modified sentence contains no tokens.

        Raises:
            ValueError: If the original sentence is empty or contains no tokens.
        """
        if not self.original_sentence:
            raise ValueError("Original sentence not provided.")

        # Tokenize sentences
        original_tokens = self.original_sentence.split()
        modified_tokens = modified_sentence.split()
        if not original_tokens:
            raise ValueError("Original sentence not provided.")
        if not modified_tokens:
            # Nothing to align against: treat as maximally dissimilar instead
            # of failing on an empty embedding matrix.
            return 1.0

        model = self._get_sentence_encoder(model_name)

        # Compute word embeddings
        original_embeddings = model.encode(original_tokens, convert_to_numpy=True)
        modified_embeddings = model.encode(modified_tokens, convert_to_numpy=True)

        # Compute cost matrix (cosine distance)
        cost_matrix = cdist(original_embeddings, modified_embeddings, metric='cosine')

        # Minimum-cost one-to-one assignment between original and modified tokens.
        row_ind, col_ind = linear_sum_assignment(cost_matrix)

        # Compute IDF weights
        vectorizer = TfidfVectorizer()
        vectorizer.fit([self.original_sentence, modified_sentence])
        idf_values = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

        # NOTE(review): tokens here come from str.split() and may carry
        # punctuation, while the vectorizer's vocabulary is built by its own
        # tokenizer; tokens that do not match a vocabulary entry fall back to
        # weight 1.0. Left unchanged to keep existing scores stable.
        idf_weights_original = np.array([idf_values.get(word.lower(), 1.0) for word in original_tokens])
        idf_weights_modified = np.array([idf_values.get(word.lower(), 1.0) for word in modified_tokens])

        # Apply IDF weighting to aligned word pairs
        combined_idf_weights = (idf_weights_original[row_ind] + idf_weights_modified[col_ind]) / 2
        pair_similarities = 1 - cost_matrix[row_ind, col_ind]
        weighted_score = np.sum(pair_similarities * combined_idf_weights) / np.sum(combined_idf_weights)
        return 1 - weighted_score  # Higher score = more dissimilar

    def _normalize_dict(self, metric_dict):
        """
        Normalize the values in a dictionary to be between 0 and 1.

        Uses min-max scaling. When all values are equal every entry maps to 0;
        an empty dictionary yields an empty dictionary.
        """
        if not metric_dict:
            return {}
        values = np.array(list(metric_dict.values()), dtype=float)
        min_val = values.min()
        spread = values.max() - min_val
        if spread == 0:
            normalized_values = np.zeros_like(values)
        else:
            normalized_values = (values - min_val) / spread
        return dict(zip(metric_dict.keys(), normalized_values))

    def get_normalized_metrics(self):
        """
        Get all normalized metrics as a dictionary.
        """
        return {
            'Min Edit Distance': self.normalized_levenshtein,
            'BERTScore': self.normalized_bert_scores,
            'Mover Score': self.normalized_mover_scores
        }

    def get_combined_distortions(self):
        """
        Get the dictionary of combined distortion values.
        """
        return self.combined_distortions
# Example usage: score a fixed set of paraphrases of one sentence, plot the
# normalized metrics and print the results.
if __name__ == "__main__":
    config = load_config(config_path)['PECCAVI_TEXT']['Metrics']

    # Original sentence
    original_sentence = "The quick brown fox jumps over the lazy dog"

    # Paraphrased sentences
    paraphrased_sentences = [
        # Original 1: "A swift auburn fox leaps across a sleepy canine."
        "The swift auburn fox leaps across a sleepy canine.",
        "A quick auburn fox leaps across a sleepy canine.",
        "A swift ginger fox leaps across a sleepy canine.",
        "A swift auburn fox bounds across a sleepy canine.",
        "A swift auburn fox leaps across a tired canine.",
        "Three swift auburn foxes leap across a sleepy canine.",
        "The vulpine specimen rapidly traverses over a dormant dog.",
        "Like lightning, the russet hunter soars over the drowsy guardian.",
        "Tha quick ginger fox jumps o'er the lazy hound, ye ken.",
        "One rapid Vulpes vulpes traverses the path of a quiescent canine.",
        "A swift auburn predator navigates across a lethargic pet.",
        "Subject A (fox) demonstrates velocity over Subject B (dog).",
        # Original 2: "The agile russet fox bounds over an idle hound."
        "Some agile russet foxes bound over an idle hound.",
        "The nimble russet fox bounds over an idle hound.",
        "The agile brown fox bounds over an idle hound.",
        "The agile russet fox jumps over an idle hound.",
        "The agile russet fox bounds over a lazy hound.",
        "Two agile russet foxes bound over an idle hound.",
        "A dexterous vulpine surpasses a stationary canine.",
        "Quick as thought, the copper warrior sails over the guardian.",
        "Tha nimble reddish fox jumps o'er the doggo, don't ya know.",
        "A dexterous V. vulpes exceeds the plane of an inactive canine.",
        "An agile russet hunter maneuvers above a resting hound.",
        "Test subject F-1 achieves displacement superior to subject D-1.",
        # Original 3: "A nimble mahogany vulpine vaults above a drowsy dog."
        "The nimble mahogany vulpine vaults above a drowsy dog.",
        "A swift mahogany vulpine vaults above a drowsy dog.",
        "A nimble reddish vulpine vaults above a drowsy dog.",
        "A nimble mahogany fox vaults above a drowsy dog.",
        "A nimble mahogany vulpine leaps above a drowsy dog.",
        "Four nimble mahogany vulpines vault above a drowsy dog.",
        "An agile specimen of reddish fur surpasses a somnolent canine.",
        "Fleet as wind, the earth-toned hunter soars over the sleepy guard.",
        "Tha quick brown beastie jumps o'er the tired pup, aye.",
        "Single V. vulpes demonstrates vertical traverse over C. familiaris.",
        "A nimble rust-colored predator crosses above a drowsy pet.",
        "Observed: Subject Red executes vertical motion over Subject Gray.",
        # Original 4: "The speedy copper-colored fox hops over the lethargic pup."
        "A speedy copper-colored fox hops over the lethargic pup.",
        "The quick copper-colored fox hops over the lethargic pup.",
        "The speedy bronze fox hops over the lethargic pup.",
        "The speedy copper-colored fox jumps over the lethargic pup.",
        "The speedy copper-colored fox hops over the tired pup.",
        "Multiple speedy copper-colored foxes hop over the lethargic pup.",
        "A rapid vulpine of bronze hue traverses an inactive young canine.",
        "Swift as a dart, the metallic hunter bounds over the lazy puppy.",
        "Tha fast copper beastie leaps o'er the sleepy wee dog.",
        "1 rapid V. vulpes crosses above 1 juvenile C. familiaris.",
        "A fleet copper-toned predator moves past a sluggish young dog.",
        "Field note: Adult fox subject exceeds puppy subject vertically.",
        # Original 5: "A rapid tawny fox springs over a sluggish dog."
        "The rapid tawny fox springs over a sluggish dog.",
        "A quick tawny fox springs over a sluggish dog.",
        "A rapid golden fox springs over a sluggish dog.",
        "A rapid tawny fox jumps over a sluggish dog.",
        "A rapid tawny fox springs over a lazy dog.",
        "Six rapid tawny foxes spring over a sluggish dog.",
        "An expeditious yellowish vulpine surpasses a torpid canine.",
        "Fast as a bullet, the golden hunter vaults over the idle guard.",
        "Tha swift yellowy fox jumps o'er the lazy mutt, aye.",
        "One V. vulpes displays rapid transit over one inactive C. familiaris.",
        "A speedy yellow-brown predator bypasses a motionless dog.",
        "Log entry: Vulpine subject achieves swift vertical displacement.",
        # Original 6: "The fleet-footed chestnut fox soars above an indolent canine."
        "A fleet-footed chestnut fox soars above an indolent canine.",
        "The swift chestnut fox soars above an indolent canine.",
        "The fleet-footed brown fox soars above an indolent canine.",
        "The fleet-footed chestnut fox leaps above an indolent canine.",
        "The fleet-footed chestnut fox soars above a lazy canine.",
        "Several fleet-footed chestnut foxes soar above an indolent canine.",
        "A rapid brown vulpine specimen traverses a lethargic domestic dog.",
        "Graceful as a bird, the nutbrown hunter flies over the lazy guard.",
        "Tha quick brown beastie sails o'er the sleepy hound, ken.",
        "Single agile V. vulpes achieves elevation above stationary canine.",
        "A nimble brown predator glides over an unmoving domestic animal.",
        "Research note: Brown subject displays superior vertical mobility.",
        # Original 7: "A fast ginger fox hurdles past a slothful dog."
        "The fast ginger fox hurdles past a slothful dog.",
        "A quick ginger fox hurdles past a slothful dog.",
        "A fast red fox hurdles past a slothful dog.",
        "A fast ginger fox jumps past a slothful dog.",
        "A fast ginger fox hurdles past a lazy dog.",
        "Five fast ginger foxes hurdle past a slothful dog.",
        "A rapid orange vulpine bypasses a lethargic canine.",
        "Quick as lightning, the flame-colored hunter races past the lazy guard.",
        "Tha swift ginger beastie leaps past the tired doggy, ye see.",
        "1 rapid orange V. vulpes surpasses 1 inactive C. familiaris.",
        "A speedy red-orange predator overtakes a motionless dog.",
        "Data point: Orange subject demonstrates rapid transit past Gray subject.",
        # Original 8: "The spry rusty-colored fox jumps across a dozing hound."
        "A spry rusty-colored fox jumps across a dozing hound.",
        "The agile rusty-colored fox jumps across a dozing hound.",
        "The spry reddish fox jumps across a dozing hound.",
        "The spry rusty-colored fox leaps across a dozing hound.",
        "The spry rusty-colored fox jumps across a sleeping hound.",
        "Multiple spry rusty-colored foxes jump across a dozing hound.",
        "An agile rust-toned vulpine traverses a somnolent canine.",
        "Nimble as thought, the copper hunter bounds over the resting guard.",
        "Tha lively rust-colored beastie hops o'er the snoozin' hound.",
        "Single dexterous V. vulpes crosses path of dormant C. familiaris.",
        "A lithe rust-tinted predator moves past a slumbering dog.",
        "Observation: Russet subject exhibits agility over dormant subject.",
        # Original 9: "A quick tan fox leaps over an inactive dog."
        "The quick tan fox leaps over an inactive dog.",
        "A swift tan fox leaps over an inactive dog.",
        "A quick beige fox leaps over an inactive dog.",
        "A quick tan fox jumps over an inactive dog.",
        "A quick tan fox leaps over a motionless dog.",
        "Seven quick tan foxes leap over an inactive dog.",
        "A rapid light-brown vulpine surpasses a stationary canine.",
        "Fast as wind, the sand-colored hunter soars over the still guard.",
        "Tha nimble tan beastie jumps o'er the quiet doggy, aye.",
        "One agile fawn V. vulpes traverses one immobile C. familiaris.",
        "A fleet tan-colored predator bypasses an unmoving dog.",
        "Field report: Tan subject demonstrates movement over static subject.",
        # Original 10: "The brisk auburn vulpine bounces over a listless canine."
        "Some brisk auburn vulpines bounce over a listless canine.",
        "The quick auburn vulpine bounces over a listless canine.",
        "The brisk russet vulpine bounces over a listless canine.",
        "The brisk auburn fox bounces over a listless canine.",
        "The brisk auburn vulpine jumps over a listless canine.",
        "Five brisk auburn vulpines bounce over a listless canine.",
        "The expeditious specimen supersedes a quiescent Canis lupus.",
        "Swift as wind, the russet hunter vaults over the idle guardian.",
        "Tha quick ginger beastie hops o'er the lazy mutt, aye.",
        "One V. vulpes achieves displacement over inactive C. familiaris.",
        "A high-velocity auburn predator traverses an immobile animal.",
        "Final observation: Red subject shows mobility over Gray subject."
    ]

    distortion_calculator = SentenceDistortionCalculator(config, original_sentence, paraphrased_sentences)

    # Run the pipeline once, in dependency order: raw metrics -> normalized
    # metrics -> combined distortion -> plots.
    distortion_calculator.calculate_all_metrics()
    distortion_calculator.normalize_metrics()
    distortion_calculator.calculate_combined_distortion()
    distortion_calculator.plot_metrics()

    print("Normalized Metrics:", distortion_calculator.get_normalized_metrics())
    print("Combined Distortion:", distortion_calculator.get_combined_distortions())