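"""N-gram and named-entity extraction utilities built on NLTK and spaCy.

The NgramProcessor class below combines regex subsequence mining, spaCy named-entity
recognition, numerical range/percentage detection, and NLTK n-gram counting to find
the patterns shared across a group of paraphrased sentences.
"""
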
import nltk
import logging
import spacy
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm
from spacy.cli import download
# Import for curated transformer support
from spacy.util import registry

# Logging setup
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


class NgramProcessor:
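    """Finds recurring "non-melting" patterns (named entities, numerical ranges,
    percentages, and frequent n-grams) across a set of paraphrased sentences and
    locates them at the word level in each sentence."""
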
    def __init__(self, models=None):
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

        # Default to the standard transformer model if none is specified
        if models is None:
            models = ["en_core_web_trf"]

        # Check for and register the curated transformer factory if needed.
        # The registry layout differs across spaCy versions, so probe it in a
        # compatible way (works around registry.get_names errors on some versions).
        try:
            curated_transformer_exists = False
            if hasattr(registry, 'factories'):
                if hasattr(registry.factories, 'get_names'):
                    # Older spaCy versions
                    curated_transformer_exists = "curated_transformer" in registry.factories.get_names("pipe", "en")
                elif hasattr(registry.factories, 'get'):
                    # Newer spaCy versions
                    pipe_factories = registry.factories.get("pipe", {})
                    curated_transformer_exists = "curated_transformer" in pipe_factories
                else:
                    # Fallback if the registry structure is different
                    factories_dict = getattr(registry.factories, "__dict__", {})
                    for key in factories_dict:
                        if "curated_transformer" in str(key):
                            curated_transformer_exists = True
                            break
            if not curated_transformer_exists:
                try:
                    # Try to import and register the curated transformer
                    import spacy_curated_transformers
                    from spacy_curated_transformers import CuratedTransformer
                    from spacy.language import Language
                    Language.factory("curated_transformer", func=CuratedTransformer)
                    tqdm.write("[NgramProcessor] Registered curated_transformer factory")
                except ImportError:
                    tqdm.write("[NgramProcessor] Warning: spacy_curated_transformers not found. Some models may not load correctly.")
        except Exception as e:
            tqdm.write(f"[NgramProcessor] Warning: Error checking for curated_transformer: {str(e)}")

        # Load the specified models
        self.models = {}
        for model_name in models:
            try:
                self.models[model_name] = spacy.load(model_name)
                tqdm.write(f"[NgramProcessor] Loaded model: {model_name}")
            except IOError:
                # Model not installed: download it (equivalent to `python -m spacy download <model>`), then load it
                tqdm.write(f"[NgramProcessor] Model '{model_name}' not found. Downloading it now...")
                download(model_name)
                self.models[model_name] = spacy.load(model_name)
            except Exception as e:
                tqdm.write(f"[NgramProcessor] Error loading model '{model_name}': {str(e)}")

        # Set the primary NLP model used by the other processing steps
        if "en_core_web_trf" in self.models:
            self.nlp = self.models["en_core_web_trf"]
        elif len(self.models) > 0:
            # Fall back to the first available model if the preferred one isn't loaded
            self.nlp = next(iter(self.models.values()))
        else:
            raise ValueError("No spaCy model was successfully loaded")

        # Add custom entity patterns for numerical ranges and percentages to the primary model
        if "entity_ruler" not in self.nlp.pipe_names:
            ruler = self.nlp.add_pipe("entity_ruler", before="ner")
            patterns = [
                {"label": "CARDINAL", "pattern": [{"TEXT": {"REGEX": "\\d+-\\d+"}}]},  # Ranges such as "7-10"
                {"label": "PERCENT", "pattern": [{"TEXT": {"REGEX": "\\d+%"}}]},  # Percentages
            ]
            ruler.add_patterns(patterns)

        # Compiled pattern for numerical ranges such as "7-10"
        self.number_range_pattern = re.compile(r'\b(\d+(?:-\d+)+)\b')

        tqdm.write("[NgramProcessor] Initialized with stopwords, spaCy NLP model, and numerical range detection.")
    def remove_stopwords(self, text):
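        """Lowercase the text, drop English stopwords, and return the remaining words joined by spaces."""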
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)
    def extract_number_ranges(self, sentences):
        """Extract numerical ranges like '7-10' from sentences."""
        tqdm.write("[NgramProcessor] Extracting numerical ranges...")
        number_ranges = []
        range_counts = Counter()
        for sentence in sentences:
            # Find all numerical ranges in the sentence
            matches = self.number_range_pattern.findall(sentence)
            for match in matches:
                range_counts[match] += 1
        # Keep every range that appears at least once
        for range_text, count in range_counts.items():
            if count >= 1:
                number_ranges.append(range_text)
        tqdm.write(f"[NgramProcessor] Found {len(number_ranges)} numerical ranges: {number_ranges}")
        return number_ranges
    def extract_standalone_numbers(self, sentences):
        """Extract standalone numerical values (and percentages) from sentences."""
        tqdm.write("[NgramProcessor] Extracting standalone numbers...")
        # Two patterns: one for percentages, one for regular numbers
        percentage_pattern = re.compile(r'\b\d+%')  # Numbers followed by a % sign
        number_pattern = re.compile(r'\b\d+\b')  # Standalone numbers
        percentage_counts = Counter()
        number_counts = Counter()
        percentage_values = set()  # Numeric parts of percentages, for cross-referencing

        # First pass: find all percentages
        for sentence in sentences:
            percentage_matches = percentage_pattern.findall(sentence)
            for match in percentage_matches:
                percentage_counts[match] += 1
                # Store the numeric part for later comparison
                numeric_part = match.rstrip('%')
                percentage_values.add(numeric_part)

        # Second pass: find standalone numbers
        for sentence in sentences:
            number_matches = number_pattern.findall(sentence)
            for match in number_matches:
                # Avoid double counting numbers already counted as percentages
                if match not in percentage_values:
                    number_counts[match] += 1

        # A number must appear in every sentence to be kept
        threshold = max(1, int(len(sentences) * 1.0))
        standalone_numbers = []

        # Percentages have priority; add those that meet the threshold
        for num, count in percentage_counts.items():
            if count >= threshold:
                standalone_numbers.append(num)  # Already carries the % sign

        # Then add standalone numbers, converting to percentage format where applicable
        for num, count in number_counts.items():
            if count >= threshold:
                # If this number also appeared as part of a percentage, use the percentage format
                if num in percentage_values:
                    standalone_numbers.append(f"{num}%")
                else:
                    standalone_numbers.append(num)

        tqdm.write(f"[NgramProcessor] Found {len(standalone_numbers)} standalone numbers: {standalone_numbers}")
        return standalone_numbers
    def extract_regex_subsequences(self, sentences):
        """Extract potential multi-word subsequences using regex patterns before applying NLP."""
        tqdm.write("[NgramProcessor] Extracting regex subsequences...")
        # Find potential multi-word subsequences (2-6 words) that occur across sentences
        potential_subsequences = set()

        # Process each sentence to find multi-word phrases
        for sentence in sentences:
            # Clean the sentence: lowercase it and replace punctuation (except a kept set of symbols) with spaces
            clean_sentence = re.sub(r'[^\w\s&\-./\'()[\]$€£¥+%]', ' ', sentence.lower())
            # Extract sequences of 2-6 words
            for i in range(2, 7):
                pattern = r'\b(\w+(?:[-&\s./\'()[\]$€£¥+%]+\w+){' + str(i - 1) + r'})\b'
                matches = re.findall(pattern, clean_sentence)
                potential_subsequences.update(matches)

        # Helper to check whether a word is a number, percentage, or numeric range
        def is_numeric(word):
            return bool(re.match(r'^\d+(\.\d+)?%?$|^\d+-\d+$', word))

        # Filter out sequences that consist only of stopwords (but preserve numbers)
        filtered_subsequences = []
        for subseq in potential_subsequences:
            words = re.split(r'[\s-]+', subseq)  # Split on spaces or hyphens
            # Skip if ALL words are stopwords and none are numeric
            if all((word in self.stop_words and not is_numeric(word)) for word in words):
                tqdm.write(f"[NgramProcessor] Skipping all-stopword phrase: {subseq}")
                continue
            # Keep the sequence if it has significant (non-stopword) words
            # OR if it contains numbers/percentages
            if len(words) > 1 and any(
                word not in self.stop_words and (len(word) > 2 or is_numeric(word)) for word in words
            ):
                # Additional check to reject standalone stopwords that are not numeric
                if not (len(words) == 1 and words[0] in self.stop_words and not is_numeric(words[0])):
                    filtered_subsequences.append(subseq)

        # Count occurrences across all sentences
        subseq_counts = Counter()
        for subseq in filtered_subsequences:
            for sentence in sentences:
                if re.search(r'\b' + re.escape(subseq) + r'\b', sentence.lower()):
                    subseq_counts[subseq] += 1

        # Keep only subsequences that appear in every sentence (with a minimum of 2)
        threshold = max(2, int(len(sentences) * 1.0))
        regex_candidates = [subseq for subseq, count in subseq_counts.items() if count >= threshold]
        tqdm.write(f"[NgramProcessor] Found {len(regex_candidates)} regex subsequences")
        return regex_candidates
    def filter_standalone_stopwords(self, ngrams_dict):
        """Remove standalone stopwords and very short terms from the n-grams dictionary."""
        filtered_dict = {}
        for sentence, sentence_ngrams in ngrams_dict.items():
            filtered_dict[sentence] = {}
            for ngram, indices in sentence_ngrams.items():
                words = ngram.split()
                # Skip single stopwords and very short terms UNLESS they are numbers
                if len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3):
                    # Exception for numbers
                    if re.match(r'^\d+$', words[0]):
                        filtered_dict[sentence][ngram] = indices
                    continue
                # Skip if ALL words are stopwords
                if all(word in self.stop_words for word in words):
                    continue
                filtered_dict[sentence][ngram] = indices
        return filtered_dict
    def extract_named_entities(self, sentences):
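        """Collect named entities of the configured label types from every loaded model,
        returning those whose occurrence count reaches the number of sentences."""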
        entity_counter = Counter()
        # Process the sentences with each loaded model
        for model_name, nlp_model in self.models.items():
            tqdm.write(f"[NgramProcessor] Extracting entities with model: {model_name}")
            docs = list(nlp_model.pipe(sentences))
            # Process each sentence
            for doc in docs:
                for ent in doc.ents:
                    # Include entity types relevant to this model.
                    # This is a comprehensive list - some models may not use all of these types.
                    if ent.label_ in {
                        # People, organizations, locations
                        "PERSON", "ORG", "GPE", "LOC", "NORP",
                        # Facilities and products
                        "FAC", "PRODUCT", "WORK_OF_ART", "EVENT",
                        # Numeric entities
                        "DATE", "TIME", "MONEY", "QUANTITY", "PERCENT", "CARDINAL", "ORDINAL",
                        # Others
                        "LAW", "LANGUAGE",
                        # Scientific entities
                        "SCIENTIFIC", "SUBSTANCE", "CHEMICAL", "TECHNOLOGY",
                        # Medical entities
                        "DISEASE", "MEDICAL", "CLINICAL", "TREATMENT", "SYMPTOM", "DIAGNOSTIC",
                        "ANATOMICAL", "BIOLOGY", "GENE", "PROTEIN", "DRUG",
                        # Legal entities
                        "LEGAL", "COURT", "STATUTE", "PROVISION", "CASE_CITATION", "JUDGE",
                        "LEGAL_ROLE", "REGULATION", "CONTRACT"
                    }:
                        # Handle possessive forms by stripping 's
                        clean_entity = re.sub(r"'s\b", "", ent.text.lower()).strip()
                        entity_counter[clean_entity] += 1
        # Keep entities whose total count reaches the number of sentences
        threshold = max(1, len(sentences) * 1.0)
        return [ent for ent, count in entity_counter.items() if count >= threshold]
    def extract_domain_specific_entities(self, text):
        """Extract entities from all loaded models and return them keyed by model name."""
        domain_entities = {}
        for model_name, nlp_model in self.models.items():
            doc = nlp_model(text)
            domain_entities[model_name] = [(ent.text, ent.label_) for ent in doc.ents]
        return domain_entities
    def is_substring_of_any(self, ngram, common_ngrams):
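        """Return True if `ngram` occurs inside any other (longer) n-gram in `common_ngrams`."""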
        for other_ngram in common_ngrams:
            if ngram != other_ngram and ngram in other_ngram:
                return True
        return False
    def find_filtered_ngrams(self, sentences):
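        """Find recurring patterns (numerical ranges, standalone numbers, regex
        subsequences, named entities, and frequent n-grams) across the sentences.

        Returns a tuple of (per-sentence {pattern: [(start_word_idx, end_word_idx), ...]},
        {index: pattern} mapping for all retained patterns).
        """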
| tqdm.write("[NgramProcessor] Processing...") | |
| # Step 1: First extract numerical ranges or standalone numbers (special priority) | |
| number_ranges = self.extract_number_ranges(sentences) | |
| standalone_numbers = self.extract_standalone_numbers(sentences) | |
| # Step 2: Use regex to find common subsequences | |
| regex_subsequences = self.extract_regex_subsequences(sentences) | |
| tqdm.write(f"[NgramProcessor] Regex Subsequences: {regex_subsequences}") | |
| # Step 3: Then apply spaCy to detect named entities | |
| named_entities = self.extract_named_entities(sentences) | |
| # Make sure percentage values have proper format | |
| for i, entity in enumerate(named_entities): | |
| if re.match(r'\d+$', entity) and any(f"{entity}%" in sentence for sentence in sentences): | |
| # Replace standalone digit with percentage if it appears as percentage in text | |
| named_entities[i] = f"{entity}%" | |
| tqdm.write(f"[NgramProcessor] Named Entities: {named_entities}") | |
| # Step 4: Consolidate and filter all detected patterns | |
| # Collect all patterns in one list | |
| all_patterns = number_ranges + regex_subsequences + named_entities + standalone_numbers | |
| # Sort by length (longer first) to prioritize more specific patterns | |
| all_patterns.sort(key=len, reverse=True) | |
| # Remove duplicates while preserving order | |
| unique_patterns = [] | |
| seen = set() | |
| for pattern in all_patterns: | |
| if pattern not in seen: | |
| # Check if this pattern is a substring of any already selected pattern | |
| is_substring = False | |
| for selected_pattern in unique_patterns: | |
| if pattern in selected_pattern and pattern != selected_pattern: | |
| is_substring = True | |
| break | |
| if not is_substring: | |
| unique_patterns.append(pattern) | |
| seen.add(pattern) | |
| # Re-index sequentially | |
| indexed_patterns = [(i+1, pattern) for i, pattern in enumerate(unique_patterns)] | |
| self.indexed_patterns = indexed_patterns | |
| non_melting_points = [pattern for _, pattern in indexed_patterns] | |
| tqdm.write(f"[NgramProcessor] Filtered non_melting_points: {non_melting_points}") | |
| tqdm.write(f"[NgramProcessor] Filtered non-melting points: {len(non_melting_points)}") | |
| # Filter out patterns that are substrings of longer patterns or standalone numbers | |
| standalone_numbers_set = set(standalone_numbers) | |
| non_melting_points = [] | |
| for pattern in unique_patterns: | |
| is_substring = False | |
| for longer_pattern in non_melting_points: | |
| # Check if pattern is contained within a longer pattern | |
| if pattern in longer_pattern: | |
| is_substring = True | |
| break | |
| if not is_substring or pattern in standalone_numbers_set: | |
| non_melting_points.append(pattern) | |
        # For remaining cases that might have been missed, apply NLTK n-gram extraction,
        # but only on cleaned sentences (less computationally expensive now)
        clean_to_original = {}
        sentences_cleaned = []

        # Process sentences with spaCy to preserve entity information
        docs = list(self.nlp.pipe(sentences))
        for i, doc in enumerate(docs):
            original_sentence = sentences[i]
            entity_texts = {ent.text.lower() for ent in doc.ents if len(ent.text.split()) > 1}

            # Tokenize while preserving entities and numerical ranges
            tokens = []
            j = 0
            words = [token.text for token in doc]
            while j < len(words):
                # First check for numerical ranges
                current_word = words[j].lower()
                if self.number_range_pattern.match(current_word):
                    tokens.append(current_word)
                    j += 1
                    continue
                # Then check for entities
                matched_entity = None
                for ent in sorted(entity_texts, key=len, reverse=True):
                    ent_words = ent.split()
                    if j + len(ent_words) <= len(words) and [w.lower() for w in words[j:j + len(ent_words)]] == ent_words:
                        matched_entity = " ".join(words[j:j + len(ent_words)])
                        tokens.append(matched_entity.lower())  # Preserve the full entity
                        j += len(ent_words)
                        break
                if not matched_entity:
                    word = words[j].lower()
                    if word not in self.stop_words and re.match(r'\w+', word):
                        tokens.append(word)
                    j += 1
            cleaned = " ".join(tokens)
            sentences_cleaned.append(cleaned)
            clean_to_original[cleaned] = original_sentence
        # Step 5: Only run n-gram extraction on gaps not covered by regex and named entities
        ngram_lengths = [4, 3, 2, 1]  # Shorter n-grams suffice since longer phrases are already captured
        all_ngrams_by_length = {}
        for n in ngram_lengths:
            all_ngrams = []
            for sentence in sentences_cleaned:
                tokens = sentence.split()
                if len(tokens) >= n:
                    sent_ngrams = list(ngrams(tokens, n))
                    all_ngrams.extend(sent_ngrams)
            all_ngrams_by_length[n] = Counter(all_ngrams)

        # Step 6: Add n-grams that are frequent but weren't caught by regex or named entities
        threshold_factor = 1.0  # Focus on n-grams that appear in every sentence
        for n_size in sorted(ngram_lengths, reverse=True):
            ngram_counts = all_ngrams_by_length[n_size]
            threshold = max(2, int(len(sentences) * threshold_factor))
            # Sort by count for efficiency
            for ngram, count in ngram_counts.most_common():
                if count >= threshold:
                    ngram_str = ' '.join(ngram)
                    # Skip if it is already collected or is a substring of an existing n-gram
                    if ngram_str not in non_melting_points and not self.is_substring_of_any(ngram_str, non_melting_points):
                        non_melting_points.append(ngram_str)

        # Create a sorted version for efficient lookup
        final_non_melting_points = non_melting_points.copy()
        sorted_non_melting_points = sorted(final_non_melting_points, key=len, reverse=True)
        final_indexed_patterns = [(i + 1, pattern) for i, pattern in enumerate(sorted_non_melting_points)]

        # Filter out n-grams that consist entirely of stop words
        filtered_patterns = []
        for idx, pattern in final_indexed_patterns:
            words = pattern.lower().split()
            # Check whether the pattern contains a number
            has_number = any(re.match(r'.*\d+.*', word) for word in words)
            # Keep the pattern if it has a number OR any non-stop word
            if has_number or any(word not in self.stop_words for word in words):
                filtered_patterns.append((idx, pattern))
            else:
                tqdm.write(f"[NgramProcessor] Removing n-gram with all stop words: {pattern}")

        # Reassign filtered patterns with reindexed values
        self.indexed_patterns = [(i + 1, pattern) for i, (_, pattern) in enumerate(filtered_patterns)]
        # Generate the results with more efficient regex matching
        result = {}
        for sentence in sentences:
            sentence_result = {}
            for _, ngram in self.indexed_patterns:  # Use the filtered patterns
                # Skip single-word stopwords and short terms
                words = ngram.split()
                if len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3):
                    continue
                # Numerical ranges need exact, word-bounded matching
                if self.number_range_pattern.match(ngram):
                    pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
                else:
                    # Compile the regex pattern once per n-gram; lookarounds handle special characters
                    pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r'(?!\w)', re.IGNORECASE)
                matches = list(pattern.finditer(sentence))
                if matches:
                    indices = []
                    for match in matches:
                        # Calculate word indices, with improved handling for hyphenated terms
                        start_pos = match.start()
                        text_before = sentence[:start_pos]
                        # Word counting that treats hyphenated terms as single words
                        start_idx = len(re.findall(r'\s+', text_before)) + (0 if text_before.strip() == "" else 1)
                        # Count words in the matched n-gram (a numerical range counts as one term)
                        if self.number_range_pattern.match(ngram):
                            ngram_word_count = 1
                        else:
                            ngram_word_count = len(re.findall(r'\S+', ngram))
                        end_idx = start_idx + ngram_word_count - 1
                        indices.append((start_idx, end_idx))
                    if indices:  # Only add if we found valid indices
                        sentence_result[ngram] = indices
            result[sentence] = sentence_result

        # Apply the stopword filter before returning
        result = self.filter_standalone_stopwords(result)
        return result, dict(self.indexed_patterns)
    def find_relative_order(self, sentence, common_ngrams):
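        """Determine the order in which the given common n-grams appear in `sentence`,
        resolving overlapping matches in favour of longer ones.

        Returns (list of (pattern_index, ngram) in order of appearance, the original sentence).
        """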
        # First, identify all possible matches without modifying the sentence
        all_matches = []
        for ngram in common_ngrams:
            # Special handling for n-grams containing punctuation, currency, or percent symbols
            if any(char in ngram for char in '&-/.\'()[]$€£¥+%'):
                pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
            # Handle numerical ranges
            elif self.number_range_pattern.match(ngram):
                pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
            else:
                pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r"(?:'s)?(?!\w)", re.IGNORECASE)
            for match in pattern.finditer(sentence):
                start, end = match.span()
                # Store character position range, n-gram text, and token count
                all_matches.append((start, end, ngram, len(ngram.split())))

        # Pre-process: identify all word spans in the original sentence
        words = []
        word_spans = []
        for match in re.finditer(r'\S+', sentence):
            words.append(match.group())
            word_spans.append((match.start(), match.end()))

        # Create a mapping from character positions to word indices
        char_to_word_idx = {}
        for i, (start, end) in enumerate(word_spans):
            for pos in range(start, end + 1):
                char_to_word_idx[pos] = i

        # Sort by length in characters first, then by word count, then by position
        all_matches.sort(key=lambda x: (-len(x[2]), -x[3], x[0]))

        # Filter out n-grams that overlap with already claimed ranges
        filtered_matches = []
        claimed_ranges = []
        for start, end, ngram, length in all_matches:
            # Check whether this match overlaps with any existing claimed range
            is_overlapping = False
            for c_start, c_end in claimed_ranges:
                # Check for any overlap
                if max(start, c_start) < min(end, c_end):
                    is_overlapping = True
                    break
            if not is_overlapping:
                # Keep this n-gram and claim its character range
                filtered_matches.append((start, end, ngram, length))
                claimed_ranges.append((start, end))

        # Sort filtered matches by position for the final ordering
        filtered_matches.sort(key=lambda x: x[0])

        # Create word-level indices for the final matches
        word_level_matches = []
        for start, end, ngram, _ in filtered_matches:
            # Find the word indices for the start and end positions
            try:
                start_word_idx = char_to_word_idx.get(start, char_to_word_idx.get(start + 1))
                end_word_idx = char_to_word_idx.get(end - 1, char_to_word_idx.get(end - 2))
                if start_word_idx is not None and end_word_idx is not None:
                    word_level_matches.append((start_word_idx, end_word_idx, ngram))
            except (KeyError, IndexError):
                # Skip this match if we can't determine word indices
                continue

        # Create the final order with 1-based indexing
        ngram_to_index = {pattern: idx for idx, pattern in self.indexed_patterns}
        relative_order = [(ngram_to_index.get(ngram, i + 1), ngram) for i, (_, _, ngram) in enumerate(word_level_matches)]
        return relative_order, sentence

# Example usage
if __name__ == "__main__":
    # Test with an NBA Play-In Tournament example
    sentences = [
        "The NBA Play-In Tournament tips off tonight as the No. 7-10 teams in each conference battle for a spot in the playoffs. Here's everything you need to know as the action unfolds.",
        "Tonight the NBA Play-In Tournament begins with No. 7-10 teams from each conference competing for playoff spots. Here's your guide to following all the action.",
        "The NBA Play-In Tournament kicks off this evening featuring the No. 7-10 teams across both conferences fighting for playoff positions. Here's what you should know about the upcoming games.",
        "Starting tonight, the NBA Play-In Tournament will showcase the No. 7-10 teams from each conference as they compete for remaining playoff berths. Here's your complete guide to the action.",
        "The NBA Play-In Tournament begins tonight with the No. 7-10 teams in both conferences battling for playoff spots. Here's everything you need to know about the upcoming games.",
        "Tonight marks the start of the NBA Play-In Tournament where No. 7-10 teams in each conference compete for playoff positions. Here's your essential guide to following the action.",
        "The NBA Play-In Tournament tips off tonight, featuring No. 7-10 teams from both conferences fighting for playoff berths. Here's what you need to know about the tournament.",
        "Beginning tonight, the NBA Play-In Tournament will pit the No. 7-10 teams in each conference against each other for playoff spots. Here's everything you should know about the games.",
        "The NBA Play-In Tournament starts tonight with No. 7-10 teams across both conferences competing for playoff positions. Here's your complete guide to all the action.",
        "Tonight is the tip-off of the NBA Play-In Tournament where the No. 7-10 teams from each conference battle for remaining playoff spots. Here's what you need to know as the games unfold."
    ]

    # Initialize the processor (one or more spaCy models can be supplied)
    processor = NgramProcessor(models=["en_core_web_trf"])

    # Process all sentences together
    common_ngrams, indexed_ngrams = processor.find_filtered_ngrams(sentences)

    # Print results
    print("Common n-grams with indices per sentence:")
    for sentence in sentences:
        order, updated_sentence = processor.find_relative_order(sentence, common_ngrams[sentence])
        print(f"Sentence: {sentence}")
        print(f"Order: {order}")
        print()
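
    # A minimal follow-up sketch (not part of the original example): the second value
    # returned by find_filtered_ngrams maps each 1-based index to its retained pattern,
    # which is useful for inspecting what was treated as a "non-melting" point.
    print("Indexed patterns:")
    for idx, pattern in indexed_ngrams.items():
        print(f"  {idx}: {pattern}")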