import re

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# Model names are centralized in the project configuration
from utils.config import TEXT_MODEL_NAME, EMBEDDING_MODEL_NAME
class TextSynthesizer:
    def __init__(self, embed_model=EMBEDDING_MODEL_NAME, text_model=TEXT_MODEL_NAME):
        """
        Initializes the TextSynthesizer with a sentence-transformer model for
        embeddings and a causal language model for text generation.

        Args:
            embed_model (str): The name of the sentence-transformer model to use.
            text_model (str): The name of the causal LM used for generation
                (e.g., "TinyLlama/TinyLlama-1.1B-Chat-v1.0" or "microsoft/phi-2").
        """
        self.model = SentenceTransformer(embed_model)
        self.tokenizer = AutoTokenizer.from_pretrained(text_model)
        self.text_model = AutoModelForCausalLM.from_pretrained(
            text_model,
            device_map="auto",
            torch_dtype="auto"
        )
        self.text_generator = pipeline(
            'text-generation',
            model=self.text_model,  # reuse the already-loaded model instead of reloading it from the hub by name
            tokenizer=self.tokenizer
        )
    def get_embedding(self, text: str):
        """
        Generates an embedding for the input text.

        Args:
            text (str): The input text (e.g., a quote, poem, or verse).

        Returns:
            numpy.ndarray: A vector embedding of the text.
        """
        return self.model.encode(text)
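    # Note: the embedding dimensionality depends on EMBEDDING_MODEL_NAME. For
    # example, a MiniLM-class model such as "sentence-transformers/all-MiniLM-L6-v2"
    # yields a 384-dimensional vector (an assumption about the configured model,
    # not something this module enforces).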
    def clean_text(self, text: str) -> str:
        """
        Clean and normalize input text by removing unwanted characters and
        trimming extra whitespace.
        """
        # Keep only word characters, whitespace, and common punctuation
        cleaned = re.sub(r"[^\w\s.,:;!?'\"-]", "", text)
        # Collapse runs of whitespace into single spaces and trim the ends
        cleaned = re.sub(r"\s+", " ", cleaned).strip()
        return cleaned
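    # Example: clean_text("Hello 🌍!!   #world") returns "Hello !! world" --
    # the emoji and '#' are stripped, and repeated whitespace collapses.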
    def validate_text_length(self, text: str, max_length: int = 300) -> bool:
        """
        Validate text length to reject empty or overly long inputs.

        Returns True if valid, False otherwise.
        """
        return 0 < len(text) <= max_length
    def extract_keywords(self, text: str) -> list:
        """
        Extracts keywords from the given text.

        This is a placeholder and can be enhanced with more sophisticated NLP
        techniques or another LLM for semantic keyword extraction.

        Args:
            text (str): The input text.

        Returns:
            list: A list of extracted keywords.
        """
        # Simple baseline: lowercase, split on whitespace, drop very short tokens.
        # For production, consider NLTK, spaCy, or an LLM-based keyword extractor.
        return [word for word in text.lower().split() if len(word) > 2]
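    # Example: extract_keywords("To be, or not to be") returns ["be,", "not"];
    # punctuation is not stripped, so "be," survives the length filter while
    # the bare two-letter tokens are dropped.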
    def generate_caption(self, prompt: str, max_new_tokens: int = 300) -> str:
        """
        Generates a caption based on the provided prompt, enriched with poetic
        or authoritative quotes.

        :param prompt: The input text prompt for text generation.
        :param max_new_tokens: The maximum number of new tokens to generate.
        :return: The generated caption.
        """
        # Craft a more detailed prompt so the model produces a fitting caption
        generation_prompt = (
            f"Write a detailed, poetic, and informative paragraph about the following topic:\n'{prompt}'.\n"
            "Use vivid, emotional language and include relevant verses or quotes by poets, philosophers, or scientists. "
            "The paragraph should be knowledgeable, well researched, and engaging. "
            "The tone should be educational and inspirational, not casual or conversational. "
            "Don't use emojis or hashtags or words like 'response' or 'answer'; just write the paragraph directly.\n"
        )
        generated_outputs = self.text_generator(
            generation_prompt,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,  # helps reduce repetitive loops
            num_return_sequences=1,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id
        )
        raw_output = generated_outputs[0]['generated_text']
        # The pipeline returns the prompt followed by the generated text,
        # so strip the prompt to keep only the newly generated material.
        if raw_output.startswith(generation_prompt):
            caption = raw_output[len(generation_prompt):].strip()
        else:
            caption = raw_output.strip()
        # Optionally collapse to the first line and normalize characters:
        # caption = self.clean_text(caption.split('\n')[0])
        return caption
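

# A minimal usage sketch (assumes utils/config points TEXT_MODEL_NAME at a small
# chat model and EMBEDDING_MODEL_NAME at a sentence-transformers checkpoint;
# loading real weights needs adequate RAM/GPU, so this is illustrative only):
if __name__ == "__main__":
    synthesizer = TextSynthesizer()
    raw = "  The sea   remembers #everything@ we forget!  "
    cleaned = synthesizer.clean_text(raw)
    if synthesizer.validate_text_length(cleaned):
        print("Keywords:", synthesizer.extract_keywords(cleaned))
        print("Embedding shape:", synthesizer.get_embedding(cleaned).shape)
        print("Caption:", synthesizer.generate_caption(cleaned))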