import streamlit as st
import polars as pl
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging, AutoModelForCausalLM
import torch
import os
import httpx
import languagecodes

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Language options and mappings
favourite_langs = {"Romanian": "ro", "German": "de", "English": "en", "-----": "-----"}
df = pl.read_parquet("isolanguages.parquet")
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos}  # {'Romanian': ('ro', 'rum', 'ron')}
name_to_iso1 = {iso[0]: iso[1] for iso in non_empty_isos}  # {'Romanian': 'ro', 'German': 'de'}
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos}  # {'ro': 'Romanian', 'de': 'German'}

# Language options as a list, favourite languages first
langs = list(favourite_langs.keys())
langs.extend(list(all_langs.keys()))


def timer(func):
    """Print the wall-clock execution time of the decorated function."""
    import time
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        execution_time = time.time() - start_time
        print(f"Function '{func.__name__}' executed in {execution_time:.4f} seconds.")
        return result
    return wrapper


models = [
    "Helsinki-NLP", "QUICKMT", "Argos", "Lego-MT/Lego-MT", "HPLT", "HPLT-OPUS", "Google",
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul",
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
    "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa",
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
    "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa",
    "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa",
    "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
    "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B",
    "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
    "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt",
    "facebook/mbart-large-50-many-to-one-mmt",
    "facebook/m2m100_418M", "facebook/m2m100_1.2B",
    "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
    "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
    "t5-small", "t5-base", "t5-large",
    "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
    "google/madlad400-3b-mt", "Heng666/madlad400-3b-mt-ct2",
    "Heng666/madlad400-3b-mt-ct2-int8", "Heng666/madlad400-7b-mt-ct2-int8",
    "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
    "Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
    "HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2", "tencent/Hunyuan-MT-7B",
    "openGPT-X/Teuken-7B-instruct-commercial-v0.4", "openGPT-X/Teuken-7B-instruct-v0.6",
]


class Translators:
    """One wrapper method per translation backend. Methods return the translated text,
    and some additionally return a status message."""

    def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
        self.model_name = model_name
        self.sl, self.tl = sl, tl
        self.input_text = input_text
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def google(self):
        url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
        response = httpx.get(url)
        return response.json()[0][0][0]
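    # Usage sketch (hypothetical endpoint, an assumption inferred from the indexing above):
    # google() expects the GCLIENT env var to hold a Google-Translate-style base URL ending
    # in a query separator, so that appending "sl=en&tl=de&q=Hello" returns JSON shaped
    # like [[["Hallo", ...]]], from which [0][0][0] extracts the translated string.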
    def hplt(self, opus=False):
        # HPLT v1.0 pairs all translate to or from English; covered languages:
        # ['ar', 'bs', 'ca', 'en', 'et', 'eu', 'fi', 'ga', 'gl', 'hi', 'hr', 'is', 'mt', 'nn', 'sq', 'sw', 'zh_hant']
        hplt_models = ['ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
                       'en-ga', 'en-gl', 'en-hi', 'en-hr', 'en-is', 'en-mt', 'en-nn', 'en-sq', 'en-sw',
                       'en-zh_hant', 'et-en', 'eu-en', 'fi-en', 'ga-en', 'gl-en', 'hi-en', 'hr-en',
                       'is-en', 'mt-en', 'nn-en', 'sq-en', 'sw-en', 'zh_hant-en']
        if opus:
            hplt_model = f'HPLT/translate-{self.sl}-{self.tl}-v1.0-hplt_opus'  # e.g. HPLT/translate-en-hr-v1.0-hplt_opus
        else:
            hplt_model = f'HPLT/translate-{self.sl}-{self.tl}-v1.0-hplt'  # e.g. HPLT/translate-en-hr-v1.0-hplt
        if f'{self.sl}-{self.tl}' in hplt_models:
            pipe = pipeline("translation", model=hplt_model, device=self.device)
            translation = pipe(self.input_text)
            translated_text = translation[0]['translation_text']
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {hplt_model}.'
        else:
            translated_text = f'HPLT model from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} not available!'
            message = f"Available models: {', '.join(hplt_models)}"
        return translated_text, message

    @staticmethod
    def quickmttranslate(model_path, input_text):
        from quickmt import Translator
        # "auto" auto-detects the GPU; set device="cpu" to force CPU inference
        translator = Translator(str(model_path), device="auto")
        # Set beam_size to 1 for faster (but lower-quality) decoding
        translation = translator(input_text, beam_size=5, max_input_length=512, max_decoding_length=512)
        return translation

    @staticmethod
    def quickmtdownload(model_name):
        from quickmt.hub import hf_download
        from pathlib import Path
        model_path = Path("/quickmt/models") / model_name
        if not model_path.exists():
            hf_download(model_name=f"quickmt/{model_name}", output_dir=model_path)
        return model_path

    def quickmt(self):
        model_name = f"quickmt-{self.sl}-{self.tl}"
        # List generated with:
        # from quickmt.hub import hf_list
        # quickmt_models = sorted(i.split("/quickmt-")[1] for i in hf_list())
        quickmt_models = ['ar-en', 'bn-en', 'cs-en', 'da-en', 'de-en', 'el-en', 'en-ar', 'en-bn', 'en-cs',
                          'en-de', 'en-el', 'en-es', 'en-fa', 'en-fr', 'en-he', 'en-hi', 'en-hu', 'en-id',
                          'en-it', 'en-ja', 'en-ko', 'en-lv', 'en-pl', 'en-pt', 'en-ro', 'en-ru', 'en-th',
                          'en-tr', 'en-ur', 'en-vi', 'en-zh', 'es-en', 'fa-en', 'fr-en', 'he-en', 'hi-en',
                          'hu-en', 'id-en', 'it-en', 'ja-en', 'ko-en', 'lv-en', 'pl-en', 'pt-en', 'ro-en',
                          'ru-en', 'th-en', 'tr-en', 'ur-en', 'vi-en', 'zh-en']
        # Derived with: sorted(set(lang for model in quickmt_models for lang in model.split('-')))
        available_languages = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fr', 'he', 'hi', 'hu',
                               'id', 'it', 'ja', 'ko', 'lv', 'pl', 'pt', 'ro', 'ru', 'th', 'tr', 'ur', 'vi', 'zh']
        # Direct translation model
        if f"{self.sl}-{self.tl}" in quickmt_models:
            model_path = Translators.quickmtdownload(model_name)
            translated_text = Translators.quickmttranslate(model_path, self.input_text)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
        # Pivot language English
        elif self.sl in available_languages and self.tl in available_languages:
            model_name = f"quickmt-{self.sl}-en"
            model_path = Translators.quickmtdownload(model_name)
            entranslation = Translators.quickmttranslate(model_path, self.input_text)
            model_name = f"quickmt-en-{self.tl}"
            model_path = Translators.quickmtdownload(model_name)
            translated_text = Translators.quickmttranslate(model_path, entranslation)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with Quickmt using pivot language English.'
        else:
            translated_text = f'No Quickmt model available for translation from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]}!'
            message = f"Available models: {', '.join(quickmt_models)}"
        return translated_text, message
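    # Example (illustrative): 'de-ro' is not in quickmt_models, but both 'de' and 'ro' appear in
    # available_languages, so Translators("QUICKMT", "de", "ro", text).quickmt() chains
    # quickmt-de-en and quickmt-en-ro and returns the pivoted translation.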
    @staticmethod
    def download_argos_model(from_code, to_code):
        import argostranslate.package
        print('Downloading model', from_code, to_code)
        # Download and install the Argos Translate package for this language pair
        argostranslate.package.update_package_index()
        available_packages = argostranslate.package.get_available_packages()
        package_to_install = next(
            filter(lambda x: x.from_code == from_code and x.to_code == to_code, available_packages)
        )
        argostranslate.package.install_from_path(package_to_install.download())

    def argos(self):
        import argostranslate.translate, argostranslate.package
        try:
            Translators.download_argos_model(self.sl, self.tl)  # Download and install the model
            translated_text = argostranslate.translate.translate(self.input_text, self.sl, self.tl)
        except StopIteration:
            # next() in download_argos_model found no package for this pair
            packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})"
                                      for pkg in argostranslate.package.get_available_packages())
            translated_text = (f"No Argos model for {self.sl} to {self.tl}. Try another model or language "
                               f"combination from the available Argos models: {packages_info}.")
        except Exception as error:
            translated_text = str(error)
        return translated_text
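    # Example (illustrative): Translators("Argos", "en", "de", "Hello").argos() installs the
    # en->de Argos package on first use and then translates offline; for an unsupported pair
    # the StopIteration branch returns the list of available pairs instead.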
    def hunyuan(self):
        # ZH_CODES = {"Chinese": "zh", "Traditional Chinese": "zh-Hant", "Cantonese": "yue"}
        # if self.sl in ZH_CODES.keys() or self.tl in ZH_CODES.keys():
        #     prompt = f"把下面的文本翻译成{self.tl},不要额外解释。\n\n{self.input_text}"
        # else:
        prompt = f"Translate the following segment into {self.tl}, without additional explanation.\n\n{self.input_text}"
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto", dtype=torch.bfloat16)
        systemprompt = {"role": "system",
                        "content": "You are a professional translator, translating in a formal tone and providing only the translation, no other comments or explanations"}
        messages = [systemprompt, {"role": "user", "content": prompt}]
        # Tokenize the conversation
        tokenized_chat = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        )
        # Generate the response; greedy decoding when temperature is 0
        temperature = 0.7
        with torch.no_grad():
            outputs = model.generate(
                tokenized_chat.to(model.device),
                max_new_tokens=512,
                temperature=temperature,
                top_k=20,
                top_p=0.95,
                repetition_penalty=1.05,
                do_sample=temperature > 0,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode only the newly generated tokens, skipping the prompt
        output_text = tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True)
        return output_text

    def simplepipe(self):
        try:
            pipe = pipeline("translation", model=self.model_name, device=self.device)
            translation = pipe(self.input_text)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
            return translation[0]['translation_text'], message
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try another available language combination or model.", error

    def HelsinkiNLP_mulroa(self):
        try:
            pipe = pipeline("translation", model=self.model_name, device=self.device)
            iso1to3 = {iso[1]: iso[3] for iso in non_empty_isos}  # {'ro': 'ron'}
            iso3tl = iso1to3.get(self.tl)  # 'deu', 'ron', 'eng', 'fra'
            # Multilingual OPUS models select the target language with a >>xxx<< prefix token
            translation = pipe(f'>>{iso3tl}<< {self.input_text}')
            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try another available language combination.", error
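    # Example (illustrative): with model Helsinki-NLP/opus-mt-tc-bible-big-mul-mul and tl='ro',
    # the pipeline input becomes '>>ron<< Hello world'; the >>xxx<< token names the ISO 639-3
    # code of the requested target language.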
    def HelsinkiNLP(self):
        try:
            # Standard bilingual model
            model_name = f"Helsinki-NLP/opus-mt-{self.sl}-{self.tl}"
            pipe = pipeline("translation", model=model_name, device=self.device)
            translation = pipe(self.input_text)
            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
        except EnvironmentError:
            try:
                # Tatoeba models
                model_name = f"Helsinki-NLP/opus-tatoeba-{self.sl}-{self.tl}"
                pipe = pipeline("translation", model=model_name, device=self.device)
                translation = pipe(self.input_text)
                return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
            except EnvironmentError:
                # Last resort: try the big multilingual-to-multilingual model
                self.model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"
                return self.HelsinkiNLP_mulroa()
        except KeyError as error:
            return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error

    def LLaMAX(self):
        pipe = pipeline("text-generation", model="LLaMAX/LLaMAX3-8B")
        messages = [
            {"role": "user", "content": f"Translate the following text from {self.sl} to {self.tl}: {self.input_text}"},
        ]
        return pipe(messages)[0]["generated_text"]

    def LegoMT(self):
        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)  # "Lego-MT/Lego-MT"
        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
        tokenizer.src_lang = self.sl
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    def madlad(self):
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        # MADLAD-400 selects the target language with a <2xx> prefix token
        text = f"<2{self.tl}> {self.input_text}"
        # Use a pipeline as a high-level helper
        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
        translated_text = translator(text, max_length=512)
        return translated_text[0]['translation_text']

    def madladct2(self):
        import ctranslate2
        from sentencepiece import SentencePieceProcessor
        from huggingface_hub import snapshot_download
        model_path = snapshot_download(self.model_name)
        tokenizer = SentencePieceProcessor()
        tokenizer.load(f"{model_path}/spiece.model")
        translator = ctranslate2.Translator(model_path)
        input_tokens = tokenizer.encode(f"<2{self.tl}> {self.input_text}", out_type=str)
        results = translator.translate_batch(
            [input_tokens],
            batch_type="tokens",
            max_batch_size=512,
            beam_size=1,
            no_repeat_ngram_size=1,
            repetition_penalty=2,
        )
        translated_text = tokenizer.decode(results[0].hypotheses[0])
        return translated_text

    def smollm(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        prompt = f"""Translate the following {self.sl} text to {self.tl}, generating only the translated text and maintaining the original meaning and tone: {self.input_text} Translation:"""
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(
            inputs.input_ids,
            max_length=len(inputs.input_ids[0]) + 150,
            temperature=0.3,
            do_sample=True
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(response)
        # The decoded output repeats the prompt; keep only the text after "Translation:"
        return response.split("Translation:")[-1].strip()

    def flan(self):
        tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(self.model_name)
        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        outputs = model.generate(input_ids)
        return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
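    # Note: the flan/t5 prompts (here and in tfive below) spell out full language names,
    # e.g. "translate English to German: Hello", which is why translate_text passes
    # s_language/t_language rather than ISO codes to these methods.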
    def tfive(self):
        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output_ids = model.generate(input_ids, max_length=512)  # Perform the translation
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()  # Decode the translated text
        return translated_text

    def mbart_many_to_many(self):
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
        # Translate source to target
        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[self.tl]]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    def mbart_one_to_many(self):
        # Translate from English
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name, src_lang="en_XX")
        model_inputs = tokenizer(self.input_text, return_tensors="pt")
        langid = languagecodes.mbart_large_languages[self.tl]
        generated_tokens = model.generate(
            **model_inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[langid]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    def mbart_many_to_one(self):
        # Translate to English
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded)
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    def mtom(self):
        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
        tokenizer.src_lang = self.sl
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    def bigscience(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        # Ensure the input ends with a full stop so the model sees a complete sentence
        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
        outputs = model.generate(inputs)
        translation = tokenizer.decode(outputs[0])
        # Strip special-token markers; the literals were lost in the source, <pad> and </s> assumed
        translation = translation.replace('<pad> ', '').replace('</s>', '')
        return translation
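    # A raw tokenizer.decode() of a T5-style mt0 output typically looks like
    # "<pad> Hallo Welt.</s>"; the replace() calls above strip those markers
    # (illustrative; the exact markers depend on the tokenizer).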
    def bloomz(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        # Ensure the input ends with a full stop so the model sees a complete sentence
        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
        # inputs = tokenizer.encode(f"Translate from {self.sl} to {self.tl}: {self.input_text} Translation:", return_tensors="pt")
        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
        outputs = model.generate(inputs)
        translation = tokenizer.decode(outputs[0])
        # Strip special-token markers; the literals were lost in the source, <pad> and </s> assumed
        translation = translation.replace('<pad> ', '').replace('</s>', '')
        translation = translation.split('Translation:')[-1].strip() if 'Translation:' in translation else translation.strip()
        return translation

    def nllb(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=self.sl)
        # model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, device_map="auto", torch_dtype=torch.bfloat16)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
        translated_text = translator(self.input_text, max_length=512)
        return translated_text[0]['translation_text']

    def wingpt(self):
        model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        messages = [
            {"role": "system", "content": f"Translate this to {self.tl} language"},
            {"role": "user", "content": self.input_text}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        generated_ids = model.generate(**model_inputs, max_new_tokens=512, temperature=0.1)
        # Keep only the newly generated tokens, dropping the prompt
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        result = output.split('\n')[-1].strip() if '\n' in output else output.strip()
        return result

    def eurollm(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        prompt = f"{self.sl}: {self.input_text} {self.tl}:"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=512)
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(output)
        result = output.rsplit(f'{self.tl}:')[-1].strip() if '\n' in output or f'{self.tl}:' in output else output.strip()
        return result

    def eurollm_instruct(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        text = (f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\n'
                f'Translate the following {self.sl} source text to {self.tl}:\n'
                f'{self.sl}: {self.input_text} \n{self.tl}: <|im_end|>\n<|im_start|>assistant\n')
        inputs = tokenizer(text, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=512)
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if f'{self.tl}:' in output:
            output = output.rsplit(f'{self.tl}:')[-1].strip().replace('assistant\n', '').strip()
        return output
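    # The two EuroLLM wrappers above differ only in prompting: eurollm() relies on bare
    # continuation (given "English: Hello world. German:" the base model is expected to
    # continue with the translation), while eurollm_instruct() wraps the same request in
    # the model's ChatML-style template (illustrative).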
    def teuken(self):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        model = model.to(device).eval()
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_fast=False,
            trust_remote_code=True,
        )
        translation_prompt = f"Translate the following text from {self.sl} into {self.tl}: {self.input_text}"
        messages = [{"role": "User", "content": translation_prompt}]
        prompt_ids = tokenizer.apply_chat_template(messages, chat_template="EN", tokenize=True,
                                                   add_generation_prompt=False, return_tensors="pt")
        prediction = model.generate(
            prompt_ids.to(model.device),
            max_length=512,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1,
        )
        translation = tokenizer.decode(prediction[0].tolist())
        return translation

    def unbabel(self):
        pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")
        messages = [{"role": "user",
                     "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text}.\n{self.tl}:"}]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        # Budget the output length at roughly 1.75x the input token count
        tokenized_input = pipe.tokenizer(self.input_text, return_tensors="pt")
        num_input_tokens = len(tokenized_input["input_ids"][0])
        max_new_tokens = round(num_input_tokens + 0.75 * num_input_tokens)
        outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
        translated_text = outputs[0]["generated_text"]
        print(f"Input chars: {len(self.input_text)}", f"Input tokens: {num_input_tokens}",
              f"max_new_tokens: {max_new_tokens}",
              "Chars to tokens ratio:", round(len(self.input_text) / num_input_tokens, 2),
              f"Raw translation: {translated_text}")
        # Cut the generated text down to the assistant's answer; the first marker literal
        # was lost in the source, </s> assumed
        markers = ["</s>", "<|im_end|>", "<|im_start|>assistant"]  # , "\n"
        for marker in markers:
            if marker in translated_text:
                translated_text = translated_text.split(marker)[1].strip()
        translated_text = translated_text.replace('Answer:', '', 1).strip() if translated_text.startswith('Answer:') else translated_text
        translated_text = translated_text.split("Translated text:")[0].strip() if "Translated text:" in translated_text else translated_text
        # Keep only as many lines as the input text had
        split_translated_text = translated_text.split('\n')
        translated_text = '\n'.join(split_translated_text[:self.input_text.count('\n') + 1])
        return translated_text

    def bergamot(self, model_name: str = 'deen'):
        try:
            import bergamot
            # input_text = [input_text] if isinstance(input_text, str) else input_text
            config = bergamot.ServiceConfig(numWorkers=4)
            service = bergamot.Service(config)
            model = service.modelFromConfigPath(f"./{model_name}/bergamot.config.yml")
            options = bergamot.ResponseOptions(alignment=False, qualityScores=False, HTML=False)
            rawresponse = service.translate(model, bergamot.VectorString(self.input_text), options)
            translated_text: str = next(iter(rawresponse)).target.text
            message_text = f"Translated from {self.sl} to {self.tl} with Bergamot {model_name}."
        except Exception as error:
            translated_text = str(error)
            message_text = f"Bergamot translation from {self.sl} to {self.tl} failed."
        return translated_text, message_text
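# Usage sketch (hypothetical values; in this app the class is driven via translate_text below):
# translator = Translators("Helsinki-NLP", "en", "de", "Hello world")
# text, message = translator.HelsinkiNLP()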
@timer
def translate_text(model_name: str, s_language: str, t_language: str, input_text: str) -> tuple[str, str]:
    """
    Translates the input text from the source language to the target language using the specified model.

    Parameters:
        model_name (str): The selected translation model name
        s_language (str): The source language of the input text
        t_language (str): The target language to translate the input text into
        input_text (str): The source text to be translated

    Returns:
        tuple:
            translated_text (str): The input text translated into the selected target language
            message_text (str): A descriptive message summarizing the translation process,
                e.g. "Translated from English to German with Helsinki-NLP."

    Example:
        >>> translate_text("Helsinki-NLP", "English", "German", "Hello world")
        ("Hallo Welt", "Translated from English to German with Helsinki-NLP.")
    """
    sl = all_langs[s_language][0]
    tl = all_langs[t_language][0]
    if not input_text.strip():
        return 'No input text entered!', 'Please enter a text to translate!'
    if sl == tl:
        translated_text = f'Source language {s_language} identical to target language {t_language}!'
        message_text = 'Please choose different target and source languages!'
        return translated_text, message_text

    message_text = f'Translated from {s_language} to {t_language} with {model_name}'
    translated_text = None
    try:
        if model_name == "Helsinki-NLP/opus-mt-tc-bible-big-roa-en":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).simplepipe()
        elif "-mul" in model_name.lower() or "mul-" in model_name.lower() or "-roa" in model_name.lower():
            translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP_mulroa()
        elif model_name == "Helsinki-NLP":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
        elif model_name == "QUICKMT":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).quickmt()
        elif "HPLT" in model_name:
            translated_text, message_text = Translators(model_name, sl, tl, input_text).hplt(opus=(model_name == "HPLT-OPUS"))
        elif model_name == 'Argos':
            translated_text = Translators(model_name, sl, tl, input_text).argos()
        elif model_name == 'Google':
            translated_text = Translators(model_name, sl, tl, input_text).google()
        elif "m2m" in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).mtom()
        elif "lego" in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).LegoMT()
        elif model_name.startswith('t5'):
            translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
        elif 'flan' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).flan()
        elif 'mt-ct2' in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).madladct2()
        elif 'madlad' in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).madlad()
        elif 'mt0' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
        elif 'bloomz' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).bloomz()
        elif 'nllb' in model_name.lower():
            nnlbsl, nnlbtl = languagecodes.nllb_language_codes[s_language], languagecodes.nllb_language_codes[t_language]
            translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
        elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_many()
        elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_one_to_many()
        elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_one()
        elif 'teuken' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).teuken()
        elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm_instruct()
        elif model_name == "utter-project/EuroLLM-1.7B":
            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm()
        elif 'Unbabel' in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
        elif model_name == "HuggingFaceTB/SmolLM3-3B":
            translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
        elif model_name == "winninghealth/WiNGPT-Babel-2":
            translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
        elif "LLaMAX" in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).LLaMAX()
        elif model_name == "Bergamot":
            translated_text, message_text = Translators(model_name, s_language, t_language, input_text).bergamot()
        elif "Hunyuan" in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).hunyuan()
    except Exception as error:
        translated_text = str(error)
    finally:
        print(input_text, translated_text, message_text)
    return translated_text, message_text
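# Example call (illustrative output, matching the docstring above):
# translated, message = translate_text("Helsinki-NLP", "English", "German", "Hello world")
# -> ("Hallo Welt", "Translated from English to German with Helsinki-NLP/opus-mt-en-de.")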
# App layout
st.header("Text Machine Translation", divider="gray",
          help="Text Machine Translation Streamlit App with Open Source Models")
input_text = st.text_area("Enter text to translate:",
                          placeholder="Enter text to translate, maximum 512 characters!", max_chars=512)

# Initialize session state if not already set
if "sselected_language" not in st.session_state:
    st.session_state["sselected_language"] = langs[0]
if "tselected_language" not in st.session_state:
    st.session_state["tselected_language"] = langs[1]
if "model_name" not in st.session_state:
    st.session_state["model_name"] = models[1]

# Model selection first, so a language swap can preserve it
model_name = st.selectbox("Select a model:", models, index=models.index(st.session_state["model_name"]))

# Columns for language selection
scol, swapcol, tcol = st.columns([3, 1, 3])
with scol:
    sselected_language = st.selectbox("Source language:", langs, index=langs.index(st.session_state["sselected_language"]))
with swapcol:
    if st.button("🔄 Swap"):
        st.session_state["model_name"] = model_name  # Preserve the model across the rerun
        st.session_state["sselected_language"], st.session_state["tselected_language"] = \
            st.session_state["tselected_language"], st.session_state["sselected_language"]
        st.rerun()
with tcol:
    tselected_language = st.selectbox("Target language:", langs, index=langs.index(st.session_state["tselected_language"]))

# Language codes
sl = name_to_iso1[st.session_state["sselected_language"]]
tl = name_to_iso1[st.session_state["tselected_language"]]

# Store selections
st.session_state["sselected_language"] = sselected_language
st.session_state["tselected_language"] = tselected_language
st.session_state["model_name"] = model_name

st.write(f'Selected language combination: {sselected_language} - {tselected_language}. Selected model: {model_name}')
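# Note: the swap button writes the exchanged languages back to st.session_state and calls
# st.rerun(), so both selectboxes are rebuilt with the swapped values on the next run
# while the selected model is preserved.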
with st.container(border=None, width="stretch", height="content", horizontal=False,
                  horizontal_alignment="center", vertical_alignment="center", gap="small"):
    submit_button = st.button("Translate")

# Handle the submit button click
if submit_button:
    with st.spinner("Translating...", show_time=True):
        translated_text, message = translate_text(model_name, sselected_language, tselected_language, input_text)
        print(f"Translated from {sselected_language} to {tselected_language} using {model_name}.",
              input_text, translated_text)
    # Display the translated text and the status message
    st.text_area(":green[Translation:]", value=translated_text)
    st.info(message, icon=":material/info:")