Spaces:

Solo448
/

speechT5-TTS-hi

Running

App Files Files Community

Solo448 committed on Oct 20, 2024

Commit

d8c0142

verified ·

1 Parent(s): ec45def

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -6

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from speechbrain.inference import EncoderClassifier
 # Load models and processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/SpeechT5-fine-tune-en")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 # Load speaker encoder
@@ -21,7 +21,7 @@ speaker_model = EncoderClassifier.from_hparams(
 # Load a sample from the dataset for speaker embedding
 try:
-    dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
     sample = dataset[0]
     speaker_embedding = create_speaker_embedding(sample['audio']['array'])
@@ -40,8 +40,72 @@ def create_speaker_embedding(waveform):
 def text_to_speech(text):
     # Clean up text
     replacements = [
-        ('0', 'zero'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
-        ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine')
     ]
     for src, dst in replacements:
         text = text.replace(src, dst)
@@ -54,8 +118,8 @@ iface = gr.Interface(
     fn=text_to_speech,
     inputs="text",
     outputs="audio",
-    title="Technical Text-to-Speech",
-    description="Enter technical text to convert to speech. The model has been fine-tuned on technical data."
 )
 iface.launch()

 # Load models and processor
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/Speect5-common-voice-Hindi")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 # Load speaker encoder
 # Load a sample from the dataset for speaker embedding
 try:
+    dataset = load_dataset("mozilla-foundation/common_voice_17_0", "hi", split="validated", trust_remote_code=True)
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
     sample = dataset[0]
     speaker_embedding = create_speaker_embedding(sample['audio']['array'])
 def text_to_speech(text):
     # Clean up text
     replacements = [
+    ("अ", "a"),
+    ("आ", "aa"),
+    ("इ", "i"),
+    ("ई", "ee"),
+    ("उ", "u"),
+    ("ऋ", "ri"),
+    ("ए", "ae"),
+    ("ऐ", "ai"),
+    ("ऑ", "au"),
+    ("ओ", "o"),
+    ("औ", "au"),
+    ("क", "k"),
+    ("ख", "kh"),
+    ("ग", "g"),
+    ("घ", "gh"),
+    ("च", "ch"),
+    ("छ", "chh"),
+    ("ज", "j"),
+    ("झ", "jh"),
+    ("ञ", "gna"),
+    ("ट", "t"),
+    ("ठ", "th"),
+    ("ड", "d"),
+    ("ढ", "dh"),
+    ("ण", "nr"),
+    ("त", "t"),
+    ("थ", "th"),
+    ("द", "d"),
+    ("ध", "dh"),
+    ("न", "n"),
+    ("प", "p"),
+    ("फ", "ph"),
+    ("ब", "b"),
+    ("भ", "bh"),
+    ("म", "m"),
+    ("य", "ya"),
+    ("र", "r"),
+    ("ल", "l"),
+    ("व", "w"),
+    ("श", "sha"),
+    ("ष", "sh"),
+    ("स", "s"),
+    ("ह", "ha"),
+    ("़", "ng"),
+    ("्", ""),
+    ("ऽ", ""),
+    ("ा", "a"),
+    ("ि", "i"),
+    ("ी", "ee"),
+    ("ु", "u"),
+    ("ॅ", "n"),
+    ("े", "e"),
+    ("ै", "oi"),
+    ("ो", "o"),
+    ("ौ", "ou"),
+    ("ॅ", "n"),
+    ("ॉ", "r"),
+    ("ू", "uh"),
+    ("ृ", "ri"),
+    ("ं", "n"),
+    ("क़", "q"),
+    ("ज़", "z"),
+    ("ड़", "r"),
+    ("ढ़", "rh"),
+    ("फ़", "f"),
+    ("|", ".")
     ]
     for src, dst in replacements:
         text = text.replace(src, dst)
     fn=text_to_speech,
     inputs="text",
     outputs="audio",
+    title="Hindi Text-to-Speech",
+    description="Enter hindi text to convert to speech"
 )
 iface.launch()