Mr7Explorer committed on
Commit f34f567 · verified · 1 Parent(s): 706745d

Update app.py

Files changed (1):
  1. app.py +141 -76
app.py CHANGED
@@ -6,8 +6,10 @@ import base64
 import logging
 from scipy.io import wavfile
 from typing import Tuple, Dict, Any
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, AutoModel
 from parler_tts import ParlerTTSForConditionalGeneration
+from huggingface_hub import hf_hub_download
+import os
 
 # --- Logging ---
 logging.basicConfig(level=logging.INFO)
@@ -16,25 +18,26 @@ logger.addHandler(logging.StreamHandler())
 
 # --- TTS Wrapper ---
 class IndicParlerTTS:
-    def __init__(self, model_name: str = "ai4bharat/indic-parler-tts"):
+    def __init__(self, model_type: str = "parler", model_name: str = "ai4bharat/indic-parler-tts"):
+        self.model_type = model_type  # "parler" or "indicf5"
         self.model_name = model_name
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model = None
         self.tokenizer = None
         self.description_tokenizer = None
-        self.sample_rate = 24000  # Default for Parler-TTS
+        self.sample_rate = 24000  # Default for both models
+        self.ref_audio_path = None
+        self.ref_text = None
         self._load_model()
 
-        # Supported languages (expanded from model card)
+        # Supported languages (restricted to the set IndicF5 also covers)
         self.language_codes = {
-            "as": "Assamese", "bn": "Bengali", "brx": "Bodo", "doi": "Dogri",
-            "en": "English", "gu": "Gujarati", "hi": "Hindi", "kn": "Kannada",
-            "kok": "Konkani", "mai": "Maithili", "ml": "Malayalam", "mni": "Manipuri",
-            "mr": "Marathi", "ne": "Nepali", "or": "Odia", "sa": "Sanskrit",
-            "sat": "Santali", "sd": "Sindhi", "ta": "Tamil", "te": "Telugu", "ur": "Urdu"
+            "as": "Assamese", "bn": "Bengali", "gu": "Gujarati", "hi": "Hindi",
+            "kn": "Kannada", "ml": "Malayalam", "mr": "Marathi", "or": "Odia",
+            "pa": "Punjabi", "ta": "Tamil", "te": "Telugu"
         }
 
-        # Voice style mappings to descriptive terms
+        # Voice style mappings to descriptive terms (for Parler-TTS)
         self.voice_map = {
             "neutral": "neutral",
             "formal": "formal and clear",
@@ -43,74 +46,116 @@ class IndicParlerTTS:
             "emotional": "emotional and varied"
         }
 
+        # For IndicF5, map voices to reference prompts (simplified; expand as needed)
+        self.ref_map = {
+            "neutral": ("prompts/PAN_F_HAPPY_00001.wav", "ਭਹੰਪੀ ਵਿੱਚ ਸਮਾਰਕਾਂ ਦੇ ਭਵਨ ਨਿਰਮਾਣ ਕਲਾ ਦੇ ਵੇਰਵੇ ਗੁੰਝਲਦਾਰ ਅਤੇ ਹੈਰਾਨ ਕਰਨ ਵਾਲੇ ਹਨ, ਜੋ ਮੈਨੂੰ ਖੁਸ਼ ਕਰਦੇ ਹਨ।"),
+            # Add more mappings, e.g., for other styles/languages from prompts/
+            # "formal": ("path/to/formal.wav", "ref text"),
+        }
+
     def _load_model(self):
         try:
-            logger.info(f"Loading {self.model_name} on {self.device}")
-            self.model = ParlerTTSForConditionalGeneration.from_pretrained(
-                self.model_name
-            ).to(self.device)
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            try:
-                self.description_tokenizer = AutoTokenizer.from_pretrained(
-                    self.model.config.text_encoder._name_or_path
-                )
-            except Exception:
-                logger.warning("Falling back to main tokenizer for descriptions")
-                self.description_tokenizer = self.tokenizer
-            self.sample_rate = self.model.config.sampling_rate
-            logger.info("✅ Real Indic Parler-TTS model loaded")
+            if self.model_type == "parler":
+                logger.info(f"Loading Indic Parler-TTS ({self.model_name}) on {self.device}")
+                self.model = ParlerTTSForConditionalGeneration.from_pretrained(
+                    self.model_name
+                ).to(self.device)
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+                try:
+                    self.description_tokenizer = AutoTokenizer.from_pretrained(
+                        self.model.config.text_encoder._name_or_path
+                    )
+                except Exception:
+                    logger.warning("Falling back to main tokenizer for descriptions")
+                    self.description_tokenizer = self.tokenizer
+                self.sample_rate = self.model.config.sampling_rate
+                logger.info("✅ Indic Parler-TTS loaded")
+            elif self.model_type == "indicf5":
+                logger.info(f"Loading IndicF5 on {self.device}")
+                self.model = AutoModel.from_pretrained("ai4bharat/IndicF5", trust_remote_code=True).to(self.device)
+
+                # Download the default (neutral) reference prompt from the model repo
+                default_ref_path = "prompts/PAN_F_HAPPY_00001.wav"
+                self.ref_audio_path = hf_hub_download(
+                    repo_id="ai4bharat/IndicF5",
+                    filename=default_ref_path,
+                    local_dir="./prompts"
+                )
+                self.ref_text = "ਭਹੰਪੀ ਵਿੱਚ ਸਮਾਰਕਾਂ ਦੇ ਭਵਨ ਨਿਰਮਾਣ ਕਲਾ ਦੇ ਵੇਰਵੇ ਗੁੰਝਲਦਾਰ ਅਤੇ ਹੈਰਾਨ ਕਰਨ ਵਾਲੇ ਹਨ, ਜੋ ਮੈਨੂੰ ਖੁਸ਼ ਕਰਦੇ ਹਨ।"
+
+                # Other voices are resolved via ref_map in generate()
+                self.sample_rate = 24000
+                logger.info("✅ IndicF5 loaded with default reference")
+            else:
+                raise ValueError(f"Unsupported model_type: {self.model_type}")
         except Exception as e:
-            logger.exception("Failed to load Indic Parler-TTS model")
+            logger.exception(f"Failed to load {self.model_type} model")
             self.model = None
-            self.tokenizer = None
-            self.description_tokenizer = None
 
     def generate(self, text: str, language: str = "hi", voice: str = "neutral",
                  pitch: float = 1.0, speed: float = 1.0, emotion: float = 0.5,
-                 reverb: float = 0.0, background_noise: float = 0.0) -> Tuple[np.ndarray, int]:
+                 reverb: float = 0.0) -> Tuple[np.ndarray, int]:
         """
-        Generate speech using the real Parler-TTS model.
+        Generate speech using the selected model.
         Returns int16 numpy audio and sample rate.
+        For IndicF5: uses reference-based generation for humanized output.
         """
-        if self.model is None or self.tokenizer is None:
+        if self.model is None:
             raise RuntimeError("Model not available")
 
         if not text.strip():
             raise ValueError("Empty text provided")
 
-        # Construct descriptive caption based on parameters
-        full_lang = self.language_codes.get(language, "Indian")
-        voice_desc = self.voice_map.get(voice, "neutral")
-
-        pitch_desc = "high" if pitch > 1.2 else "low" if pitch < 0.8 else "balanced"
-        speed_desc = "fast" if speed > 1.3 else "slow" if speed < 0.7 else "moderate"
-        emotion_desc = "highly expressive" if emotion > 0.7 else "slightly expressive" if emotion > 0.3 else "neutral"
-        reverb_desc = "with noticeable reverb as if in a room" if reverb > 0.5 else "clear and close-up"
-        noise_desc = "with some background noise" if background_noise > 0.5 else "in a quiet environment"
-
-        description = (
-            f"A {full_lang} speaker with a {voice_desc} voice, {pitch_desc} pitch, "
-            f"{speed_desc} speaking pace, {emotion_desc} delivery, {reverb_desc}, {noise_desc}."
-        )
-
-        # Tokenize
-        prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
-        prompt_attention_mask = self.tokenizer(text, return_tensors="pt").attention_mask.to(self.device)
-
-        description_input_ids = self.description_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
-        description_attention_mask = self.description_tokenizer(description, return_tensors="pt").attention_mask.to(self.device)
-
-        with torch.no_grad():
-            audio_tensor = self.model.generate(
-                input_ids=description_input_ids,
-                attention_mask=description_attention_mask,
-                prompt_input_ids=prompt_input_ids,
-                prompt_attention_mask=prompt_attention_mask
-            )
-
-        audio = audio_tensor.cpu().numpy().squeeze()
-
-        # Convert float32 [-1,1] → int16
+        if self.model_type == "parler":
+            # Parler-TTS: style is driven by a text description
+            full_lang = self.language_codes.get(language, "Indian")
+            voice_desc = self.voice_map.get(voice, "neutral")  # safe get to avoid KeyError
+
+            pitch_desc = "high" if pitch > 1.2 else "low" if pitch < 0.8 else "balanced"
+            speed_desc = "fast" if speed > 1.3 else "slow" if speed < 0.7 else "moderate"
+            emotion_desc = "highly expressive" if emotion > 0.7 else "slightly expressive" if emotion > 0.3 else "neutral"
+            reverb_desc = "with noticeable reverb as if in a room" if reverb > 0.5 else "clear and close-up"
+
+            description = (
+                f"A {full_lang} speaker with a {voice_desc} voice, {pitch_desc} pitch, "
+                f"{speed_desc} speaking pace, {emotion_desc} delivery, {reverb_desc}."
+            )
+
+            # Tokenize prompt and description separately
+            prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+            prompt_attention_mask = self.tokenizer(text, return_tensors="pt").attention_mask.to(self.device)
+
+            description_input_ids = self.description_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
+            description_attention_mask = self.description_tokenizer(description, return_tensors="pt").attention_mask.to(self.device)
+
+            with torch.no_grad():
+                audio_tensor = self.model.generate(
+                    input_ids=description_input_ids,
+                    attention_mask=description_attention_mask,
+                    prompt_input_ids=prompt_input_ids,
+                    prompt_attention_mask=prompt_attention_mask
+                )
+
+            audio = audio_tensor.cpu().numpy().squeeze()
+
+        elif self.model_type == "indicf5":
+            # IndicF5: voice style comes from a reference prompt (humanized output);
+            # fall back to the default reference when the voice has no mapping
+            ref_path, ref_txt = self.ref_map.get(voice, (self.ref_audio_path, self.ref_text))
+
+            with torch.no_grad():
+                audio_float = self.model(
+                    text,
+                    ref_audio_path=ref_path,
+                    ref_text=ref_txt
+                )
+
+            # Normalize int16 output to float32 if needed (per the model card example)
+            if audio_float.dtype == np.int16:
+                audio_float = audio_float.astype(np.float32) / 32768.0
+            audio = audio_float
+
+        # Common post-processing: float32 [-1, 1] → int16
         audio = np.clip(audio, -1.0, 1.0)
         audio_int16 = (audio * 32767).astype(np.int16)
 
@@ -126,13 +171,20 @@ def wav_bytes_from_numpy(audio_np: np.ndarray, sample_rate: int) -> bytes:
 def encode_wav_base64(audio_bytes: bytes) -> str:
     return base64.b64encode(audio_bytes).decode("utf-8")
 
-# Instantiate TTS
-tts = IndicParlerTTS()
+# Instantiate TTS (default to Parler; re-instantiated when the selection changes)
+def get_tts(model_type):
+    if model_type == "indicf5":
+        return IndicParlerTTS(model_type="indicf5")
+    else:
+        return IndicParlerTTS(model_type="parler")
+
+tts = get_tts("parler")
 
 # --- Gradio functions / API functions ---
-def synthesize_speech(text: str, language: str, voice: str,
+def synthesize_speech(text: str, model_type: str, language: str, voice: str,
                       pitch: float, speed: float, emotion: float,
-                      reverb: float, background_noise: float):
+                      reverb: float):
+    global tts
     try:
         if not text or not text.strip():
             return None, "Please enter text to synthesize."
@@ -140,26 +192,34 @@ def synthesize_speech(text: str, language: str, voice: str,
         if len(text) > 4000:
             return None, "Text too long. Maximum 4000 characters supported."
 
+        # Map the dropdown's display name to the internal key; reload if it changed
+        model_type = "indicf5" if "IndicF5" in model_type else "parler"
+        if tts.model_type != model_type:
+            tts = get_tts(model_type)
         audio_np, sr = tts.generate(text=text, language=language, voice=voice,
                                     pitch=pitch, speed=speed, emotion=emotion,
-                                    reverb=reverb, background_noise=background_noise)
 
-        return (sr, audio_np), "Speech generated successfully."
+                                    reverb=reverb)
+
+        model_note = " (Parler-TTS: style via description)" if model_type == "parler" else " (IndicF5: humanized via reference voice)"
+        return (sr, audio_np), f"Speech generated successfully{model_note}."
     except Exception as e:
         logger.exception("Error in synthesize_speech:")
         return None, f"Error: {str(e)}"
 
-def api_synthesize(text: str, language: str = "hi", voice: str = "neutral",
+def api_synthesize(text: str, model_type: str = "parler", language: str = "hi", voice: str = "neutral",
                    pitch: float = 1.0, speed: float = 1.0, emotion: float = 0.5,
-                   reverb: float = 0.0, background_noise: float = 0.0) -> Dict[str, Any]:
+                   reverb: float = 0.0) -> Dict[str, Any]:
+    global tts
     try:
         if not text or not text.strip():
             return {"error": "Please provide non-empty text."}
 
+        if tts.model_type != model_type:
+            tts = get_tts(model_type)
+
         audio_np, sr = tts.generate(text=text, language=language, voice=voice,
                                     pitch=float(pitch), speed=float(speed),
-                                    emotion=float(emotion), reverb=float(reverb),
-                                    background_noise=float(background_noise))
+                                    emotion=float(emotion), reverb=float(reverb))
 
         wav_bytes = wav_bytes_from_numpy(audio_np, sr)
         return {
@@ -196,20 +256,25 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(label="Enter Text", placeholder="Type text here...", lines=4)
+            model_dropdown = gr.Dropdown(
+                choices=["Indic Parler-TTS", "IndicF5 (Humanized)"],
+                value="Indic Parler-TTS",
+                label="Model",
+                info="Parler-TTS: style via text description. IndicF5: near-human output via a reference voice."
+            )
             language_dropdown = gr.Dropdown(
                 choices=list(tts.language_codes.keys()),
                 value="hi",
                 label="Language (code)",
-                info="Select language code (e.g. hi, bn, ta, ur...). Model auto-detects from text, but code helps with voice description."
+                info="Select language code (e.g. hi, bn, ta). Model auto-detects from text."
             )
             voice_dropdown = gr.Dropdown(choices=["neutral", "formal", "casual", "expressive", "emotional"],
                                          value="neutral", label="Voice Style")
         with gr.Column(scale=1):
-            pitch_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Pitch")
-            speed_slider = gr.Slider(0.3, 3.0, value=1.0, step=0.1, label="Speed")
-            emotion_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Emotion")
-            reverb_slider = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Reverb")
-            noise_slider = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Background Noise")
+            pitch_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Pitch (normal: 1.0)")
+            speed_slider = gr.Slider(0.3, 3.0, value=1.0, step=0.1, label="Speed (normal: 1.0)")
+            emotion_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.1, label="Emotion (normal: 0.5)")
+            reverb_slider = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Reverb (normal: 0.0)")
 
     with gr.Row():
         generate_btn = gr.Button("🎵 Generate Speech", variant="primary")
@@ -222,7 +287,7 @@ with gr.Blocks(
     # Bind UI
     generate_btn.click(
         fn=synthesize_speech,
-        inputs=[text_input, language_dropdown, voice_dropdown, pitch_slider, speed_slider, emotion_slider, reverb_slider, noise_slider],
+        inputs=[text_input, model_dropdown, language_dropdown, voice_dropdown, pitch_slider, speed_slider, emotion_slider, reverb_slider],
         outputs=[audio_output, status_output]
     )
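For consumers of the new api_synthesize endpoint, decoding the returned audio is a one-liner: encode_wav_base64 wraps the complete WAV container produced by wav_bytes_from_numpy. A minimal client sketch, assuming the response dict exposes the encoded audio under an "audio_base64" key (the actual key names are truncated in this diff, so that name is an assumption):

import base64

def save_wav_from_response(resp: dict, out_path: str = "out.wav") -> None:
    # "audio_base64" is a hypothetical key; check the actual api_synthesize return dict
    wav_bytes = base64.b64decode(resp["audio_base64"])
    # wav_bytes is already a complete WAV file (see wav_bytes_from_numpy), so write it directly
    with open(out_path, "wb") as f:
        f.write(wav_bytes)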
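The ref_map added above ships with only the neutral Punjabi prompt. A minimal sketch of how it could be extended, assuming further prompt files exist under prompts/ in the ai4bharat/IndicF5 repo (the "formal" filename and transcript below are hypothetical placeholders):

from huggingface_hub import hf_hub_download

# Each entry pairs a repo-relative WAV path with the transcript of that clip;
# the filename here is a hypothetical placeholder, not a confirmed repo file
EXTRA_REFS = {
    "formal": ("prompts/FORMAL_EXAMPLE.wav", "transcript matching the clip"),
}

def download_refs(ref_specs: dict) -> dict:
    # Resolve each repo-relative path to a local file, keeping the transcript alongside it
    resolved = {}
    for voice, (path, transcript) in ref_specs.items():
        local = hf_hub_download(repo_id="ai4bharat/IndicF5", filename=path, local_dir="./prompts")
        resolved[voice] = (local, transcript)
    return resolved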
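A design note on get_tts: because synthesize_speech re-instantiates IndicParlerTTS whenever the selection changes, every model switch reloads weights from disk. A cached variant, sketched here as an alternative rather than as part of this commit, keeps one instance per model type at the cost of holding both models in memory:

_tts_cache = {}

def get_tts_cached(model_type: str):
    # Load each model at most once; subsequent switches reuse the resident instance
    if model_type not in _tts_cache:
        _tts_cache[model_type] = IndicParlerTTS(model_type=model_type)
    return _tts_cache[model_type]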