andhikagg committed on
Commit
df977ab
·
verified ·
1 Parent(s): c5f3698

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -592
app.py CHANGED
@@ -9,15 +9,12 @@ import librosa
9
  import torch
10
  import asyncio
11
  import edge_tts
12
- import yt_dlp
13
- import ffmpeg
14
- import subprocess
15
  import sys
16
  import io
17
  import wave
18
  from datetime import datetime
19
  from fairseq import checkpoint_utils
20
- from fairseq.data.dictionary import Dictionary # Import the Dictionary class
21
  from lib.infer_pack.models import (
22
  SynthesizerTrnMs256NSFsid,
23
  SynthesizerTrnMs256NSFsid_nono,
@@ -26,672 +23,246 @@ from lib.infer_pack.models import (
26
  )
27
  from vc_infer_pipeline import VC
28
  from config import Config
 
29
  config = Config()
30
  logging.getLogger("numba").setLevel(logging.WARNING)
31
 
32
- spaces = True
33
- # os.getenv("SYSTEM") == "spaces"
34
-
35
- force_support = None
36
- if config.unsupported is False:
37
- if config.device == "mps" or config.device == "cpu":
38
- force_support = False
39
- else:
40
- force_support = True
41
-
42
- audio_mode = []
43
- f0method_mode = []
44
- f0method_info = ""
45
 
46
- if force_support is False or spaces is True:
47
- if spaces is True:
48
- audio_mode = ["Upload audio", "TTS Audio"]
49
- else:
50
- audio_mode = ["Input path", "Upload audio", "TTS Audio"]
51
- f0method_mode = ["pm", "harvest"]
52
- f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better). (Default: PM)"
53
  else:
54
- audio_mode = ["Input path", "Upload audio", "Youtube", "TTS Audio"]
55
- f0method_mode = ["pm", "harvest", "crepe"]
56
- f0method_info = "PM is fast, Harvest is good but extremely slow, Rvmpe is alternative to harvest (might be better), and Crepe effect is good but requires GPU (Default: PM)"
57
 
 
 
 
58
  if os.path.isfile("rmvpe.pt"):
59
  f0method_mode.insert(2, "rmvpe")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
62
  def vc_fn(
63
- vc_audio_mode,
64
- vc_input,
65
- vc_upload,
66
- tts_text,
67
- tts_voice,
68
- f0_up_key,
69
- f0_method,
70
- index_rate,
71
- filter_radius,
72
- resample_sr,
73
- rms_mix_rate,
74
- protect,
75
  ):
 
 
76
  try:
77
- logs = []
78
- print(f"Converting using {model_name}...")
79
  logs.append(f"Converting using {model_name}...")
80
  yield "\n".join(logs), None
 
 
 
 
81
 
82
- if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
83
- audio, sr = librosa.load(vc_input, sr=16000, mono=True)
84
- elif vc_audio_mode == "Upload audio":
85
- if vc_upload is None:
86
- return "You need to upload an audio", None
87
- sampling_rate, audio = vc_upload
88
- duration = audio.shape[0] / sampling_rate
89
- if duration > 20 and spaces:
90
- return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
91
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
92
- if len(audio.shape) > 1:
93
- audio = librosa.to_mono(audio.transpose(1, 0))
94
- if sampling_rate != 16000:
95
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
96
- elif vc_audio_mode == "TTS Audio":
97
- if len(tts_text) > 100 and spaces:
98
- return "Text is too long", None
99
- if tts_text is None or tts_voice is None:
100
- return "You need to enter text and select a voice", None
101
- asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
102
- audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
103
- vc_input = "tts.mp3"
104
-
105
- logs.append(f"Success converting from mode...")
106
  yield "\n".join(logs), None
107
 
108
  times = [0, 0, 0]
109
  f0_up_key = int(f0_up_key)
 
110
  audio_opt = vc.pipeline(
111
- hubert_model,
112
- net_g,
113
- 0,
114
- audio,
115
- vc_input,
116
- times,
117
- f0_up_key,
118
- f0_method,
119
- file_index,
120
- # file_big_npy,
121
- index_rate,
122
- if_f0,
123
- filter_radius,
124
- tgt_sr,
125
- resample_sr,
126
- rms_mix_rate,
127
- version,
128
- protect,
129
- f0_file=None,
130
  )
 
131
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
132
  print(f"{model_name} | {info}")
133
- logs.append(f"Successfully Convert {model_name}\n{info}")
134
- # return "\n".join(logs), (tgt_sr, audio_opt)
135
  yield "\n".join(logs), (tgt_sr, audio_opt)
136
- # except GeneratorExit:
137
- # print("Generator was closed during processing")
138
- # return
139
- except:
140
- info = traceback.format_exc()
141
- print(info)
142
- # yield info, None
143
- return info, None
 
 
144
  return vc_fn
145
 
 
146
  def load_model():
147
  categories = []
148
  if os.path.isfile("weights/folder_info.json"):
149
  with open("weights/folder_info.json", "r", encoding="utf-8") as f:
150
  folder_info = json.load(f)
151
  for category_name, category_info in folder_info.items():
152
- if not category_info['enable']:
153
  continue
154
- category_title = category_info['title']
155
- category_folder = category_info['folder_path']
156
- description = category_info['description']
157
  models = []
158
  with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
159
  models_info = json.load(f)
160
  for character_name, info in models_info.items():
161
- if not info['enable']:
162
  continue
163
- model_title = info['title']
164
- model_name = info['model_path']
165
- model_author = info.get("author", None)
166
  model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
167
  model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
168
  cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
169
  tgt_sr = cpt["config"][-1]
170
- cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk
171
  if_f0 = cpt.get("f0", 1)
172
  version = cpt.get("version", "v1")
173
  if version == "v1":
174
- if if_f0 == 1:
175
- net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
176
- else:
177
- net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
178
- model_version = "V1"
179
  elif version == "v2":
180
- if if_f0 == 1:
181
- net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
182
- else:
183
- net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
184
- model_version = "V2"
185
  del net_g.enc_q
186
  print(net_g.load_state_dict(cpt["weight"], strict=False))
187
  net_g.eval().to(config.device)
188
- if config.is_half:
189
- net_g = net_g.half()
190
- else:
191
- net_g = net_g.float()
192
  vc = VC(tgt_sr, config)
193
- print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
194
- models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
195
  categories.append([category_title, category_folder, description, models])
196
- else:
197
- categories = []
198
  return categories
199
 
200
- def download_audio(url, audio_provider):
201
- logs = []
202
- if url == "":
203
- raise gr.Error("URL Required!")
204
- return "URL Required"
205
- if not os.path.exists("dl_audio"):
206
- os.mkdir("dl_audio")
207
- if audio_provider == "Youtube":
208
- logs.append("Downloading the audio...")
209
- yield None, "\n".join(logs)
210
- ydl_opts = {
211
- 'noplaylist': True,
212
- 'format': 'bestaudio/best',
213
- 'postprocessors': [{
214
- 'key': 'FFmpegExtractAudio',
215
- 'preferredcodec': 'wav',
216
- }],
217
- "outtmpl": 'dl_audio/audio',
218
- }
219
- audio_path = "dl_audio/audio.wav"
220
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
221
- ydl.download([url])
222
- logs.append("Download Complete.")
223
- yield audio_path, "\n".join(logs)
224
-
225
- def cut_vocal_and_inst(split_model):
226
- logs = []
227
- logs.append("Starting the audio splitting process...")
228
- yield "\n".join(logs), None, None, None, None
229
- command = f"demucs --two-stems=vocals -n {split_model} dl_audio/audio.wav -o output"
230
- result = subprocess.Popen(command.split(), stdout=subprocess.PIPE, text=True)
231
- for line in result.stdout:
232
- logs.append(line)
233
- yield "\n".join(logs), None, None, None, None
234
- print(result.stdout)
235
- vocal = f"output/{split_model}/audio/vocals.wav"
236
- inst = f"output/{split_model}/audio/no_vocals.wav"
237
- logs.append("Audio splitting complete.")
238
- yield "\n".join(logs), vocal, inst, vocal
239
-
240
- def combine_vocal_and_inst(audio_data, vocal_volume, inst_volume, split_model):
241
- if not os.path.exists("output/result"):
242
- os.mkdir("output/result")
243
- vocal_path = "output/result/output.wav"
244
- output_path = "output/result/combine.mp3"
245
- inst_path = f"output/{split_model}/audio/no_vocals.wav"
246
- with wave.open(vocal_path, "w") as wave_file:
247
- wave_file.setnchannels(1)
248
- wave_file.setsampwidth(2)
249
- wave_file.setframerate(audio_data[0])
250
- wave_file.writeframes(audio_data[1].tobytes())
251
- command = f'ffmpeg -y -i {inst_path} -i {vocal_path} -filter_complex [0:a]volume={inst_volume}[i];[1:a]volume={vocal_volume}[v];[i][v]amix=inputs=2:duration=longest[a] -map [a] -b:a 320k -c:a libmp3lame {output_path}'
252
- result = subprocess.run(command.split(), stdout=subprocess.PIPE)
253
- print(result.stdout.decode())
254
- return output_path
255
-
256
  def load_hubert():
257
  global hubert_model
258
  torch.serialization.add_safe_globals([Dictionary])
259
- models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
260
- ["hubert_base.pt"],
261
- suffix="",
262
- )
263
- hubert_model = models[0]
264
- hubert_model = hubert_model.to(config.device)
265
- if config.is_half:
266
- hubert_model = hubert_model.half()
267
- else:
268
- hubert_model = hubert_model.float()
269
  hubert_model.eval()
270
 
 
271
  def change_audio_mode(vc_audio_mode):
272
- if vc_audio_mode == "Input path":
273
- return (
274
- # Input & Upload
275
- gr.Textbox.update(visible=True),
276
- gr.Checkbox.update(visible=False),
277
- gr.Audio.update(visible=False),
278
- # Youtube
279
- gr.Dropdown.update(visible=False),
280
- gr.Textbox.update(visible=False),
281
- gr.Textbox.update(visible=False),
282
- gr.Button.update(visible=False),
283
- # Splitter
284
- gr.Dropdown.update(visible=False),
285
- gr.Textbox.update(visible=False),
286
- gr.Button.update(visible=False),
287
- gr.Audio.update(visible=False),
288
- gr.Audio.update(visible=False),
289
- gr.Audio.update(visible=False),
290
- gr.Slider.update(visible=False),
291
- gr.Slider.update(visible=False),
292
- gr.Audio.update(visible=False),
293
- gr.Button.update(visible=False),
294
- # TTS
295
- gr.Textbox.update(visible=False),
296
- gr.Dropdown.update(visible=False)
297
- )
298
- elif vc_audio_mode == "Upload audio":
299
- return (
300
- # Input & Upload
301
- gr.Textbox.update(visible=False),
302
- gr.Checkbox.update(visible=True),
303
- gr.Audio.update(visible=True),
304
- # Youtube
305
- gr.Dropdown.update(visible=False),
306
- gr.Textbox.update(visible=False),
307
- gr.Textbox.update(visible=False),
308
- gr.Button.update(visible=False),
309
- # Splitter
310
- gr.Dropdown.update(visible=False),
311
- gr.Textbox.update(visible=False),
312
- gr.Button.update(visible=False),
313
- gr.Audio.update(visible=False),
314
- gr.Audio.update(visible=False),
315
- gr.Audio.update(visible=False),
316
- gr.Slider.update(visible=False),
317
- gr.Slider.update(visible=False),
318
- gr.Audio.update(visible=False),
319
- gr.Button.update(visible=False),
320
- # TTS
321
- gr.Textbox.update(visible=False),
322
- gr.Dropdown.update(visible=False)
323
- )
324
- elif vc_audio_mode == "Youtube":
325
- return (
326
- # Input & Upload
327
- gr.Textbox.update(visible=False),
328
- gr.Checkbox.update(visible=False),
329
- gr.Audio.update(visible=False),
330
- # Youtube
331
- gr.Dropdown.update(visible=True),
332
- gr.Textbox.update(visible=True),
333
- gr.Textbox.update(visible=True),
334
- gr.Button.update(visible=True),
335
- # Splitter
336
- gr.Dropdown.update(visible=True),
337
- gr.Textbox.update(visible=True),
338
- gr.Button.update(visible=True),
339
- gr.Audio.update(visible=True),
340
- gr.Audio.update(visible=True),
341
- gr.Audio.update(visible=True),
342
- gr.Slider.update(visible=True),
343
- gr.Slider.update(visible=True),
344
- gr.Audio.update(visible=True),
345
- gr.Button.update(visible=True),
346
- # TTS
347
- gr.Textbox.update(visible=False),
348
- gr.Dropdown.update(visible=False)
349
- )
350
- elif vc_audio_mode == "TTS Audio":
351
- return (
352
- # Input & Upload
353
- gr.Textbox.update(visible=False),
354
- gr.Checkbox.update(visible=False),
355
- gr.Audio.update(visible=False),
356
- # Youtube
357
- gr.Dropdown.update(visible=False),
358
- gr.Textbox.update(visible=False),
359
- gr.Textbox.update(visible=False),
360
- gr.Button.update(visible=False),
361
- # Splitter
362
- gr.Dropdown.update(visible=False),
363
- gr.Textbox.update(visible=False),
364
- gr.Button.update(visible=False),
365
- gr.Audio.update(visible=False),
366
- gr.Audio.update(visible=False),
367
- gr.Audio.update(visible=False),
368
- gr.Slider.update(visible=False),
369
- gr.Slider.update(visible=False),
370
- gr.Audio.update(visible=False),
371
- gr.Button.update(visible=False),
372
- # TTS
373
- gr.Textbox.update(visible=True),
374
- gr.Dropdown.update(visible=True)
375
- )
376
 
377
  def use_microphone(microphone):
378
- if microphone == True:
379
- return gr.Audio.update(source="microphone")
380
- else:
381
- return gr.Audio.update(source="upload")
382
 
383
  if __name__ == '__main__':
384
  load_hubert()
385
  categories = load_model()
386
  tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
387
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
 
388
  with gr.Blocks(theme=gr.themes.Base()) as app:
389
- gr.Markdown(
390
- "<div align='center'>\n\n"+
391
- "# RVC Blue Archive\n\n"+
392
- "### Recommended to use Google Colab to use other character and feature.\n\n"+
393
- "[![Colab](https://img.shields.io/badge/Colab-RVC%20Blue%20Archives-blue?style=for-the-badge&logo=googlecolab)](https://colab.research.google.com/drive/19Eo2xO7EKcMqvJDc_yXrWmixuNA4NtEU)\n\n"+
394
- "</div>\n\n"+
395
- "[![Repository](https://img.shields.io/badge/Github-Multi%20Model%20RVC%20Inference-blue?style=for-the-badge&logo=github)](https://github.com/ArkanDash/Multi-Model-RVC-Inference)\n\n"+
396
- "</div>"
397
- )
398
- if categories == []:
399
- gr.Markdown(
400
- "<div align='center'>\n\n"+
401
- "## No model found, please add the model into weights folder\n\n"+
402
- "</div>"
403
- )
404
  for (folder_title, folder, description, models) in categories:
405
  with gr.TabItem(folder_title):
406
  if description:
407
- gr.Markdown(f"### <center> {description}")
408
  with gr.Tabs():
409
  if not models:
410
- gr.Markdown("# <center> No Model Loaded.")
411
- gr.Markdown("## <center> Please add the model or fix your model path.")
412
  continue
413
  for (name, title, author, cover, model_version, vc_fn) in models:
414
  with gr.TabItem(name):
415
  with gr.Row():
416
  gr.Markdown(
417
- '<div align="center">'
418
- f'<div>{title}</div>\n'+
419
- f'<div>RVC {model_version} Model</div>\n'+
420
- (f'<div>Model author: {author}</div>' if author else "")+
421
- (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
422
- '</div>'
423
  )
424
  with gr.Row():
425
- if spaces is False:
426
- with gr.TabItem("Input"):
427
- with gr.Row():
428
- with gr.Column():
429
- vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
430
- # Input
431
- vc_input = gr.Textbox(label="Input audio path", visible=False)
432
- # Upload
433
- vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
434
- vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
435
- # Youtube
436
- vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
437
- vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
438
- vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False)
439
- vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
440
- vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
441
- # TTS
442
- tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False)
443
- tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
444
- with gr.Column():
445
- vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
446
- vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False)
447
- vc_split = gr.Button("Split Audio", variant="primary", visible=False)
448
- vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
449
- vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
450
- with gr.TabItem("Convert"):
451
- with gr.Row():
452
- with gr.Column():
453
- vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
454
- f0method0 = gr.Radio(
455
- label="Pitch extraction algorithm",
456
- info=f0method_info,
457
- choices=f0method_mode,
458
- value="pm",
459
- interactive=True
460
- )
461
- index_rate1 = gr.Slider(
462
- minimum=0,
463
- maximum=1,
464
- label="Retrieval feature ratio",
465
- info="(Default: 0.7)",
466
- value=0.7,
467
- interactive=True,
468
- )
469
- filter_radius0 = gr.Slider(
470
- minimum=0,
471
- maximum=7,
472
- label="Apply Median Filtering",
473
- info="The value represents the filter radius and can reduce breathiness.",
474
- value=3,
475
- step=1,
476
- interactive=True,
477
- )
478
- resample_sr0 = gr.Slider(
479
- minimum=0,
480
- maximum=48000,
481
- label="Resample the output audio",
482
- info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
483
- value=0,
484
- step=1,
485
- interactive=True,
486
- )
487
- rms_mix_rate0 = gr.Slider(
488
- minimum=0,
489
- maximum=1,
490
- label="Volume Envelope",
491
- info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
492
- value=1,
493
- interactive=True,
494
- )
495
- protect0 = gr.Slider(
496
- minimum=0,
497
- maximum=0.5,
498
- label="Voice Protection",
499
- info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
500
- value=0.5,
501
- step=0.01,
502
- interactive=True,
503
- )
504
- with gr.Column():
505
- vc_log = gr.Textbox(label="Output Information", interactive=False)
506
- vc_output = gr.Audio(label="Output Audio", interactive=False)
507
- vc_convert = gr.Button("Convert", variant="primary")
508
- vc_vocal_volume = gr.Slider(
509
- minimum=0,
510
- maximum=10,
511
- label="Vocal volume",
512
- value=1,
513
- interactive=True,
514
- step=1,
515
- info="Adjust vocal volume (Default: 1}",
516
- visible=False
517
- )
518
- vc_inst_volume = gr.Slider(
519
- minimum=0,
520
- maximum=10,
521
- label="Instrument volume",
522
- value=1,
523
- interactive=True,
524
- step=1,
525
- info="Adjust instrument volume (Default: 1}",
526
- visible=False
527
- )
528
- vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
529
- vc_combine = gr.Button("Combine",variant="primary", visible=False)
530
- else:
531
- with gr.Column():
532
- vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Upload audio")
533
- # Input
534
- vc_input = gr.Textbox(label="Input audio path", visible=False)
535
- # Upload
536
- vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True, interactive=True)
537
- vc_upload = gr.Audio(label="Upload audio file", source="upload", visible=True, interactive=True)
538
- # Youtube
539
- vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
540
- vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
541
- vc_log_yt = gr.Textbox(label="Output Information", visible=False, interactive=False)
542
- vc_download_button = gr.Button("Download Audio", variant="primary", visible=False)
543
- vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
544
- # Splitter
545
- vc_split_model = gr.Dropdown(label="Splitter Model", choices=["hdemucs_mmi", "htdemucs", "htdemucs_ft", "mdx", "mdx_q", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
546
- vc_split_log = gr.Textbox(label="Output Information", visible=False, interactive=False)
547
- vc_split = gr.Button("Split Audio", variant="primary", visible=False)
548
- vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
549
- vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
550
- # TTS
551
- tts_text = gr.Textbox(label="TTS text", info="Text to speech input", visible=False)
552
- tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=False, allow_custom_value=False, value="en-US-AnaNeural-Female")
553
- with gr.Column():
554
- vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
555
- f0method0 = gr.Radio(
556
- label="Pitch extraction algorithm",
557
- info=f0method_info,
558
- choices=f0method_mode,
559
- value="pm",
560
- interactive=True
561
- )
562
- index_rate1 = gr.Slider(
563
- minimum=0,
564
- maximum=1,
565
- label="Retrieval feature ratio",
566
- info="(Default: 0.7)",
567
- value=0.7,
568
- interactive=True,
569
- )
570
- filter_radius0 = gr.Slider(
571
- minimum=0,
572
- maximum=7,
573
- label="Apply Median Filtering",
574
- info="The value represents the filter radius and can reduce breathiness.",
575
- value=3,
576
- step=1,
577
- interactive=True,
578
- )
579
- resample_sr0 = gr.Slider(
580
- minimum=0,
581
- maximum=48000,
582
- label="Resample the output audio",
583
- info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling",
584
- value=0,
585
- step=1,
586
- interactive=True,
587
- )
588
- rms_mix_rate0 = gr.Slider(
589
- minimum=0,
590
- maximum=1,
591
- label="Volume Envelope",
592
- info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
593
- value=1,
594
- interactive=True,
595
- )
596
- protect0 = gr.Slider(
597
- minimum=0,
598
- maximum=0.5,
599
- label="Voice Protection",
600
- info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
601
- value=0.5,
602
- step=0.01,
603
- interactive=True,
604
- )
605
- with gr.Column():
606
- vc_log = gr.Textbox(label="Output Information", interactive=False)
607
- vc_output = gr.Audio(label="Output Audio", interactive=False)
608
- vc_convert = gr.Button("Convert", variant="primary")
609
- vc_vocal_volume = gr.Slider(
610
- minimum=0,
611
- maximum=10,
612
- label="Vocal volume",
613
- value=1,
614
- interactive=True,
615
- step=1,
616
- info="Adjust vocal volume (Default: 1}",
617
- visible=False
618
- )
619
- vc_inst_volume = gr.Slider(
620
- minimum=0,
621
- maximum=10,
622
- label="Instrument volume",
623
- value=1,
624
- interactive=True,
625
- step=1,
626
- info="Adjust instrument volume (Default: 1}",
627
- visible=False
628
- )
629
- vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
630
- vc_combine = gr.Button("Combine",variant="primary", visible=False)
631
- vc_convert.click(
632
- fn=vc_fn,
633
- inputs=[
634
- vc_audio_mode,
635
- vc_input,
636
- vc_upload,
637
- tts_text,
638
- tts_voice,
639
- vc_transform0,
640
- f0method0,
641
- index_rate1,
642
- filter_radius0,
643
- resample_sr0,
644
- rms_mix_rate0,
645
- protect0,
646
- ],
647
- outputs=[vc_log ,vc_output]
648
- )
649
- vc_download_button.click(
650
- fn=download_audio,
651
- inputs=[vc_link, vc_download_audio],
652
- outputs=[vc_audio_preview, vc_log_yt]
653
- )
654
- vc_split.click(
655
- fn=cut_vocal_and_inst,
656
- inputs=[vc_split_model],
657
- outputs=[vc_split_log, vc_vocal_preview, vc_inst_preview, vc_input]
658
- )
659
- vc_combine.click(
660
- fn=combine_vocal_and_inst,
661
- inputs=[vc_output, vc_vocal_volume, vc_inst_volume, vc_split_model],
662
- outputs=[vc_combined_output]
663
- )
664
- vc_microphone_mode.change(
665
- fn=use_microphone,
666
- inputs=vc_microphone_mode,
667
- outputs=vc_upload
668
- )
669
- vc_audio_mode.change(
670
- fn=change_audio_mode,
671
- inputs=[vc_audio_mode],
672
- outputs=[
673
- vc_input,
674
- vc_microphone_mode,
675
- vc_upload,
676
- vc_download_audio,
677
- vc_link,
678
- vc_log_yt,
679
- vc_download_button,
680
- vc_split_model,
681
- vc_split_log,
682
- vc_split,
683
- vc_audio_preview,
684
- vc_vocal_preview,
685
- vc_inst_preview,
686
- vc_vocal_volume,
687
- vc_inst_volume,
688
- vc_combined_output,
689
- vc_combine,
690
- tts_text,
691
- tts_voice
692
- ]
693
- )
694
- # if spaces is True:
695
- # app.queue(max_size=20, api_open=config.api).launch(allowed_paths=["/"])
696
- # else:
697
- app.queue(max_size=20, api_open=config.api).launch(allowed_paths=["/"], share=False, server_name="0.0.0.0", server_port=7860)
 
9
  import torch
10
  import asyncio
11
  import edge_tts
 
 
 
12
  import sys
13
  import io
14
  import wave
15
  from datetime import datetime
16
  from fairseq import checkpoint_utils
17
+ from fairseq.data.dictionary import Dictionary
18
  from lib.infer_pack.models import (
19
  SynthesizerTrnMs256NSFsid,
20
  SynthesizerTrnMs256NSFsid_nono,
 
23
  )
24
  from vc_infer_pipeline import VC
25
  from config import Config
26
+
27
  config = Config()
28
  logging.getLogger("numba").setLevel(logging.WARNING)
29
 
30
+ spaces = True
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ # Set up the audio modes based on the environment
33
+ if spaces:
34
+ audio_mode = ["Upload audio", "TTS Audio"]
 
 
 
 
35
  else:
36
+ audio_mode = ["Input path", "Upload audio", "TTS Audio"]
 
 
37
 
38
+ # Set up the F0 methods
39
+ f0method_mode = ["pm", "harvest"]
40
+ f0method_info = "PM is fast, Harvest is good but can be slow on CPU."
41
  if os.path.isfile("rmvpe.pt"):
42
  f0method_mode.insert(2, "rmvpe")
43
+ f0method_info = "PM is fast, Harvest is good but slow on CPU, Rvmpe is a good alternative."
44
+
45
+ # Improved helper function for loading audio
46
+ def _load_audio_input(vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice, spaces_limit=20):
47
+ temp_file = None
48
+ if vc_audio_mode == "Input path" and vc_input:
49
+ print(f"Loading audio from path: {vc_input}")
50
+ audio, sr = librosa.load(vc_input, sr=16000, mono=True)
51
+ return audio, sr, None
52
+
53
+ if vc_audio_mode == "Upload audio":
54
+ if vc_upload is None:
55
+ raise ValueError("You need to upload an audio file.")
56
+ sampling_rate, audio = vc_upload
57
+ duration = len(audio) / sampling_rate
58
+ print(f"Loading uploaded audio. Original SR: {sampling_rate}, Duration: {duration:.2f}s")
59
+ if duration > spaces_limit and spaces:
60
+ raise ValueError(f"Audio is too long (> {spaces_limit}s). Please upload a shorter file.")
61
+
62
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
63
+ if len(audio.shape) > 1:
64
+ audio = librosa.to_mono(audio.transpose(1, 0))
65
+ if sampling_rate != 16000:
66
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
67
+ return audio, 16000, None
68
 
69
+ if vc_audio_mode == "TTS Audio":
70
+ if not tts_text or not tts_voice:
71
+ raise ValueError("You need to enter text and select a voice.")
72
+ if len(tts_text) > 100 and spaces:
73
+ raise ValueError("Text is too long (> 100 characters).")
74
+
75
+ temp_file = "tts.mp3"
76
+ print(f"Generating TTS audio for text: '{tts_text[:50]}...'")
77
+ asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(temp_file))
78
+ audio, sr = librosa.load(temp_file, sr=16000, mono=True)
79
+ return audio, sr, temp_file
80
+
81
+ raise ValueError("Invalid audio mode or missing input.")
82
+
83
+ # Main conversion function (fixed)
84
  def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
85
  def vc_fn(
86
+ vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice,
87
+ f0_up_key, f0_method, index_rate, filter_radius,
88
+ resample_sr, rms_mix_rate, protect,
 
 
 
 
 
 
 
 
 
89
  ):
90
+ logs = []
91
+ temp_audio_file = None
92
  try:
 
 
93
  logs.append(f"Converting using {model_name}...")
94
  yield "\n".join(logs), None
95
+
96
+ audio, sr, temp_audio_file = _load_audio_input(
97
+ vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice
98
+ )
99
 
100
+ logs.append("Audio successfully loaded.")
101
+ logs.append(f"Starting RVC pipeline with F0 method: {f0_method}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  yield "\n".join(logs), None
103
 
104
  times = [0, 0, 0]
105
  f0_up_key = int(f0_up_key)
106
+
107
  audio_opt = vc.pipeline(
108
+ hubert_model, net_g, 0, audio, vc_input if vc_input else temp_audio_file,
109
+ times, f0_up_key, f0_method, file_index, index_rate,
110
+ if_f0, filter_radius, tgt_sr, resample_sr,
111
+ rms_mix_rate, version, protect, f0_file=None,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
113
+
114
  info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
115
  print(f"{model_name} | {info}")
116
+ logs.append(f"Successfully Converted!\n{info}")
 
117
  yield "\n".join(logs), (tgt_sr, audio_opt)
118
+
119
+ except Exception as e:
120
+ error_info = traceback.format_exc()
121
+ print(f"An error occurred: {error_info}")
122
+ return str(e), None
123
+
124
+ finally:
125
+ if temp_audio_file and os.path.exists(temp_audio_file):
126
+ os.remove(temp_audio_file)
127
+
128
  return vc_fn
129
 
130
+ # Model loading function (unchanged)
131
  def load_model():
132
  categories = []
133
  if os.path.isfile("weights/folder_info.json"):
134
  with open("weights/folder_info.json", "r", encoding="utf-8") as f:
135
  folder_info = json.load(f)
136
  for category_name, category_info in folder_info.items():
137
+ if not category_info.get('enable', True):
138
  continue
139
+ category_title, category_folder, description = category_info['title'], category_info['folder_path'], category_info['description']
 
 
140
  models = []
141
  with open(f"weights/{category_folder}/model_info.json", "r", encoding="utf-8") as f:
142
  models_info = json.load(f)
143
  for character_name, info in models_info.items():
144
+ if not info.get('enable', True):
145
  continue
146
+ model_title, model_name, model_author = info['title'], info['model_path'], info.get("author")
 
 
147
  model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
148
  model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
149
  cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
150
  tgt_sr = cpt["config"][-1]
151
+ cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
152
  if_f0 = cpt.get("f0", 1)
153
  version = cpt.get("version", "v1")
154
  if version == "v1":
155
+ net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) if if_f0 == 1 else SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
 
 
 
 
156
  elif version == "v2":
157
+ net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) if if_f0 == 1 else SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
 
 
 
 
158
  del net_g.enc_q
159
  print(net_g.load_state_dict(cpt["weight"], strict=False))
160
  net_g.eval().to(config.device)
161
+ net_g = net_g.half() if config.is_half else net_g.float()
 
 
 
162
  vc = VC(tgt_sr, config)
163
+ print(f"Model loaded: {character_name} ({version})")
164
+ models.append((character_name, model_title, model_author, model_cover, version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
165
  categories.append([category_title, category_folder, description, models])
 
 
166
  return categories
167
 
168
+ # HuBERT loading function (unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def load_hubert():
170
  global hubert_model
171
  torch.serialization.add_safe_globals([Dictionary])
172
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"], suffix="",)
173
+ hubert_model = models[0].to(config.device)
174
+ hubert_model = hubert_model.half() if config.is_half else hubert_model.float()
 
 
 
 
 
 
 
175
  hubert_model.eval()
176
 
177
+ # Function that updates the UI based on the audio mode (simplified)
178
  def change_audio_mode(vc_audio_mode):
179
+ is_input_path = vc_audio_mode == "Input path"
180
+ is_upload = vc_audio_mode == "Upload audio"
181
+ is_tts = vc_audio_mode == "TTS Audio"
182
+ return (
183
+ gr.Textbox.update(visible=is_input_path),
184
+ gr.Checkbox.update(visible=is_upload),
185
+ gr.Audio.update(visible=is_upload),
186
+ gr.Textbox.update(visible=is_tts),
187
+ gr.Dropdown.update(visible=is_tts)
188
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  def use_microphone(microphone):
191
+ return gr.Audio.update(source="microphone" if microphone else "upload")
 
 
 
192
 
193
  if __name__ == '__main__':
194
  load_hubert()
195
  categories = load_model()
196
  tts_voice_list = asyncio.new_event_loop().run_until_complete(edge_tts.list_voices())
197
  voices = [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list]
198
+
199
  with gr.Blocks(theme=gr.themes.Base()) as app:
200
+ gr.Markdown("# RVC Blue Archive\n### Voice Conversion App")
201
+ if not categories:
202
+ gr.Markdown("## No model found. Please add models to the 'weights' folder.")
203
+
 
 
 
 
 
 
 
 
 
 
 
204
  for (folder_title, folder, description, models) in categories:
205
  with gr.TabItem(folder_title):
206
  if description:
207
+ gr.Markdown(f"### <center>{description}</center>")
208
  with gr.Tabs():
209
  if not models:
210
+ gr.Markdown("## <center>No models loaded in this category.</center>")
 
211
  continue
212
  for (name, title, author, cover, model_version, vc_fn) in models:
213
  with gr.TabItem(name):
214
  with gr.Row():
215
  gr.Markdown(
216
+ f'<div align="center">'
217
+ f'<div>{title}</div>\n'
218
+ f'<div>RVC {model_version} Model</div>\n'
219
+ + (f'<div>Author: {author}</div>' if author else "")
220
+ + (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")
221
+ + '</div>'
222
  )
223
  with gr.Row():
224
+ with gr.Column():
225
+ vc_audio_mode = gr.Dropdown(label="Input Mode", choices=audio_mode, value="Upload audio")
226
+ vc_input = gr.Textbox(label="Input Audio Path", visible=False)
227
+ vc_microphone_mode = gr.Checkbox(label="Use Microphone", value=False, visible=True)
228
+ vc_upload = gr.Audio(label="Upload Audio File", source="upload", visible=True)
229
+ tts_text = gr.Textbox(label="TTS Text", info="Text to speech input", visible=False)
230
+ tts_voice = gr.Dropdown(label="Edge-TTS Speaker", choices=voices, visible=False, value="en-US-AnaNeural-Female")
231
+
232
+ with gr.Column():
233
+ vc_transform0 = gr.Number(label="Transpose", value=0, info='e.g., 12 for male to female')
234
+ f0method0 = gr.Radio(label="Pitch Extraction Algorithm", info=f0method_info, choices=f0method_mode, value="pm")
235
+ index_rate1 = gr.Slider(minimum=0, maximum=1, label="Retrieval Feature Ratio", value=0.7)
236
+ filter_radius0 = gr.Slider(minimum=0, maximum=7, label="Median Filtering", value=3, step=1, info="Reduces breathiness")
237
+ resample_sr0 = gr.Slider(minimum=0, maximum=48000, label="Output Resample Rate", value=0, step=1, info="0 for no resampling")
238
+ rms_mix_rate0 = gr.Slider(minimum=0, maximum=1, label="Volume Envelope Ratio", value=1)
239
+ protect0 = gr.Slider(minimum=0, maximum=0.5, label="Voice Protection", value=0.5, step=0.01, info="Protects voiceless consonants")
240
+
241
+ with gr.Column():
242
+ vc_log = gr.Textbox(label="Output Information", interactive=False)
243
+ vc_output = gr.Audio(label="Output Audio", interactive=False)
244
+ vc_convert = gr.Button("Convert", variant="primary")
245
+
246
+ vc_convert.click(
247
+ fn=vc_fn,
248
+ inputs=[
249
+ vc_audio_mode, vc_input, vc_upload, tts_text, tts_voice,
250
+ vc_transform0, f0method0, index_rate1, filter_radius0,
251
+ resample_sr0, rms_mix_rate0, protect0,
252
+ ],
253
+ outputs=[vc_log, vc_output]
254
+ )
255
+
256
+ vc_audio_mode.change(
257
+ fn=change_audio_mode,
258
+ inputs=[vc_audio_mode],
259
+ outputs=[vc_input, vc_microphone_mode, vc_upload, tts_text, tts_voice]
260
+ )
261
+
262
+ vc_microphone_mode.change(
263
+ fn=use_microphone,
264
+ inputs=vc_microphone_mode,
265
+ outputs=vc_upload
266
+ )
267
+
268
+ app.queue(max_size=20).launch(share=False, server_name="0.0.0.0", server_port=7860)