Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -131,7 +131,7 @@ if os.path.isfile("rmvpe.pt"):
|
|
| 131 |
# yield info, None
|
| 132 |
# return vc_fn
|
| 133 |
|
| 134 |
-
def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
| 135 |
def vc_fn(
|
| 136 |
vc_audio_mode,
|
| 137 |
vc_input,
|
|
@@ -147,38 +147,28 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
| 147 |
protect,
|
| 148 |
):
|
| 149 |
try:
|
| 150 |
-
|
| 151 |
-
print(f"Converting using {model_name}...")
|
| 152 |
-
logs.append(f"Converting using {model_name}...")
|
| 153 |
-
yield "\n".join(logs), None
|
| 154 |
-
|
| 155 |
-
# === FIX: "or" logic ===
|
| 156 |
-
if (vc_audio_mode == "Input path" or vc_audio_mode == "Youtube") and vc_input != "":
|
| 157 |
audio, sr = librosa.load(vc_input, sr=16000, mono=True)
|
| 158 |
-
|
| 159 |
elif vc_audio_mode == "Upload audio":
|
| 160 |
if vc_upload is None:
|
| 161 |
return "You need to upload an audio", None
|
| 162 |
sampling_rate, audio = vc_upload
|
| 163 |
duration = audio.shape[0] / sampling_rate
|
| 164 |
-
if duration > 20 and
|
| 165 |
return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
|
| 166 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
| 167 |
if len(audio.shape) > 1:
|
| 168 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
| 169 |
if sampling_rate != 16000:
|
| 170 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
| 171 |
-
|
| 172 |
elif vc_audio_mode == "TTS Audio":
|
| 173 |
-
if len(tts_text) > 100 and
|
| 174 |
return "Text is too long", None
|
| 175 |
if tts_text is None or tts_voice is None:
|
| 176 |
return "You need to enter text and select a voice", None
|
| 177 |
asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
|
| 178 |
audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
|
| 179 |
vc_input = "tts.mp3"
|
| 180 |
-
|
| 181 |
-
# Continue with the pipeline
|
| 182 |
times = [0, 0, 0]
|
| 183 |
f0_up_key = int(f0_up_key)
|
| 184 |
audio_opt = vc.pipeline(
|
|
@@ -191,6 +181,7 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
| 191 |
f0_up_key,
|
| 192 |
f0_method,
|
| 193 |
file_index,
|
|
|
|
| 194 |
index_rate,
|
| 195 |
if_f0,
|
| 196 |
filter_radius,
|
|
@@ -202,18 +193,13 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
|
|
| 202 |
f0_file=None,
|
| 203 |
)
|
| 204 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
| 205 |
-
print(f"{
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
except GeneratorExit:
|
| 210 |
-
# This is the normal signal raised when the user stops the process midway
|
| 211 |
-
raise
|
| 212 |
-
|
| 213 |
-
except Exception:
|
| 214 |
info = traceback.format_exc()
|
| 215 |
print(info)
|
| 216 |
-
|
|
|
|
| 217 |
|
| 218 |
def load_model():
|
| 219 |
categories = []
|
|
|
|
| 131 |
# yield info, None
|
| 132 |
# return vc_fn
|
| 133 |
|
| 134 |
+
def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
|
| 135 |
def vc_fn(
|
| 136 |
vc_audio_mode,
|
| 137 |
vc_input,
|
|
|
|
| 147 |
protect,
|
| 148 |
):
|
| 149 |
try:
|
| 150 |
+
if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
audio, sr = librosa.load(vc_input, sr=16000, mono=True)
|
|
|
|
| 152 |
elif vc_audio_mode == "Upload audio":
|
| 153 |
if vc_upload is None:
|
| 154 |
return "You need to upload an audio", None
|
| 155 |
sampling_rate, audio = vc_upload
|
| 156 |
duration = audio.shape[0] / sampling_rate
|
| 157 |
+
if duration > 20 and limitation:
|
| 158 |
return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
|
| 159 |
audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
|
| 160 |
if len(audio.shape) > 1:
|
| 161 |
audio = librosa.to_mono(audio.transpose(1, 0))
|
| 162 |
if sampling_rate != 16000:
|
| 163 |
audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
|
|
|
|
| 164 |
elif vc_audio_mode == "TTS Audio":
|
| 165 |
+
if len(tts_text) > 100 and limitation:
|
| 166 |
return "Text is too long", None
|
| 167 |
if tts_text is None or tts_voice is None:
|
| 168 |
return "You need to enter text and select a voice", None
|
| 169 |
asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
|
| 170 |
audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
|
| 171 |
vc_input = "tts.mp3"
|
|
|
|
|
|
|
| 172 |
times = [0, 0, 0]
|
| 173 |
f0_up_key = int(f0_up_key)
|
| 174 |
audio_opt = vc.pipeline(
|
|
|
|
| 181 |
f0_up_key,
|
| 182 |
f0_method,
|
| 183 |
file_index,
|
| 184 |
+
# file_big_npy,
|
| 185 |
index_rate,
|
| 186 |
if_f0,
|
| 187 |
filter_radius,
|
|
|
|
| 193 |
f0_file=None,
|
| 194 |
)
|
| 195 |
info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
|
| 196 |
+
print(f"{model_title} | {info}")
|
| 197 |
+
return info, (tgt_sr, audio_opt)
|
| 198 |
+
except:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
info = traceback.format_exc()
|
| 200 |
print(info)
|
| 201 |
+
return info, None
|
| 202 |
+
return vc_fn
|
| 203 |
|
| 204 |
def load_model():
|
| 205 |
categories = []
|