BreezyVoice

Running

App Files Files Community

BreezyVoice / app.py

JacobLinCool

Update app.py

7987018 verified 5 months ago

raw

history blame contribute delete

6.99 kB

	# Copyright (c) 2025 MediaTek Research Inc (authors: Chan-Jan Hsu)
	# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu, Liu Yue)
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import spaces

	import os
	import sys
	ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
	sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

	import argparse
	import gradio as gr
	import numpy as np
	import torch
	import torchaudio
	import random
	import librosa
	from transformers import pipeline
	import subprocess
	from scipy.signal import resample

	import logging
	logging.getLogger('matplotlib').setLevel(logging.WARNING)

	from cosyvoice.cli.cosyvoice import CosyVoice
	from cosyvoice.utils.file_utils import load_wav, speed_change

	#logging.basicConfig(level=logging.DEBUG,
	# format='%(asctime)s %(levelname)s %(message)s')

	def generate_seed():
	    """Draw a fresh random seed and wrap it in an update payload.

	    Returns:
	        A dict with ``"__type__"`` set to ``"update"`` and ``"value"`` set
	        to an integer drawn by ``random.randint`` from 1 to 100000000
	        (both ends inclusive).
	    """
	    # Returned as a plain dict so the caller can hand it straight to the
	    # output component as its new value.
	    return {"__type__": "update", "value": random.randint(1, 100000000)}

	def set_all_random_seed(seed):
	    """Seed every random number generator this app relies on.

	    Seeds, in order, Python's ``random`` module, NumPy's global generator,
	    PyTorch's default generator and the generators of all CUDA devices.

	    Args:
	        seed: Seed value. Anything ``int()`` accepts; integral floats such
	            as ``42.0`` are treated exactly like ``42``.

	    Raises:
	        TypeError: If ``seed`` cannot be converted to ``int`` (e.g. ``None``).
	        ValueError: If ``seed`` is a string that is not an integer literal.
	    """
	    # A UI number field may hand over a float; np.random.seed refuses
	    # non-integer input, so normalise once and use the same integer
	    # everywhere.
	    seed = int(seed)
	    random.seed(seed)
	    # np.random.seed only accepts values in [0, 2**32 - 1]; wrapping keeps
	    # in-range seeds unchanged and makes negative / huge seeds usable.
	    np.random.seed(seed % (2 ** 32))
	    torch.manual_seed(seed)
	    torch.cuda.manual_seed_all(seed)

	max_val = 0.8


	def postprocess(speech, top_db=60, hop_length=220, win_length=440):
	    """Trim, peak-limit and pad a prompt waveform.

	    The waveform is passed through ``librosa.effects.trim``, scaled down
	    so its absolute peak does not exceed the module-level ``max_val``, and
	    then extended with ``int(target_sr * 0.2)`` zero samples.

	    Args:
	        speech: Waveform tensor. It is concatenated along ``dim=1`` with a
	            ``(1, k)`` zero tensor, so it must have a single row.
	        top_db: Threshold forwarded to ``librosa.effects.trim``.
	        hop_length: Hop length forwarded to ``librosa.effects.trim``.
	        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

	    Returns:
	        The processed waveform tensor.

	    Note:
	        Reads the globals ``max_val`` and ``target_sr``; the latter is
	        assigned in the script's ``__main__`` section.
	    """
	    trimmed, _ = librosa.effects.trim(
	        speech,
	        top_db=top_db,
	        frame_length=win_length,
	        hop_length=hop_length,
	    )

	    # Only ever attenuate: quiet input is left at its original level.
	    peak = trimmed.abs().max()
	    if peak > max_val:
	        trimmed = trimmed / peak * max_val

	    silence_tail = torch.zeros(1, int(target_sr * 0.2))
	    return torch.concat([trimmed, silence_tail], dim=1)

	@spaces.GPU
	def generate_audio(tts_text, prompt_text, prompt_wav, seed, speed_factor=1.0):
	    """Synthesize ``tts_text`` in the voice of the prompt recording.

	    Args:
	        tts_text: Text to synthesize.
	        prompt_text: Transcript of the prompt recording.
	        prompt_wav: File path of the prompt recording. An empty value
	            (``None`` or ``""``) is rejected with a ``gr.Error``.
	        seed: Random seed applied via ``set_all_random_seed`` before
	            inference.
	        speed_factor: Playback speed multiplier, 1.0 (the default) leaves
	            the audio untouched. Other values resample the waveform to
	            ``len / speed_factor`` samples with ``scipy.signal.resample``.

	    Returns:
	        A ``(target_sr, samples)`` tuple where ``samples`` is a flat NumPy
	        array.

	    Raises:
	        gr.Error: If no prompt audio was supplied.
	        ValueError: If ``speed_factor`` is not positive.
	    """
	    # The audio input yields an empty value when nothing was uploaded or
	    # recorded; fail with a readable message instead of crashing in load_wav.
	    if not prompt_wav:
	        raise gr.Error("請先選擇或錄製 prompt 音訊。")
	    if speed_factor <= 0:
	        raise ValueError("speed_factor must be positive")

	    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
	    set_all_random_seed(seed)
	    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)

	    # Flatten first so that the length below counts samples rather than
	    # the leading dimension of the model output.
	    audio_data = output['tts_speech'].numpy().flatten()
	    if speed_factor != 1.0:
	        new_length = int(len(audio_data) / speed_factor)
	        audio_data = resample(audio_data, new_length)

	    return (target_sr, audio_data)


	@spaces.GPU
	def generate_text(prompt_wav):
	    """Transcribe the prompt recording with the global ASR pipeline.

	    Args:
	        prompt_wav: Value of the prompt audio input; passed unchanged to
	            ``asr_pipeline``.

	    Returns:
	        The ``'text'`` entry of the pipeline result, or the fixed message
	        ``"No valid input detected."`` when ``prompt_wav`` is falsy.
	    """
	    # Guard clause: nothing to transcribe.
	    if not prompt_wav:
	        return "No valid input detected."

	    transcription = asr_pipeline(prompt_wav)
	    return transcription['text']

	def main():
	    """Build the BreezyVoice Gradio interface and launch it.

	    The page is laid out in three steps: (1) prompt audio plus its
	    transcript, (2) the text to synthesize, (3) seed settings, the
	    generate button and the resulting audio. Changing the prompt audio
	    fills the transcript via ``generate_text``; the generate button calls
	    ``generate_audio``.
	    """
	    with gr.Blocks(title="BreezyVoice 語音合成系統", theme="default") as demo:
	        gr.Markdown(
	            """# BreezyVoice 語音合成系統

	#### Runs on Huggingface Zero GPU (H200)

	為了加快推理速度，g2pw 注音標註並未被啟動。"""
	        )

	        # All content arranged in a single column
	        with gr.Column():
	            # Step 1: prompt audio and its transcript
	            gr.Markdown("### 步驟 1. 音訊樣本輸入 & 音訊樣本文本輸入")
	            gr.Markdown("選擇 prompt 音訊檔案或錄製 prompt 音訊 (5~15秒)，並手動校對自動產生的音訊樣本文本。")
	            prompt_wav = gr.Audio(
	                type='filepath',
	                label='選擇 prompt 音訊檔案（確保取樣率不低於 16khz）或錄製 prompt 音訊'
	            )

	            with gr.Blocks():
	                prompt_text = gr.Textbox(
	                    label="音訊樣本文本輸入(此欄位應與音檔內容完全相同)",
	                    lines=2,
	                    placeholder="音訊樣本文本"
	                )

	            # Auto-transcribe whenever the user supplies new prompt audio
	            prompt_wav.input(
	                fn=generate_text,
	                inputs=[prompt_wav],
	                outputs=prompt_text
	            )

	            gr.Examples(
	                examples=[
	                    ["examples/commonvoice-example-1.mp3", "明月幾時有，去問氣象局"],
	                    ["examples/commonvoice-example-2.mp3", "雲林縣斗六市與林內鄉交界"],
	                    ["examples/commonvoice-example-3.mp3", "法律應保障所有的人獲得相同的發展結果"]
	                ],
	                inputs=[prompt_wav, prompt_text],
	                label="範例"
	            )

	            # Step 2: text to synthesize
	            gr.Markdown("### 步驟 2.合成文本輸入")
	            tts_text = gr.Textbox(
	                label="輸入想要合成的文本",
	                lines=2,
	                placeholder="請輸入想要合成的文本...",
	                value="我今天忙了一整天，現在好想睡覺喔 QQ"
	            )

	            # Step 3: advanced settings, trigger button and output
	            gr.Markdown("### 步驟 3. 合成音訊")

	            with gr.Accordion("進階設定", open=False):
	                # precision=0 makes the field deliver an int, which is what
	                # the seeding functions downstream require.
	                seed = gr.Number(value=0, label="隨機推理種子", precision=0)
	                seed_button = gr.Button(value="\U0001F3B2生成隨機推理種子\U0001F3B2")
	                # NOTE: speech-rate control is currently disabled, so no
	                # speed widget is created here.

	            generate_button = gr.Button("生成音訊")
	            audio_output = gr.Audio(label="合成音訊")

	        # Set up callbacks for seed generation and audio synthesis
	        seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
	        generate_button.click(
	            fn=generate_audio,
	            inputs=[tts_text, prompt_text, prompt_wav, seed],
	            outputs=audio_output
	        )

	    demo.launch()

	if __name__ == '__main__':
	    # Module-level state read by the handlers above. These names only
	    # exist when the file is executed as a script.

	    # Sample rates in Hz: prompt audio is loaded at prompt_sr, synthesized
	    # audio is returned at target_sr.
	    prompt_sr = 16000
	    target_sr = 22050
	    default_data = np.zeros(target_sr)

	    # Speech synthesis model
	    cosyvoice = CosyVoice('Splend1dchan/BreezyVoice')
	    sft_spk = cosyvoice.list_avaliable_spks()

	    # Speech recognition model used to pre-fill the prompt transcript
	    asr_pipeline = pipeline(
	        "automatic-speech-recognition",
	        model="openai/whisper-tiny",
	        tokenizer="openai/whisper-tiny",
	        device=0  # device index 0 (GPU); -1 would select the CPU
	    )

	    main()