Spaces:
Running
on
Zero
Running
on
Zero
Fix engine initialization and add model preload
Browse files
README.md
CHANGED
|
@@ -8,6 +8,9 @@ sdk_version: 5.36.2
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
short_description: Higgs Audio Demo
|
|
|
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
short_description: Higgs Audio Demo
|
| 11 |
+
preload_from_hub:
|
| 12 |
+
- "bosonai/higgs-audio-v2-generation-3B-staging"
|
| 13 |
+
- "bosonai/higgs-audio-v2-tokenizer-staging"
|
| 14 |
---
|
| 15 |
|
| 16 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -15,7 +15,7 @@ import time
|
|
| 15 |
from functools import lru_cache
|
| 16 |
import re
|
| 17 |
import spaces
|
| 18 |
-
|
| 19 |
|
| 20 |
# Import HiggsAudio components
|
| 21 |
from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
|
|
@@ -64,12 +64,7 @@ PREDEFINED_EXAMPLES = {
|
|
| 64 |
"It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
|
| 65 |
"And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
|
| 66 |
"\n"
|
| 67 |
-
"So here's the big question: Do you want to understand how deep learning works?\n"
|
| 68 |
-
"How to use it to build powerful models that can predict, automate, and transform industries?\n"
|
| 69 |
-
"Well, today, I've got some exciting news for you.\n"
|
| 70 |
-
"\n"
|
| 71 |
-
"We're going to talk about a course that I highly recommend: Dive into Deep Learning.\n"
|
| 72 |
-
"It's not just another course; it's an entire experience that will take you from a beginner to someone who is well-versed in deep learning techniques.",
|
| 73 |
"description": "Single speaker example",
|
| 74 |
},
|
| 75 |
"single-speaker-zh": {
|
|
@@ -80,7 +75,6 @@ PREDEFINED_EXAMPLES = {
|
|
| 80 |
"<|scene_desc_end|>",
|
| 81 |
"input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
|
| 82 |
"今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
|
| 83 |
-
"无论你是开发者, 数据科学爱好者, 还是只是对人工智能感兴趣的人都一定听说过这个词. 它已经成为AI时代的一个研究热点.\n"
|
| 84 |
"那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
|
| 85 |
"或者说, 你能察觉到我其实是个机器人吗?",
|
| 86 |
"description": "Single speaker with Chinese text",
|
|
@@ -95,6 +89,11 @@ def encode_audio_file(file_path):
|
|
| 95 |
return base64.b64encode(audio_file.read()).decode("utf-8")
|
| 96 |
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def load_voice_presets():
|
| 99 |
"""Load the voice presets from the voice_examples directory."""
|
| 100 |
try:
|
|
@@ -127,14 +126,15 @@ def get_voice_present(voice_preset):
|
|
| 127 |
|
| 128 |
|
| 129 |
@spaces.GPU
|
| 130 |
-
def initialize_engine(model_path, audio_tokenizer_path
|
| 131 |
"""Initialize the HiggsAudioServeEngine."""
|
| 132 |
global engine
|
| 133 |
try:
|
|
|
|
| 134 |
engine = HiggsAudioServeEngine(
|
| 135 |
model_name_or_path=model_path,
|
| 136 |
audio_tokenizer_name_or_path=audio_tokenizer_path,
|
| 137 |
-
device=
|
| 138 |
)
|
| 139 |
logger.info(f"Successfully initialized HiggsAudioServeEngine with model: {model_path}")
|
| 140 |
return True
|
|
@@ -217,10 +217,7 @@ def text_to_speech(
|
|
| 217 |
global engine
|
| 218 |
|
| 219 |
if engine is None:
|
| 220 |
-
|
| 221 |
-
logger.error(error_msg)
|
| 222 |
-
gr.Error(error_msg)
|
| 223 |
-
return f"❌ {error_msg}", None
|
| 224 |
|
| 225 |
try:
|
| 226 |
# Prepare ChatML sample
|
|
@@ -482,18 +479,6 @@ def main():
|
|
| 482 |
global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
|
| 483 |
|
| 484 |
parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
|
| 485 |
-
parser.add_argument(
|
| 486 |
-
"--model-path",
|
| 487 |
-
type=str,
|
| 488 |
-
default=DEFAULT_MODEL_PATH,
|
| 489 |
-
help="Path to the Higgs Audio model.",
|
| 490 |
-
)
|
| 491 |
-
parser.add_argument(
|
| 492 |
-
"--audio-tokenizer-path",
|
| 493 |
-
type=str,
|
| 494 |
-
default=DEFAULT_AUDIO_TOKENIZER_PATH,
|
| 495 |
-
help="Path to the audio tokenizer.",
|
| 496 |
-
)
|
| 497 |
parser.add_argument(
|
| 498 |
"--device",
|
| 499 |
type=str,
|
|
@@ -507,13 +492,10 @@ def main():
|
|
| 507 |
args = parser.parse_args()
|
| 508 |
|
| 509 |
# Update default values if provided via command line
|
| 510 |
-
DEFAULT_MODEL_PATH = args.model_path
|
| 511 |
-
DEFAULT_AUDIO_TOKENIZER_PATH = args.audio_tokenizer_path
|
| 512 |
VOICE_PRESETS = load_voice_presets()
|
| 513 |
|
| 514 |
# Load model on startup
|
| 515 |
-
|
| 516 |
-
result = initialize_engine(args.model_path, args.audio_tokenizer_path, args.device)
|
| 517 |
|
| 518 |
# Exit if model loading failed
|
| 519 |
if not result:
|
|
|
|
| 15 |
from functools import lru_cache
|
| 16 |
import re
|
| 17 |
import spaces
|
| 18 |
+
import torch
|
| 19 |
|
| 20 |
# Import HiggsAudio components
|
| 21 |
from higgs_audio.serve.serve_engine import HiggsAudioServeEngine
|
|
|
|
| 64 |
"It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
|
| 65 |
"And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
|
| 66 |
"\n"
|
| 67 |
+
"So here's the big question: Do you want to understand how deep learning works?\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
"description": "Single speaker example",
|
| 69 |
},
|
| 70 |
"single-speaker-zh": {
|
|
|
|
| 75 |
"<|scene_desc_end|>",
|
| 76 |
"input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
|
| 77 |
"今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
|
|
|
|
| 78 |
"那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
|
| 79 |
"或者说, 你能察觉到我其实是个机器人吗?",
|
| 80 |
"description": "Single speaker with Chinese text",
|
|
|
|
| 89 |
return base64.b64encode(audio_file.read()).decode("utf-8")
|
| 90 |
|
| 91 |
|
| 92 |
+
def get_current_device():
|
| 93 |
+
"""Get the current device."""
|
| 94 |
+
return "cuda" if torch.cuda.is_available() else "cpu"
|
| 95 |
+
|
| 96 |
+
|
| 97 |
def load_voice_presets():
|
| 98 |
"""Load the voice presets from the voice_examples directory."""
|
| 99 |
try:
|
|
|
|
| 126 |
|
| 127 |
|
| 128 |
@spaces.GPU
|
| 129 |
+
def initialize_engine(model_path, audio_tokenizer_path) -> bool:
|
| 130 |
"""Initialize the HiggsAudioServeEngine."""
|
| 131 |
global engine
|
| 132 |
try:
|
| 133 |
+
logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
|
| 134 |
engine = HiggsAudioServeEngine(
|
| 135 |
model_name_or_path=model_path,
|
| 136 |
audio_tokenizer_name_or_path=audio_tokenizer_path,
|
| 137 |
+
device=get_current_device(),
|
| 138 |
)
|
| 139 |
logger.info(f"Successfully initialized HiggsAudioServeEngine with model: {model_path}")
|
| 140 |
return True
|
|
|
|
| 217 |
global engine
|
| 218 |
|
| 219 |
if engine is None:
|
| 220 |
+
initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
try:
|
| 223 |
# Prepare ChatML sample
|
|
|
|
| 479 |
global DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH, VOICE_PRESETS
|
| 480 |
|
| 481 |
parser = argparse.ArgumentParser(description="Gradio UI for Text-to-Speech using HiggsAudioServeEngine")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 482 |
parser.add_argument(
|
| 483 |
"--device",
|
| 484 |
type=str,
|
|
|
|
| 492 |
args = parser.parse_args()
|
| 493 |
|
| 494 |
# Update default values if provided via command line
|
|
|
|
|
|
|
| 495 |
VOICE_PRESETS = load_voice_presets()
|
| 496 |
|
| 497 |
# Load model on startup
|
| 498 |
+
result = initialize_engine(DEFAULT_MODEL_PATH, DEFAULT_AUDIO_TOKENIZER_PATH)
|
|
|
|
| 499 |
|
| 500 |
# Exit if model loading failed
|
| 501 |
if not result:
|