xieli committed
Commit 29b1042 · 1 Parent(s): f21ec03

feat: fix paralinguistic, clone prompt, style prompt, input limit


feat: add log

feat: fix prompt

feat: fix

feat: add style tag

feat: remove bgm

feat: fix vq0206 token

feat: fix

Files changed (5)
  1. app.py +3 -3
  2. config/__init__.py +2 -2
  3. config/edit_config.py +2 -2
  4. config/prompts.py +10 -7
  5. tts.py +35 -88
app.py CHANGED
@@ -290,7 +290,7 @@ class EditxTab:
         self.logger.debug(f"Using previous audio from history, count: {len(state['history_audio'])}")
 
         # For para-linguistic, use generated_text; otherwise use source text
-        if edit_type not in {"para-linguistic"}:
+        if edit_type not in {"paralinguistic"}:
             generated_text = text_to_use
 
         # Use GPU inference with models loaded inside GPU context
@@ -355,14 +355,14 @@ class EditxTab:
         with gr.Row():
             with gr.Column():
                 self.model_input = gr.Textbox(label="Model Name", value="Step-Audio-EditX", scale=1)
-                self.prompt_text_input = gr.Textbox(label="Audio Text Content", value="", scale=1)
+                self.prompt_text_input = gr.Textbox(label="Prompt Text", value="", scale=1)
                 self.prompt_audio_input = gr.Audio(
                     sources=["upload", "microphone"],
                     format="wav",
                     type="filepath",
                     label="Input Audio",
                 )
-                self.generated_text = gr.Textbox(label="Clone Text", lines=1, max_lines=200)
+                self.generated_text = gr.Textbox(label="Target Text", lines=1, max_lines=200, max_length=100)
                 with gr.Row():
                     self.button_tts = gr.Button("CLONE")
                     self.button_edit = gr.Button("EDIT")
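Two details in this file: the set literal must use the same key spelling ("paralinguistic") as config/edit_config.py, otherwise the membership test can never match; and the relabeled Target Text box now carries the 100-character input limit from the commit title. A minimal sketch of the latter, assuming a Gradio version where gr.Textbox accepts max_length (the surrounding layout is illustrative, not the app's):

import gradio as gr

with gr.Blocks() as demo:
    # max_length caps user input at 100 characters in the UI,
    # mirroring the change to self.generated_text above
    target = gr.Textbox(label="Target Text", lines=1, max_lines=200, max_length=100)

demo.launch()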
config/__init__.py CHANGED
@@ -2,11 +2,11 @@
 Configuration module for Step-Audio
 """
 
-from .prompts import TTS_SYSTEM_PROMPTS, AUDIO_EDIT_SYSTEM_PROMPT
+from .prompts import AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL, AUDIO_EDIT_SYSTEM_PROMPT
 from .edit_config import get_supported_edit_types
 
 __all__ = [
-    'TTS_SYSTEM_PROMPTS',
+    'AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL',
     'AUDIO_EDIT_SYSTEM_PROMPT',
     'get_supported_edit_types'
 ]
config/edit_config.py CHANGED
@@ -23,10 +23,10 @@ def get_supported_edit_types():
             'generous', 'act_coy', 'warm', 'shy', 'comfort', 'authority',
             'chat', 'radio', 'soulful', 'story', 'vivid', 'program',
             'news', 'advertising', 'roar', 'murmur', 'shout', 'deeply', 'loudly',
-            'remove'
+            'remove', 'exaggerated'
         ],
         "vad": [],
         "denoise": [],
-        "para-linguistic": [],
+        "paralinguistic": [],
         "speed": ["faster", "slower", "more faster", "more slower"],
     }
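Since several call sites switch from "para-linguistic" to "paralinguistic" in this commit, a caller-side check against this table is the easiest way to catch a stale spelling. A hedged sketch, assuming get_supported_edit_types() returns the dict shown above (is_supported_edit is illustrative, not a repo function):

from config.edit_config import get_supported_edit_types

def is_supported_edit(edit_type: str, edit_info: str | None) -> bool:
    """Illustrative helper: validate a request against the supported-edit table."""
    table = get_supported_edit_types()
    if edit_type not in table:
        return False
    options = table[edit_type]
    # Edit types with an empty option list (vad, denoise, paralinguistic)
    # take no edit_info; the others require one of the listed values.
    return edit_info in options if options else edit_info is None

assert is_supported_edit("style", "exaggerated")       # added by this commit
assert not is_supported_edit("para-linguistic", None)  # old hyphenated key is gone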
config/prompts.py CHANGED
@@ -3,13 +3,16 @@
 Contains all TTS- and editing-related system prompts
 """
 
-# TTS-related system prompts
-TTS_SYSTEM_PROMPTS = {
-    "sys_prompt_for_rap": "请参考对话历史里的音色,用RAP方式将文本内容大声说唱出来。",
-    "sys_prompt_for_vocal": "请参考对话历史里的音色,用哼唱的方式将文本内容大声唱出来。",
-    "sys_prompt_wo_spk": '以自然的语速读出下面的文字。',
-    "sys_prompt_with_spk": '请用{}的声音尽可能自然地说出下面这些话。',
-}
+AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL = """Generate audio with the following timbre, prosody and speaking style
+
+[speaker_start]
+speaker name: {speaker}
+speaker prompt text:
+{prompt_text}
+speaker audio tokens:
+{prompt_wav_tokens}
+[speaker_end]
+"""
 
 AUDIO_EDIT_SYSTEM_PROMPT = """As a highly skilled audio editing and tuning specialist, you excel in interpreting user instructions and applying precise adjustments to meet their needs. Your expertise spans a wide range of enhancement capabilities, including but not limited to:
 # Emotional Enhancement
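For reference, tts.py fills this template with str.format (see _encode_audio_edit_clone_prompt in the next file). A minimal sketch with placeholder values, where the speaker id and token string are hypothetical:

from config.prompts import AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL

# Placeholder values for illustration only
prompt = AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL.format(
    speaker="spk_0001",                       # hypothetical voice id
    prompt_text="Hello, this is a test.",     # transcript of the reference audio
    prompt_wav_tokens="<audio_13><audio_87>"  # hypothetical merged vq02/vq06 token string
)
print(prompt)  # renders the [speaker_start] ... [speaker_end] block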
tts.py CHANGED
@@ -13,7 +13,7 @@ from http import HTTPStatus
 import torchaudio
 
 from model_loader import model_loader, ModelSource
-from config.prompts import TTS_SYSTEM_PROMPTS, AUDIO_EDIT_SYSTEM_PROMPT
+from config.prompts import AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL, AUDIO_EDIT_SYSTEM_PROMPT
 from stepvocoder.cosyvoice2.cli.cosyvoice import CosyVoice
 from transformers.generation.logits_process import LogitsProcessor
 from transformers.generation.utils import LogitsProcessorList
@@ -101,40 +101,9 @@ class StepAudioTTS:
         )
 
         # Use system prompts from config module
-        self.tts_sys_prompt_dict = TTS_SYSTEM_PROMPTS
+        self.edit_clone_sys_prompt_tpl = AUDIO_EDIT_CLONE_SYSTEM_PROMPT_TPL
         self.edit_sys_prompt = AUDIO_EDIT_SYSTEM_PROMPT
 
-    def get_audio_tokens(self, input_audio_data_numpy, input_audio_sample_rate):
-        """
-        Extract audio tokens using audio_tokenizer
-
-        Args:
-            input_audio_data_numpy: Audio data as numpy array
-            input_audio_sample_rate: Sample rate of the audio
-
-        Returns:
-            str: Audio tokens as string
-        """
-        # Convert numpy array to tensor if needed
-        if isinstance(input_audio_data_numpy, torch.Tensor):
-            audio_tensor = input_audio_data_numpy
-        else:
-            audio_tensor = torch.from_numpy(input_audio_data_numpy).float()
-
-        # Ensure proper shape (add batch dimension if needed)
-        if len(audio_tensor.shape) == 1:
-            audio_tensor = audio_tensor.unsqueeze(0)
-
-        # Use the correct API: wav2token returns _, vq02_codes, vq06_codes
-        _, vq02_codes, vq06_codes = self.audio_tokenizer.wav2token(audio_tensor, input_audio_sample_rate)
-
-        # Merge VQ codes to token string
-        audio_tokens = self.audio_tokenizer.merge_vq0206_to_token_str(
-            vq02_codes, vq06_codes
-        )
-
-        return audio_tokens
-
     def clone(
         self,
         prompt_wav_path: str,
@@ -155,16 +124,19 @@
         try:
             logger.debug(f"Starting voice cloning: {prompt_wav_path}")
             prompt_wav, sample_rate = torchaudio.load(prompt_wav_path)
-            prompt_code, prompt_token, prompt_token_len, speech_feat, speech_feat_len, speech_embedding = (
+            vq0206_codes, vq02_codes_ori, vq06_codes_ori, speech_feat, speech_feat_len, speech_embedding = (
                 self.preprocess_prompt_wav(prompt_wav_path)
             )
             prompt_speaker = self.generate_clone_voice_id(prompt_text, prompt_wav)
-
-            token_ids = self._encode_audio_tts_prompt(
+            prompt_wav_tokens = self.audio_tokenizer.merge_vq0206_to_token_str(
+                vq02_codes_ori, vq06_codes_ori
+            )
+            token_ids = self._encode_audio_edit_clone_prompt(
                 target_text,
                 prompt_text,
                 prompt_speaker,
-                prompt_code,
+                vq0206_codes,
+                prompt_wav_tokens,
             )
 
             output_ids = self.llm.generate(
@@ -176,10 +148,11 @@
             )
             output_ids = output_ids[:, len(token_ids) : -1]  # skip eos token
             logger.debug("Voice cloning generation completed")
+            vq0206_codes_vocoder = torch.tensor([vq0206_codes], dtype=torch.long) - 65536
             return (
                 self.cosy_model.token2wav_nonstream(
                     output_ids - 65536,
-                    prompt_token,
+                    vq0206_codes_vocoder,
                     speech_feat.to(torch.bfloat16),
                     speech_embedding.to(torch.bfloat16),
                 ),
@@ -211,16 +184,16 @@
             Tuple[torch.Tensor, int]: Edited audio tensor and sample rate
         """
         try:
-            logger.debug(f"Starting audio editing: {edit_type} - {edit_info}")
-
-            # Load input audio
-            input_audio, sample_rate = torchaudio.load(input_audio_path)
-
-            # Get audio tokens
-            audio_tokens = self.get_audio_tokens(input_audio, sample_rate)
-
+            logger.debug(f"Starting audio editing: {edit_type} - {edit_info}")
+            vq0206_codes, vq02_codes_ori, vq06_codes_ori, speech_feat, _, speech_embedding = (
+                self.preprocess_prompt_wav(input_audio_path)
+            )
+            audio_tokens = self.audio_tokenizer.merge_vq0206_to_token_str(
+                vq02_codes_ori, vq06_codes_ori
+            )
             # Build instruction prefix based on edit type
             instruct_prefix = self._build_audio_edit_instruction(audio_text, edit_type, edit_info, text)
+            print(f"instruct_prefix: {instruct_prefix}")
 
             # Encode the complete prompt to token sequence
             prompt_tokens = self._encode_audio_edit_prompt(
@@ -238,15 +211,12 @@
                 logits_processor=LogitsProcessorList([RepetitionAwareLogitsProcessor()]),
             )
             output_ids = output_ids[:, len(prompt_tokens) : -1]  # skip eos token
-
-            _, prompt_token, _, speech_feat, _, speech_embedding = (
-                self.preprocess_prompt_wav(input_audio_path)
-            )
+            vq0206_codes_vocoder = torch.tensor([vq0206_codes], dtype=torch.long) - 65536
             logger.debug("Audio editing generation completed")
             return (
                 self.cosy_model.token2wav_nonstream(
                     output_ids - 65536,
-                    prompt_token,
+                    vq0206_codes_vocoder,
                     speech_feat.to(torch.bfloat16),
                     speech_embedding.to(torch.bfloat16),
                 ),
@@ -285,16 +255,12 @@
         elif edit_type == "style":
             if edit_info == "remove":
                 instruct_prefix = f"Remove any speaking styles in the following audio and the reference text is: {audio_text}\n"
-            elif edit_info in {"exaggerated","ethereal","whisper","act_coy","older"}:
-                instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
             else:
-                instruct_prefix=f"Make the following audio more {edit_info}. The text corresponding to the audio is: {audio_text}\n"
+                instruct_prefix = f"Make the following audio more {edit_info} style. The text corresponding to the audio is: {audio_text}\n"
         elif edit_type == "denoise":
             instruct_prefix = f"Remove any noise from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all noise from the audio."
         elif edit_type == "vad":
             instruct_prefix = f"Remove any silent portions from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all silence from the audio."
-        elif edit_type == "bgm":
-            instruct_prefix = f"Remove any background music (BGM) from the given audio while preserving the voice content clearly. Ensure that the speech quality remains intact with minimal distortion, and eliminate all BGM from the audio."
         elif edit_type == "paralinguistic":
             instruct_prefix = f"Add some non-verbal sounds to make the audio more natural, the new text is : {text}\n The text corresponding to the audio is: {audio_text}\n"
         else:
@@ -331,30 +297,22 @@
         history.extend([4] + qrole_toks + human_turn_toks + [3] + [4] + arole_toks)
         return history
 
-    def _encode_audio_tts_prompt(
-        self, text: str, prompt_text: str, prompt_speaker: str, prompt_code: list
+    def _encode_audio_edit_clone_prompt(
+        self, text: str, prompt_text: str, prompt_speaker: str, prompt_code: list, prompt_wav_tokens: str
     ):
-        rap_or_vocal = self.detect_instruction_name(text) in ("RAP", "哼唱")
-
-        if rap_or_vocal:
-            if "哼唱" in text:
-                prompt = self.tts_sys_prompt_dict["sys_prompt_for_vocal"]
-            else:
-                prompt = self.tts_sys_prompt_dict["sys_prompt_for_rap"]
-        elif prompt_speaker:
-            prompt = self.tts_sys_prompt_dict["sys_prompt_with_spk"].format(prompt_speaker)
-        else:
-            prompt = self.tts_sys_prompt_dict["sys_prompt_wo_spk"]
-
+        prompt = self.edit_clone_sys_prompt_tpl.format(
+            speaker=prompt_speaker,
+            prompt_text=prompt_text,
+            prompt_wav_tokens=prompt_wav_tokens
+        )
+        print(f"edit clone prompt: {prompt}")
         sys_tokens = self.tokenizer.encode(f"system\n{prompt}")
 
         history = [1]
         history.extend([4] + sys_tokens + [3])
 
         _prefix_tokens = self.tokenizer.encode("\n")
-        prompt_token_encode = self.tokenizer.encode("\n" + prompt_text)
-        prompt_tokens = prompt_token_encode[len(_prefix_tokens) :]
-
+
         target_token_encode = self.tokenizer.encode("\n" + text)
         target_tokens = target_token_encode[len(_prefix_tokens) :]
 
@@ -364,14 +322,6 @@
         history.extend(
             [4]
            + qrole_toks
-            + prompt_tokens
-            + [3]
-            + [4]
-            + arole_toks
-            + prompt_code
-            + [3]
-            + [4]
-            + qrole_toks
             + target_tokens
             + [3]
             + [4]
@@ -410,20 +360,17 @@
         prompt_wav, prompt_wav_sr = torchaudio.load(prompt_wav_path)
         if prompt_wav.shape[0] > 1:
             prompt_wav = prompt_wav.mean(dim=0, keepdim=True)  # convert multi-channel audio to mono
-        prompt_token, prompt_token_len = self.cosy_model.frontend.extract_speech_token(
-            prompt_wav, prompt_wav_sr
-        )
         speech_feat, speech_feat_len = self.cosy_model.frontend.extract_speech_feat(
            prompt_wav, prompt_wav_sr
         )
         speech_embedding = self.cosy_model.frontend.extract_spk_embedding(
             prompt_wav, prompt_wav_sr
         )
-        prompt_code, _, _ = self.audio_tokenizer.wav2token(prompt_wav, prompt_wav_sr)
+        vq0206_codes, vq02_codes_ori, vq06_codes_ori = self.audio_tokenizer.wav2token(prompt_wav, prompt_wav_sr)
        return (
-            prompt_code,
-            prompt_token,
-            prompt_token_len,
+            vq0206_codes,
+            vq02_codes_ori,
+            vq06_codes_ori,
            speech_feat,
            speech_feat_len,
            speech_embedding,
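A note on the repeated `- 65536` above: the commit shifts both the LLM output ids and the prompt's vq0206 codes by the same constant before handing them to token2wav_nonstream, which suggests audio tokens occupy LLM vocabulary ids starting at 65536 while the vocoder expects raw VQ indices. A small sketch under that assumption (the helper name and example ids are illustrative, not from the repo):

import torch

AUDIO_TOKEN_OFFSET = 65536  # assumed base id of audio tokens in the LLM vocab

def to_vocoder_codes(llm_audio_ids: list[int]) -> torch.Tensor:
    """Illustrative helper: map LLM audio-token ids back to vocoder VQ indices."""
    return torch.tensor([llm_audio_ids], dtype=torch.long) - AUDIO_TOKEN_OFFSET

vq0206_codes = [65540, 65600, 65777]    # hypothetical merged vq02/vq06 ids
print(to_vocoder_codes(vq0206_codes))   # tensor([[  4,  64, 241]])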