xieli committed
Commit 781d823 · 1 Parent(s): 245708d

feat: support whisper asr


feat: default enable auto transcribe
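For context, the flag is registered as store_true but its default is then forced to True, so auto transcription stays on unless the code is changed. A minimal standalone sketch of how those argparse defaults resolve (it mirrors the setup added to app.py below):

# Standalone sketch; mirrors the --enable-auto-transcribe setup in app.py below.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--enable-auto-transcribe", action="store_true")
parser.set_defaults(enable_auto_transcribe=True)

print(parser.parse_args([]).enable_auto_transcribe)                            # True: enabled with no flag
print(parser.parse_args(["--enable-auto-transcribe"]).enable_auto_transcribe)  # True: passing the flag changes nothing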

Files changed (2)
  1. app.py +54 -1
  2. whisper_wrapper.py +75 -0
app.py CHANGED
@@ -28,6 +28,7 @@ from tokenizer import StepAudioTokenizer
 from tts import StepAudioTTS
 from model_loader import ModelSource
 from config.edit_config import get_supported_edit_types
+from whisper_wrapper import WhisperWrapper
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
@@ -36,12 +37,13 @@ logger = logging.getLogger(__name__)
 # Global variables for ZeroGPU-optimized loading
 encoder = None
 common_tts_engine = None
+whisper_asr = None
 args_global = None
 _model_lock = threading.Lock()  # Thread lock for model initialization
 
 def initialize_models():
     """Initialize models on first GPU call (ZeroGPU optimization: load inside GPU context)"""
-    global encoder, common_tts_engine, args_global
+    global encoder, common_tts_engine, whisper_asr, args_global
 
     # Fast path: check if already initialized (without lock)
     if common_tts_engine is not None:
@@ -87,6 +89,12 @@ def initialize_models():
         device_map=args_global.device_map,
     )
     logger.info("✓ StepCommonAudioTTS loaded")
+
+    # Initialize Whisper ASR (load outside GPU context, lighter model)
+    if whisper_asr is None:
+        whisper_asr = WhisperWrapper()
+        logger.info("✓ WhisperWrapper loaded")
+
     print("Models initialized inside GPU context.")
 
 if ZEROGPU_AVAILABLE:
@@ -178,6 +186,7 @@ class EditxTab:
         self.args = args
         self.edit_type_list = list(get_supported_edit_types().keys())
         self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+        self.enable_auto_transcribe = getattr(args, 'enable_auto_transcribe', False)
 
     def history_messages_to_show(self, messages):
         """Convert message history to gradio chatbot format"""
@@ -415,6 +424,14 @@ class EditxTab:
             outputs=self.edit_info,
         )
 
+        # Add audio transcription event only if enabled
+        if self.enable_auto_transcribe:
+            self.prompt_audio_input.change(
+                fn=self.transcribe_audio,
+                inputs=[self.prompt_audio_input, self.prompt_text_input],
+                outputs=self.prompt_text_input,
+            )
+
     def update_edit_info(self, category):
         """Update sub-task dropdown based on main task selection"""
         category_items = get_supported_edit_types()
@@ -422,6 +439,36 @@
         value = None if len(choices) == 0 else choices[0]
         return gr.Dropdown(label="Sub-task", choices=choices, value=value)
 
+    def transcribe_audio(self, audio_input, current_text):
+        """Transcribe audio using Whisper ASR when prompt text is empty"""
+        global whisper_asr
+
+        # Only transcribe if current text is empty
+        if current_text and current_text.strip():
+            return current_text  # Keep existing text
+
+        if not audio_input:
+            return ""  # No audio to transcribe
+
+        try:
+            # Initialize whisper if not already loaded
+            if whisper_asr is None:
+                if args_global is None:
+                    self.logger.error("Global args not set. Cannot initialize Whisper.")
+                    return ""
+
+                whisper_asr = WhisperWrapper()
+                self.logger.info("✓ WhisperWrapper initialized for ASR")
+
+            # Transcribe audio
+            transcribed_text = whisper_asr(audio_input)
+            self.logger.info(f"Audio transcribed: {transcribed_text}")
+            return transcribed_text
+
+        except Exception as e:
+            self.logger.error(f"Failed to transcribe audio: {e}")
+            return ""
+
 
 def launch_demo(args, editx_tab):
     """Launch the gradio demo"""
@@ -503,6 +550,12 @@ if __name__ == "__main__":
         default="cuda",
         help="Device mapping for model loading (default: cuda)"
     )
+    parser.add_argument(
+        "--enable-auto-transcribe",
+        action="store_true",
+        help="Enable automatic audio transcription when uploading audio files (default: enabled)"
+    )
+    parser.set_defaults(enable_auto_transcribe=True)
 
     args = parser.parse_args()
 
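Not part of the commit, but a minimal standalone sketch of the same wiring for reviewers who want to try it in isolation: a Gradio Audio.change event that fills the prompt textbox only when it is empty. The component names and the stub transcriber are hypothetical; in the app the handler is EditxTab.transcribe_audio, backed by WhisperWrapper.

# Standalone sketch of the auto-transcribe wiring; stub ASR and hypothetical component names.
import gradio as gr

def fake_transcribe(audio_path, current_text):
    # Mirrors transcribe_audio(): keep existing text, only fill the box when it is empty.
    if current_text and current_text.strip():
        return current_text
    if not audio_path:
        return ""
    return f"[transcript of {audio_path}]"  # the real handler calls whisper_asr(audio_path)

with gr.Blocks() as demo:
    prompt_audio = gr.Audio(type="filepath", label="Prompt audio")
    prompt_text = gr.Textbox(label="Prompt text")
    prompt_audio.change(fn=fake_transcribe, inputs=[prompt_audio, prompt_text], outputs=prompt_text)

demo.launch()

Uploading a clip fills the textbox once; anything the user has already typed is left untouched.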
 
whisper_wrapper.py ADDED
@@ -0,0 +1,75 @@
+import logging
+import torch
+import torchaudio
+from transformers import pipeline
+
+
+class WhisperWrapper:
+    """Simplified Whisper ASR wrapper"""
+
+    def __init__(self, model_id="openai/whisper-large-v3"):
+        """
+        Initialize WhisperWrapper
+
+        Args:
+            model_id: Whisper model ID, default uses openai/whisper-large-v3
+        """
+        self.logger = logging.getLogger(__name__)
+        self.model = None
+
+        try:
+            self.model = pipeline("automatic-speech-recognition", model=model_id)
+            self.logger.info(f"✓ Whisper model loaded successfully: {model_id}")
+        except Exception as e:
+            self.logger.error(f"❌ Failed to load Whisper model: {e}")
+            raise
+
+    def __call__(self, audio_input, sample_rate=16000):
+        """
+        Audio to text transcription
+
+        Args:
+            audio_input: Audio file path or audio tensor (tensors are assumed to be sampled at sample_rate Hz)
+
+        Returns:
+            Transcribed text
+        """
+        if self.model is None:
+            raise RuntimeError("Whisper model not loaded")
+
+        try:
+            # Load audio
+            if isinstance(audio_input, str):
+                # Audio file path
+                audio, audio_sr = torchaudio.load(audio_input)
+                audio = torchaudio.functional.resample(audio, audio_sr, 16000)
+                # Handle stereo to mono conversion (pipeline may not handle this)
+                if audio.shape[0] > 1:
+                    audio = audio.mean(dim=0, keepdim=True)  # Convert stereo to mono by averaging
+                # Convert to numpy and squeeze
+                audio = audio.squeeze(0).numpy()
+            elif isinstance(audio_input, torch.Tensor):
+                # Tensor input; resample from the caller-provided sample_rate
+                audio = audio_input.cpu()
+                audio = torchaudio.functional.resample(audio, sample_rate, 16000)
+                # Handle stereo to mono conversion
+                if audio.ndim > 1 and audio.shape[0] > 1:
+                    audio = audio.mean(dim=0, keepdim=True)
+                audio = audio.squeeze().numpy()
+            else:
+                raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
+
+            # Transcribe
+            result = self.model(audio)
+            text = result.get("text", "").strip() if isinstance(result, dict) else str(result).strip()
+
+            self.logger.debug(f"Transcription result: {text}")
+            return text
+
+        except Exception as e:
+            self.logger.error(f"Audio transcription failed: {e}")
+            return ""
+
+    def is_available(self):
+        """Check if whisper model is available"""
+        return self.model is not None
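A quick way to smoke-test the new wrapper outside the app could look like the sketch below; sample.wav is a placeholder path, and constructing the wrapper downloads the openai/whisper-large-v3 weights through transformers.pipeline.

# Hypothetical smoke test; "sample.wav" is a placeholder audio file.
from whisper_wrapper import WhisperWrapper

asr = WhisperWrapper()            # loads openai/whisper-large-v3 via transformers.pipeline
if asr.is_available():
    text = asr("sample.wav")      # file-path input: loaded with torchaudio, resampled to 16 kHz mono
    print(text or "(empty transcription; check the audio file and the logs)")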