Spaces:

balaharan
/

claude

Sleeping

App Files Files Community

balaharan committed on Sep 2

Commit

5bc7dec

verified ·

1 Parent(s): e95056a

requirements.txt

Browse files

gradio==4.44.0
requests==2.31.0
soundfile==0.12.1
transformers==4.45.0
torch==2.1.0
torchaudio==2.1.0

Files changed (1) hide show

app.py +123 -140

app.py CHANGED Viewed

@@ -1,178 +1,161 @@
 import gradio as gr
-import torch
-import torchaudio
-import warnings
 import os
-# Suppress warnings for cleaner output
-warnings.filterwarnings("ignore")
-# Global variables
-model = None
-processor = None
-device = None
-def load_model():
-    """Load the Granite Speech model with error handling"""
-    global model, processor, device
     try:
-        # Check available device
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {device}")
-        # Import here to catch import errors
-        from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-        model_name = "ibm-granite/granite-speech-3.3-2b"
-        # Load with memory optimization for free tier
-        print("Loading processor...")
-        processor = AutoProcessor.from_pretrained(model_name)
-        print("Loading model...")
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-            low_cpu_mem_usage=True,
-        ).to(device)
-        # Set to eval mode
-        model.eval()
-        return f"✅ Model loaded successfully on {device}!"
-    except ImportError as e:
-        return f"❌ Import error: {str(e)}. Please check requirements.txt"
-    except torch.cuda.OutOfMemoryError:
-        return "❌ GPU out of memory. Try restarting the Space or use CPU."
     except Exception as e:
-        return f"❌ Error loading model: {str(e)}"
-def transcribe_audio(audio_file):
-    """Simple transcription function"""
-    global model, processor, device
-    if model is None or processor is None:
-        return "❌ Please load the model first by clicking 'Load Model' button."
     if audio_file is None:
         return "❌ Please upload an audio file."
-    try:
-        # Load and preprocess audio
-        wav, sr = torchaudio.load(audio_file)
-        # Convert to mono if stereo
-        if wav.shape[0] > 1:
-            wav = wav.mean(dim=0, keepdim=True)
-        # Resample to 16kHz if needed
-        if sr != 16000:
-            resampler = torchaudio.transforms.Resample(sr, 16000)
-            wav = resampler(wav)
-        # Limit audio length for free tier (30 seconds max)
-        max_length = 16000 * 30  # 30 seconds at 16kHz
-        if wav.shape[1] > max_length:
-            wav = wav[:, :max_length]
-            print("Audio truncated to 30 seconds for processing")
-        # Create simple chat template
-        chat = [
-            {
-                "role": "system",
-                "content": "You are Granite, developed by IBM. You are a helpful AI assistant.",
-            },
-            {
-                "role": "user",
-                "content": "<|audio|>Please transcribe this audio.",
-            }
-        ]
-        # Apply chat template
-        tokenizer = processor.tokenizer
-        text = tokenizer.apply_chat_template(
-            chat, tokenize=False, add_generation_prompt=True
-        )
-        # Process inputs
-        model_inputs = processor(
-            text,
-            wav,
-            return_tensors="pt",
-            sampling_rate=16000
-        ).to(device)
-        # Generate with conservative settings
-        with torch.no_grad():
-            outputs = model.generate(
-                **model_inputs,
-                max_new_tokens=100,
-                num_beams=2,  # Reduced for speed
-                do_sample=False,
-                temperature=1.0,
-                pad_token_id=tokenizer.pad_token_id,
-            )
-        # Decode output
-        num_input_tokens = model_inputs["input_ids"].shape[-1]
-        new_tokens = outputs[0, num_input_tokens:].unsqueeze(0)
-        transcription = tokenizer.batch_decode(
-            new_tokens, skip_special_tokens=True
-        )[0]
-        return f"🎤 Transcription:\n\n{transcription}"
-    except Exception as e:
-        return f"❌ Error during transcription: {str(e)}"
-# Create Gradio interface
-def create_demo():
-    with gr.Blocks(title="Granite Speech Demo", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🎤 IBM Granite Speech 3.3-2B Demo
-        Upload an audio file to transcribe speech to text.
-        **Supported**: English, French, German, Spanish, Portuguese
         """)
         with gr.Row():
-            with gr.Column():
-                # Model loading
-                load_btn = gr.Button("🔄 Load Model", variant="primary", size="lg")
-                status = gr.Textbox(label="Status", interactive=False)
                 # Audio input
-                audio = gr.Audio(
-                    label="Upload Audio File",
                     type="filepath",
                     format="wav"
                 )
-                transcribe_btn = gr.Button("🎯 Transcribe", variant="secondary")
-            with gr.Column():
                 output = gr.Textbox(
-                    label="Transcription Result",
-                    lines=8,
-                    interactive=False
                 )
         gr.Markdown("""
-        ### 💡 Tips:
-        - Keep audio files under 30 seconds for free tier
-        - Clear speech works best
-        - WAV format recommended
         """)
-        # Event handlers
-        load_btn.click(load_model, outputs=status)
-        transcribe_btn.click(transcribe_audio, inputs=audio, outputs=output)
     return demo
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch()

 import gradio as gr
+import requests
 import os
+from typing import Optional
# Hugging Face Inference API endpoint for the ibm-granite/granite-speech-3.3-2b
# model; query_inference_api POSTs the raw audio bytes to this URL.
API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
    """Query the Hugging Face Inference API for speech transcription.

    The raw bytes of the audio file are POSTed to ``API_URL`` with a
    60-second timeout. This function never raises: every failure is turned
    into a message string that starts with "❌", which is the marker callers
    use to tell failures apart from transcriptions.

    Args:
        audio_file_path: Path of the audio file to upload.
        hf_token: Optional Hugging Face access token. When given, it is sent
            as a ``Bearer`` token in the ``Authorization`` header.

    Returns:
        The transcription text on success, otherwise an error message
        prefixed with "❌".
    """
    # Read the file on its own, before any network work, so an unreadable
    # path is reported immediately and the try bodies stay minimal.
    try:
        with open(audio_file_path, "rb") as f:
            data = f.read()
    except (OSError, TypeError, ValueError) as e:
        return f"❌ Error: {str(e)}"

    headers = {}
    if hf_token:
        headers["Authorization"] = f"Bearer {hf_token}"

    try:
        response = requests.post(API_URL, headers=headers, data=data, timeout=60)
    except requests.exceptions.Timeout:
        return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
    except Exception as e:
        # Boundary handler: any other transport failure becomes a message.
        return f"❌ Error: {str(e)}"

    if response.status_code != 200:
        # Prefixed with "❌" like every other failure so that callers checking
        # for that marker do not mistake an HTTP error for a transcription.
        return f"❌ API Error {response.status_code}: {response.text}"

    try:
        result = response.json()
    except ValueError as e:
        # A 200 response whose body is not valid JSON.
        return f"❌ Error: {str(e)}"

    # Accept both response shapes the original code handled: a dict with a
    # "text" key, or a list whose first item may carry "generated_text".
    if isinstance(result, dict) and "text" in result:
        return result["text"]
    if isinstance(result, list) and result:
        first = result[0]
        if isinstance(first, dict):
            return first.get("generated_text", str(result))
    return str(result)
def transcribe_with_local_processing(audio_file_path: str) -> str:
    """Fallback: describe the audio file locally without running a model.

    The file is decoded with ``soundfile`` and a short report (duration,
    sample rate, channel layout) is returned together with a note explaining
    what is needed for a real transcription. No speech recognition happens.

    Args:
        audio_file_path: Path of the audio file to inspect.

    Returns:
        The report text, or a message starting with "❌ Error processing
        audio:" if the file cannot be read (or ``soundfile`` is unavailable).
    """
    try:
        # Imported lazily so a missing soundfile install surfaces as an error
        # message from this function instead of breaking module import.
        import soundfile as sf

        # Read audio file info
        data, samplerate = sf.read(audio_file_path)
        duration = len(data) / samplerate

        # A 1-D array means a single channel; otherwise the second axis is
        # the channel count. Label by the real count so that audio with more
        # than two channels is not reported as "Stereo".
        channels = 1 if len(data.shape) == 1 else data.shape[1]
        channel_label = {1: "Mono", 2: "Stereo"}.get(channels, f"{channels} channels")

        return f"""
📊 **Audio File Analysis:**
- Duration: {duration:.2f} seconds
- Sample Rate: {samplerate} Hz
- Channels: {channel_label}
⚠️ **For actual transcription**:
This demo shows the file was processed successfully.
For full transcription, you would need:
1. A Hugging Face token (free to get)
2. Or run this on hardware with more resources
The Granite Speech 3.3-2B model supports:
- English, French, German, Spanish, Portuguese
- Speech-to-text transcription
- Speech translation to English
        """
    except Exception as e:
        return f"❌ Error processing audio: {str(e)}"
def process_audio(audio_file, hf_token):
    """Main processing function: transcribe via the API, else analyse locally.

    Args:
        audio_file: Path of the uploaded audio file, or ``None``/empty when
            nothing was uploaded.
        hf_token: Hugging Face token text entered by the user; may be
            ``None``, empty or whitespace-only, in which case the Inference
            API is skipped.

    Returns:
        A display string: the transcription when the API call succeeds;
        otherwise the local audio analysis, preceded by the API error
        message when an API attempt was made and failed.
    """
    if not audio_file:
        return "❌ Please upload an audio file."

    api_error = None
    token = hf_token.strip() if hf_token else ""

    # Try Inference API first if token provided
    if token:
        result = query_inference_api(audio_file, token)
        # The API helper reports failures as message strings: either prefixed
        # with "❌" or of the form "API Error <status>: ...". Both must be
        # treated as failures, not shown as a transcription.
        if not result.startswith(("❌", "API Error")):
            return f"🎤 **Transcription Result:**\n\n{result}"
        api_error = result if result.startswith("❌") else f"❌ {result}"

    # Fallback to local processing
    fallback = transcribe_with_local_processing(audio_file)
    if api_error is None:
        return fallback
    # Surface why the transcription failed instead of silently dropping it.
    return f"{api_error}\n\n{fallback}"
def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` layout with a token textbox, an audio upload and a
    process button in the left column, and a results textbox in the right
    column. Clicking the button calls ``process_audio`` with the audio path
    and the token, and writes its return value to the results textbox.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(
        title="Granite Speech Demo",
        theme=gr.themes.Soft(),
        # Custom CSS that hides the page footer.
        css="footer {visibility: hidden}"
    ) as demo:
        # Page header and usage instructions.
        gr.Markdown("""
        # 🎤 IBM Granite Speech 3.3-2B Demo
        **Two ways to use this demo:**
        1. **With HF Token** (recommended): Get free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
        2. **Without Token**: Basic audio file analysis
        **Supported Languages**: English, French, German, Spanish, Portuguese
        """)
        with gr.Row():
            # Left column: inputs and tips.
            with gr.Column(scale=1):
                # Token input (optional; without it process_audio only runs
                # the local audio analysis)
                hf_token = gr.Textbox(
                    label="🔑 Hugging Face Token (Optional)",
                    placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
                    type="password",
                    info="Paste your free HF token for full transcription"
                )
                # Audio input; type="filepath" means the handler receives a
                # path string rather than raw audio data
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    format="wav"
                )
                # Process button
                process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")
                # Example info
                gr.Markdown("""
                ### 💡 Tips:
                - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
                - **Audio format**: WAV, MP3, M4A supported
                - **Length**: Keep under 1 minute for best results
                - **Quality**: Clear speech works best
                """)
            # Right column: twice the relative width of the input column.
            with gr.Column(scale=2):
                # Output
                output = gr.Textbox(
                    label="📝 Results",
                    lines=12,
                    interactive=False,
                    placeholder="Upload audio and click 'Process Audio' to see transcription..."
                )
        # Event handler: the input order here must match the parameter order
        # of process_audio(audio_file, hf_token)
        process_btn.click(
            fn=process_audio,
            inputs=[audio_input, hf_token],
            outputs=output
        )
        # Footer info
        gr.Markdown("""
        ---
        **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
        Model supports multilingual transcription and translation capabilities.
        """)
    return demo
# Launch the app when executed as a script: build the UI, then serve it on
# all network interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(server_name="0.0.0.0", server_port=7860)