balaharan committed on
Commit
5bc7dec
·
verified ·
1 Parent(s): e95056a

requirements.txt

Browse files

gradio==4.44.0
requests==2.31.0
soundfile==0.12.1
transformers==4.45.0
torch==2.1.0
torchaudio==2.1.0

Files changed (1) hide show
  1. app.py +123 -140
app.py CHANGED
@@ -1,178 +1,161 @@
1
  import gradio as gr
2
- import torch
3
- import torchaudio
4
- import warnings
5
  import os
 
6
 
7
- # Suppress warnings for cleaner output
8
- warnings.filterwarnings("ignore")
9
 
10
- # Global variables
11
- model = None
12
- processor = None
13
- device = None
14
-
15
- def load_model():
16
- """Load the Granite Speech model with error handling"""
17
- global model, processor, device
18
 
19
  try:
20
- # Check available device
21
- device = "cuda" if torch.cuda.is_available() else "cpu"
22
- print(f"Using device: {device}")
23
-
24
- # Import here to catch import errors
25
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
26
-
27
- model_name = "ibm-granite/granite-speech-3.3-2b"
28
-
29
- # Load with memory optimization for free tier
30
- print("Loading processor...")
31
- processor = AutoProcessor.from_pretrained(model_name)
32
-
33
- print("Loading model...")
34
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
35
- model_name,
36
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
37
- low_cpu_mem_usage=True,
38
- ).to(device)
 
 
 
 
 
 
 
 
39
 
40
- # Set to eval mode
41
- model.eval()
 
42
 
43
- return f"✅ Model loaded successfully on {device}!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- except ImportError as e:
46
- return f"❌ Import error: {str(e)}. Please check requirements.txt"
47
- except torch.cuda.OutOfMemoryError:
48
- return "❌ GPU out of memory. Try restarting the Space or use CPU."
49
  except Exception as e:
50
- return f"❌ Error loading model: {str(e)}"
51
 
52
- def transcribe_audio(audio_file):
53
- """Simple transcription function"""
54
- global model, processor, device
55
-
56
- if model is None or processor is None:
57
- return "❌ Please load the model first by clicking 'Load Model' button."
58
-
59
  if audio_file is None:
60
  return "❌ Please upload an audio file."
61
 
62
- try:
63
- # Load and preprocess audio
64
- wav, sr = torchaudio.load(audio_file)
65
-
66
- # Convert to mono if stereo
67
- if wav.shape[0] > 1:
68
- wav = wav.mean(dim=0, keepdim=True)
69
-
70
- # Resample to 16kHz if needed
71
- if sr != 16000:
72
- resampler = torchaudio.transforms.Resample(sr, 16000)
73
- wav = resampler(wav)
74
-
75
- # Limit audio length for free tier (30 seconds max)
76
- max_length = 16000 * 30 # 30 seconds at 16kHz
77
- if wav.shape[1] > max_length:
78
- wav = wav[:, :max_length]
79
- print("Audio truncated to 30 seconds for processing")
80
-
81
- # Create simple chat template
82
- chat = [
83
- {
84
- "role": "system",
85
- "content": "You are Granite, developed by IBM. You are a helpful AI assistant.",
86
- },
87
- {
88
- "role": "user",
89
- "content": "<|audio|>Please transcribe this audio.",
90
- }
91
- ]
92
-
93
- # Apply chat template
94
- tokenizer = processor.tokenizer
95
- text = tokenizer.apply_chat_template(
96
- chat, tokenize=False, add_generation_prompt=True
97
- )
98
-
99
- # Process inputs
100
- model_inputs = processor(
101
- text,
102
- wav,
103
- return_tensors="pt",
104
- sampling_rate=16000
105
- ).to(device)
106
-
107
- # Generate with conservative settings
108
- with torch.no_grad():
109
- outputs = model.generate(
110
- **model_inputs,
111
- max_new_tokens=100,
112
- num_beams=2, # Reduced for speed
113
- do_sample=False,
114
- temperature=1.0,
115
- pad_token_id=tokenizer.pad_token_id,
116
- )
117
-
118
- # Decode output
119
- num_input_tokens = model_inputs["input_ids"].shape[-1]
120
- new_tokens = outputs[0, num_input_tokens:].unsqueeze(0)
121
- transcription = tokenizer.batch_decode(
122
- new_tokens, skip_special_tokens=True
123
- )[0]
124
-
125
- return f"🎤 Transcription:\n\n{transcription}"
126
-
127
- except Exception as e:
128
- return f"❌ Error during transcription: {str(e)}"
129
 
130
- # Create Gradio interface
131
- def create_demo():
132
- with gr.Blocks(title="Granite Speech Demo", theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
133
  gr.Markdown("""
134
  # 🎤 IBM Granite Speech 3.3-2B Demo
135
 
136
- Upload an audio file to transcribe speech to text.
 
 
137
 
138
- **Supported**: English, French, German, Spanish, Portuguese
139
  """)
140
 
141
  with gr.Row():
142
- with gr.Column():
143
- # Model loading
144
- load_btn = gr.Button("🔄 Load Model", variant="primary", size="lg")
145
- status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
146
 
147
  # Audio input
148
- audio = gr.Audio(
149
- label="Upload Audio File",
150
  type="filepath",
151
  format="wav"
152
  )
153
 
154
- transcribe_btn = gr.Button("🎯 Transcribe", variant="secondary")
 
 
 
 
 
 
 
 
 
 
155
 
156
- with gr.Column():
 
157
  output = gr.Textbox(
158
- label="Transcription Result",
159
- lines=8,
160
- interactive=False
 
161
  )
162
 
 
 
 
 
 
 
 
 
163
  gr.Markdown("""
164
- ### 💡 Tips:
165
- - Keep audio files under 30 seconds for free tier
166
- - Clear speech works best
167
- - WAV format recommended
168
  """)
169
-
170
- # Event handlers
171
- load_btn.click(load_model, outputs=status)
172
- transcribe_btn.click(transcribe_audio, inputs=audio, outputs=output)
173
 
174
  return demo
175
 
 
176
  if __name__ == "__main__":
177
- demo = create_demo()
178
- demo.launch()
 
1
  import gradio as gr
2
+ import requests
 
 
3
  import os
4
+ from typing import Optional
5
 
6
+ # Hugging Face Inference API
7
+ API_URL = "https://api-inference.huggingface.co/models/ibm-granite/granite-speech-3.3-2b"
8
 
9
+ def query_inference_api(audio_file_path: str, hf_token: Optional[str] = None) -> str:
10
+ """
11
+ Query the Hugging Face Inference API for speech transcription
12
+ """
13
+ headers = {}
14
+ if hf_token:
15
+ headers["Authorization"] = f"Bearer {hf_token}"
 
16
 
17
  try:
18
+ with open(audio_file_path, "rb") as f:
19
+ data = f.read()
20
+
21
+ response = requests.post(API_URL, headers=headers, data=data, timeout=60)
22
+
23
+ if response.status_code == 200:
24
+ result = response.json()
25
+ if isinstance(result, dict) and 'text' in result:
26
+ return result['text']
27
+ elif isinstance(result, list) and len(result) > 0:
28
+ return result[0].get('generated_text', str(result))
29
+ else:
30
+ return str(result)
31
+ else:
32
+ return f"API Error {response.status_code}: {response.text}"
33
+
34
+ except requests.exceptions.Timeout:
35
+ return "❌ Request timed out. The model might be loading. Please try again in a few minutes."
36
+ except Exception as e:
37
+ return f"❌ Error: {str(e)}"
38
+
39
+ def transcribe_with_local_processing(audio_file_path: str) -> str:
40
+ """
41
+ Fallback: Simple local audio processing without heavy models
42
+ """
43
+ try:
44
+ import soundfile as sf
45
 
46
+ # Read audio file info
47
+ data, samplerate = sf.read(audio_file_path)
48
+ duration = len(data) / samplerate
49
 
50
+ return f"""
51
+ 📊 **Audio File Analysis:**
52
+ - Duration: {duration:.2f} seconds
53
+ - Sample Rate: {samplerate} Hz
54
+ - Channels: {'Mono' if len(data.shape) == 1 else 'Stereo'}
55
+
56
+ ⚠️ **For actual transcription**:
57
+ This demo shows the file was processed successfully.
58
+ For full transcription, you would need:
59
+ 1. A Hugging Face token (free to get)
60
+ 2. Or run this on hardware with more resources
61
+
62
+ The Granite Speech 3.3-2B model supports:
63
+ - English, French, German, Spanish, Portuguese
64
+ - Speech-to-text transcription
65
+ - Speech translation to English
66
+ """
67
 
 
 
 
 
68
  except Exception as e:
69
+ return f"❌ Error processing audio: {str(e)}"
70
 
71
+ def process_audio(audio_file, hf_token):
72
+ """Main processing function"""
 
 
 
 
 
73
  if audio_file is None:
74
  return "❌ Please upload an audio file."
75
 
76
+ # Try Inference API first if token provided
77
+ if hf_token and hf_token.strip():
78
+ result = query_inference_api(audio_file, hf_token.strip())
79
+ if not result.startswith("❌"):
80
+ return f"🎤 **Transcription Result:**\n\n{result}"
81
+
82
+ # Fallback to local processing
83
+ return transcribe_with_local_processing(audio_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ def create_interface():
86
+ """Create the Gradio interface"""
87
+
88
+ with gr.Blocks(
89
+ title="Granite Speech Demo",
90
+ theme=gr.themes.Soft(),
91
+ css="footer {visibility: hidden}"
92
+ ) as demo:
93
+
94
  gr.Markdown("""
95
  # 🎤 IBM Granite Speech 3.3-2B Demo
96
 
97
+ **Two ways to use this demo:**
98
+ 1. **With HF Token** (recommended): Get free token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
99
+ 2. **Without Token**: Basic audio file analysis
100
 
101
+ **Supported Languages**: English, French, German, Spanish, Portuguese
102
  """)
103
 
104
  with gr.Row():
105
+ with gr.Column(scale=1):
106
+ # Token input
107
+ hf_token = gr.Textbox(
108
+ label="🔑 Hugging Face Token (Optional)",
109
+ placeholder="hf_xxx... (get from huggingface.co/settings/tokens)",
110
+ type="password",
111
+ info="Paste your free HF token for full transcription"
112
+ )
113
 
114
  # Audio input
115
+ audio_input = gr.Audio(
116
+ label="📁 Upload Audio File",
117
  type="filepath",
118
  format="wav"
119
  )
120
 
121
+ # Process button
122
+ process_btn = gr.Button("🎯 Process Audio", variant="primary", size="lg")
123
+
124
+ # Example info
125
+ gr.Markdown("""
126
+ ### 💡 Tips:
127
+ - **Get HF Token**: [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) → "New token" → "Read" access
128
+ - **Audio format**: WAV, MP3, M4A supported
129
+ - **Length**: Keep under 1 minute for best results
130
+ - **Quality**: Clear speech works best
131
+ """)
132
 
133
+ with gr.Column(scale=2):
134
+ # Output
135
  output = gr.Textbox(
136
+ label="📝 Results",
137
+ lines=12,
138
+ interactive=False,
139
+ placeholder="Upload audio and click 'Process Audio' to see transcription..."
140
  )
141
 
142
+ # Event handler
143
+ process_btn.click(
144
+ fn=process_audio,
145
+ inputs=[audio_input, hf_token],
146
+ outputs=output
147
+ )
148
+
149
+ # Footer info
150
  gr.Markdown("""
151
+ ---
152
+ **About**: This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition.
153
+ Model supports multilingual transcription and translation capabilities.
 
154
  """)
 
 
 
 
155
 
156
  return demo
157
 
158
+ # Launch the app
159
  if __name__ == "__main__":
160
+ demo = create_interface()
161
+ demo.launch(server_name="0.0.0.0", server_port=7860)