Gary Simmons
commited on
Commit
·
5e5f9d1
1
Parent(s):
5557a0e
add YouTube video analysis tools and audio transcription capabilities, including documentation and test scripts
Browse files- README.md +49 -0
- app.py +10 -15
- docs/youtube_analysis_guide.md +159 -0
- requirements.txt +2 -1
- scripts/youtube_demo.py +112 -0
- tests/test_transcription_tools.py +213 -0
- tests/test_transcription_tools_standalone.py +188 -0
- tests/test_youtube_tools.py +70 -0
- tools/__init__.py +11 -0
- tools/transcription_tools.py +31 -0
- tools/youtube_video_analyzer.py +274 -0
README.md
CHANGED
|
@@ -12,4 +12,53 @@ hf_oauth: true
|
|
| 12 |
hf_oauth_expiration_minutes: 480
|
| 13 |
---
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 12 |
hf_oauth_expiration_minutes: 480
|
| 13 |
---
|
| 14 |
|
| 15 |
+
# Agent with YouTube Video Analysis
|
| 16 |
+
|
| 17 |
+
This agent includes advanced YouTube video analysis capabilities using yt-dlp and OpenCV for frame extraction and analysis.
|
| 18 |
+
|
| 19 |
+
## Features
|
| 20 |
+
|
| 21 |
+
### YouTube Video Analysis Tools
|
| 22 |
+
|
| 23 |
+
The agent is equipped with two powerful YouTube video analysis tools:
|
| 24 |
+
|
| 25 |
+
#### 1. `analyze_youtube_video(video_url, max_frames=6, interval_seconds=45.0)`
|
| 26 |
+
- **Purpose**: Downloads a YouTube video and extracts frames at regular intervals for detailed analysis
|
| 27 |
+
- **Parameters**:
|
| 28 |
+
- `video_url`: YouTube video URL (e.g., https://www.youtube.com/watch?v=VIDEO_ID)
|
| 29 |
+
- `max_frames`: Maximum number of frames to extract (1-10, default: 6)
|
| 30 |
+
- `interval_seconds`: Time interval between extractions (minimum: 10s, default: 45s)
|
| 31 |
+
- **Returns**: JSON with video metadata, frame timestamps, and detailed descriptions of each frame
|
| 32 |
+
- **Use cases**: Content analysis, scene detection, video summarization, accessibility descriptions
|
| 33 |
+
|
| 34 |
+
#### 2. `get_youtube_video_info(video_url)`
|
| 35 |
+
- **Purpose**: Quickly retrieves video metadata without downloading
|
| 36 |
+
- **Parameters**:
|
| 37 |
+
- `video_url`: YouTube video URL
|
| 38 |
+
- **Returns**: JSON with title, duration, uploader, view count, description, and resolution
|
| 39 |
+
- **Use cases**: Video verification, content filtering, metadata collection
|
| 40 |
+
|
| 41 |
+
### Technical Implementation
|
| 42 |
+
|
| 43 |
+
- **Video Processing**: Uses yt-dlp for robust YouTube video downloading
|
| 44 |
+
- **Frame Extraction**: OpenCV for efficient frame extraction and processing
|
| 45 |
+
- **Image Processing**: PIL and numpy for frame manipulation and encoding
|
| 46 |
+
- **Analysis Ready**: Frames are prepared for image analysis (base64 encoded, resized)
|
| 47 |
+
- **Error Handling**: Comprehensive error handling for network issues, invalid URLs, and processing failures
|
| 48 |
+
|
| 49 |
+
### Example Usage
|
| 50 |
+
|
| 51 |
+
The agent can answer questions like:
|
| 52 |
+
- "Analyze this YouTube video and tell me what happens: [URL]"
|
| 53 |
+
- "Extract 5 frames from this video every 60 seconds: [URL]"
|
| 54 |
+
- "What is the title and duration of this video: [URL]"
|
| 55 |
+
- "Describe the visual content of this tutorial video: [URL]"
|
| 56 |
+
|
| 57 |
+
### Dependencies
|
| 58 |
+
|
| 59 |
+
- yt-dlp: YouTube video downloading
|
| 60 |
+
- opencv-python: Computer vision and frame extraction
|
| 61 |
+
- PIL (Pillow): Image processing
|
| 62 |
+
- numpy: Numerical operations
|
| 63 |
+
|
| 64 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
|
@@ -17,6 +17,7 @@ from smolagents import (
|
|
| 17 |
LiteLLMModel,
|
| 18 |
tool,
|
| 19 |
)
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
# (Keep Constants as is)
|
|
@@ -178,21 +179,6 @@ model = RateLimitedModel(
|
|
| 178 |
)
|
| 179 |
|
| 180 |
|
| 181 |
-
@tool
|
| 182 |
-
def transcribe_audio(audio_bytes: bytes) -> str:
|
| 183 |
-
"""
|
| 184 |
-
Given an audio file (bytes), return the transcription (text).
|
| 185 |
-
|
| 186 |
-
Args:
|
| 187 |
-
audio_bytes: Raw bytes of the audio file to transcribe. Can be the full contents
|
| 188 |
-
of a WAV/MP3/OGG file or other common audio container. The function should
|
| 189 |
-
accept bytes and return the recognized text as a string.
|
| 190 |
-
"""
|
| 191 |
-
speech_tool = SpeechToTextTool()
|
| 192 |
-
transcription = speech_tool.transcribe(audio_bytes)
|
| 193 |
-
return transcription
|
| 194 |
-
|
| 195 |
-
|
| 196 |
class BasicAgent:
|
| 197 |
def __init__(self, name: str = "GGSAgent"):
|
| 198 |
self.name = name
|
|
@@ -204,6 +190,8 @@ class BasicAgent:
|
|
| 204 |
WikipediaSearchTool(),
|
| 205 |
SpeechToTextTool(),
|
| 206 |
transcribe_audio,
|
|
|
|
|
|
|
| 207 |
],
|
| 208 |
model=model,
|
| 209 |
max_steps=20,
|
|
@@ -223,6 +211,13 @@ class BasicAgent:
|
|
| 223 |
"time",
|
| 224 |
"threading",
|
| 225 |
"random",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
],
|
| 227 |
add_base_tools=True,
|
| 228 |
)
|
|
|
|
| 17 |
LiteLLMModel,
|
| 18 |
tool,
|
| 19 |
)
|
| 20 |
+
from tools import analyze_youtube_video, get_youtube_video_info, transcribe_audio
|
| 21 |
|
| 22 |
|
| 23 |
# (Keep Constants as is)
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
class BasicAgent:
|
| 183 |
def __init__(self, name: str = "GGSAgent"):
|
| 184 |
self.name = name
|
|
|
|
| 190 |
WikipediaSearchTool(),
|
| 191 |
SpeechToTextTool(),
|
| 192 |
transcribe_audio,
|
| 193 |
+
analyze_youtube_video,
|
| 194 |
+
get_youtube_video_info,
|
| 195 |
],
|
| 196 |
model=model,
|
| 197 |
max_steps=20,
|
|
|
|
| 211 |
"time",
|
| 212 |
"threading",
|
| 213 |
"random",
|
| 214 |
+
"cv2",
|
| 215 |
+
"numpy",
|
| 216 |
+
"PIL",
|
| 217 |
+
"base64",
|
| 218 |
+
"io",
|
| 219 |
+
"pathlib",
|
| 220 |
+
"subprocess",
|
| 221 |
],
|
| 222 |
add_base_tools=True,
|
| 223 |
)
|
docs/youtube_analysis_guide.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YouTube Video Analysis Tools Documentation
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
|
| 5 |
+
This project now includes powerful YouTube video analysis capabilities that allow the agent to:
|
| 6 |
+
|
| 7 |
+
1. **Extract metadata** from YouTube videos without downloading them
|
| 8 |
+
2. **Download videos** and extract frames at specified intervals
|
| 9 |
+
3. **Analyze visual content** of video frames
|
| 10 |
+
4. **Provide timestamped descriptions** of video content
|
| 11 |
+
|
| 12 |
+
## Tools Available
|
| 13 |
+
|
| 14 |
+
### 1. `get_youtube_video_info(video_url)`
|
| 15 |
+
|
| 16 |
+
**Purpose**: Quick metadata retrieval without downloading the video.
|
| 17 |
+
|
| 18 |
+
**Parameters**:
|
| 19 |
+
- `video_url` (str): YouTube video URL
|
| 20 |
+
|
| 21 |
+
**Returns**: JSON string containing:
|
| 22 |
+
- Video title, duration, uploader
|
| 23 |
+
- View count, upload date
|
| 24 |
+
- Resolution and description excerpt
|
| 25 |
+
- Status (success/error)
|
| 26 |
+
|
| 27 |
+
**Example Usage**:
|
| 28 |
+
```python
|
| 29 |
+
result = get_youtube_video_info("https://www.youtube.com/watch?v=VIDEO_ID")
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### 2. `analyze_youtube_video(video_url, max_frames=6, interval_seconds=45.0)`
|
| 33 |
+
|
| 34 |
+
**Purpose**: Full video analysis with frame extraction and description.
|
| 35 |
+
|
| 36 |
+
**Parameters**:
|
| 37 |
+
- `video_url` (str): YouTube video URL
|
| 38 |
+
- `max_frames` (int): Maximum frames to extract (1-10, default: 6)
|
| 39 |
+
- `interval_seconds` (float): Time between extractions (min: 10s, default: 45s)
|
| 40 |
+
|
| 41 |
+
**Returns**: JSON string containing:
|
| 42 |
+
- Video metadata
|
| 43 |
+
- Frame analyses with timestamps
|
| 44 |
+
- Detailed descriptions of visual content
|
| 45 |
+
- Extraction summary
|
| 46 |
+
|
| 47 |
+
**Example Usage**:
|
| 48 |
+
```python
|
| 49 |
+
result = analyze_youtube_video(
|
| 50 |
+
"https://www.youtube.com/watch?v=VIDEO_ID",
|
| 51 |
+
max_frames=5,
|
| 52 |
+
interval_seconds=30.0
|
| 53 |
+
)
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
## Agent Integration
|
| 57 |
+
|
| 58 |
+
The tools are integrated into the `BasicAgent` and can be used through natural language queries:
|
| 59 |
+
|
| 60 |
+
### Example Queries
|
| 61 |
+
|
| 62 |
+
1. **Video Information**:
|
| 63 |
+
- "What is the title and duration of this video: [URL]?"
|
| 64 |
+
- "Get information about this YouTube video: [URL]"
|
| 65 |
+
- "How many views does this video have: [URL]?"
|
| 66 |
+
|
| 67 |
+
2. **Content Analysis**:
|
| 68 |
+
- "Analyze this YouTube video and tell me what happens: [URL]"
|
| 69 |
+
- "Describe the visual content of this tutorial: [URL]"
|
| 70 |
+
- "What can you see in this video: [URL]?"
|
| 71 |
+
|
| 72 |
+
3. **Frame Extraction**:
|
| 73 |
+
- "Extract 5 frames from this video every 60 seconds: [URL]"
|
| 74 |
+
- "Show me frames from the beginning, middle, and end of this video: [URL]"
|
| 75 |
+
- "Analyze key moments in this video: [URL]"
|
| 76 |
+
|
| 77 |
+
## Technical Details
|
| 78 |
+
|
| 79 |
+
### Dependencies
|
| 80 |
+
- **yt-dlp**: YouTube video downloading
|
| 81 |
+
- **opencv-python**: Frame extraction and processing
|
| 82 |
+
- **PIL (Pillow)**: Image processing and encoding
|
| 83 |
+
- **numpy**: Numerical operations for image arrays
|
| 84 |
+
|
| 85 |
+
### Processing Pipeline
|
| 86 |
+
1. **Video Download**: yt-dlp downloads video in optimal quality (≤720p)
|
| 87 |
+
2. **Frame Extraction**: OpenCV extracts frames at specified intervals
|
| 88 |
+
3. **Image Processing**: Frames are resized (512px width) and converted to base64
|
| 89 |
+
4. **Analysis Ready**: Frames prepared for image analysis models
|
| 90 |
+
|
| 91 |
+
### Performance Considerations
|
| 92 |
+
- **Download Limits**: Videos are limited to ≤720p to reduce bandwidth
|
| 93 |
+
- **Frame Limits**: Maximum 10 frames to control processing time
|
| 94 |
+
- **Interval Limits**: Minimum 10 seconds between frames to avoid redundancy
|
| 95 |
+
- **Timeout Handling**: Robust error handling for network issues
|
| 96 |
+
|
| 97 |
+
### Error Handling
|
| 98 |
+
- Invalid YouTube URLs
|
| 99 |
+
- Network connectivity issues
|
| 100 |
+
- Video download failures
|
| 101 |
+
- Processing errors
|
| 102 |
+
- Unsupported video formats
|
| 103 |
+
|
| 104 |
+
## Usage Examples
|
| 105 |
+
|
| 106 |
+
### In Agent Conversations
|
| 107 |
+
|
| 108 |
+
**User**: "Can you analyze this YouTube video and tell me what it's about? https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
| 109 |
+
|
| 110 |
+
**Agent Response**: The agent will:
|
| 111 |
+
1. First get video metadata to understand duration and title
|
| 112 |
+
2. Extract frames at intervals throughout the video
|
| 113 |
+
3. Analyze each frame for visual content
|
| 114 |
+
4. Provide a comprehensive summary with timestamps
|
| 115 |
+
|
| 116 |
+
### Sample Output Structure
|
| 117 |
+
|
| 118 |
+
```json
|
| 119 |
+
{
|
| 120 |
+
"status": "success",
|
| 121 |
+
"video_info": {
|
| 122 |
+
"title": "Video Title",
|
| 123 |
+
"duration": "3:33",
|
| 124 |
+
"uploader": "Channel Name"
|
| 125 |
+
},
|
| 126 |
+
"analysis_summary": "Analyzed 6 frames from 'Video Title' (Duration: 3:33) at 30s intervals.",
|
| 127 |
+
"frames_extracted": 6,
|
| 128 |
+
"frame_analyses": [
|
| 129 |
+
{
|
| 130 |
+
"timestamp_seconds": 0,
|
| 131 |
+
"timestamp_formatted": "0:00",
|
| 132 |
+
"description": "Description of what's visible in the frame"
|
| 133 |
+
}
|
| 134 |
+
]
|
| 135 |
+
}
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Best Practices
|
| 139 |
+
|
| 140 |
+
1. **Start with video info** for unknown videos to check duration and content
|
| 141 |
+
2. **Use appropriate intervals** - shorter for action videos, longer for static content
|
| 142 |
+
3. **Limit frame count** for long videos to avoid excessive processing
|
| 143 |
+
4. **Handle errors gracefully** - network issues are common with video downloads
|
| 144 |
+
|
| 145 |
+
## Limitations
|
| 146 |
+
|
| 147 |
+
- Requires internet connection for video access
|
| 148 |
+
- Processing time depends on video length and quality
|
| 149 |
+
- Geographic restrictions may apply to some videos
|
| 150 |
+
- Rate limiting may occur with excessive usage
|
| 151 |
+
|
| 152 |
+
## Future Enhancements
|
| 153 |
+
|
| 154 |
+
Potential improvements could include:
|
| 155 |
+
- Integration with image analysis models for automated descriptions
|
| 156 |
+
- Audio transcription combined with visual analysis
|
| 157 |
+
- Scene change detection for intelligent frame selection
|
| 158 |
+
- Batch processing for multiple videos
|
| 159 |
+
- Caching mechanisms for frequently accessed videos
|
requirements.txt
CHANGED
|
@@ -13,4 +13,5 @@ wikipedia-api
|
|
| 13 |
yt-dlp
|
| 14 |
openai-whisper
|
| 15 |
torch
|
| 16 |
-
transformers
|
|
|
|
|
|
| 13 |
yt-dlp
|
| 14 |
openai-whisper
|
| 15 |
torch
|
| 16 |
+
transformers
|
| 17 |
+
opencv-python
|
scripts/youtube_demo.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Example usage of YouTube Video Analysis Tools
|
| 4 |
+
|
| 5 |
+
This script demonstrates how to use the YouTube video analysis tools
|
| 6 |
+
that have been added to the agent.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 14 |
+
from tools import analyze_youtube_video, get_youtube_video_info
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def demo_video_info():
|
| 18 |
+
"""Demonstrate getting video information."""
|
| 19 |
+
print("🎬 Getting Video Information")
|
| 20 |
+
print("-" * 40)
|
| 21 |
+
|
| 22 |
+
# Example with a well-known short video
|
| 23 |
+
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
result = get_youtube_video_info(video_url)
|
| 27 |
+
info = json.loads(result)
|
| 28 |
+
|
| 29 |
+
if info.get("status") == "success":
|
| 30 |
+
print(f"✅ Title: {info.get('title')}")
|
| 31 |
+
print(f"⏱️ Duration: {info.get('duration')}")
|
| 32 |
+
print(f"👤 Uploader: {info.get('uploader')}")
|
| 33 |
+
print(f"👁️ Views: {info.get('view_count'):,}")
|
| 34 |
+
print(f"📏 Resolution: {info.get('resolution')}")
|
| 35 |
+
else:
|
| 36 |
+
print(f"❌ Error: {info.get('error')}")
|
| 37 |
+
|
| 38 |
+
except Exception as e:
|
| 39 |
+
print(f"❌ Exception: {e}")
|
| 40 |
+
|
| 41 |
+
print("\n")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def demo_frame_analysis():
|
| 45 |
+
"""Demonstrate frame analysis (this would take longer)."""
|
| 46 |
+
print("🎞️ Frame Analysis Example")
|
| 47 |
+
print("-" * 40)
|
| 48 |
+
print("Note: This would download and analyze video frames.")
|
| 49 |
+
print("For demonstration, we'll show how to call it:")
|
| 50 |
+
print()
|
| 51 |
+
|
| 52 |
+
example_code = """
|
| 53 |
+
# Example usage:
|
| 54 |
+
result = analyze_youtube_video(
|
| 55 |
+
"https://www.youtube.com/watch?v=dQw4w9WgXcQ",
|
| 56 |
+
max_frames=3,
|
| 57 |
+
interval_seconds=60.0
|
| 58 |
+
)
|
| 59 |
+
analysis = json.loads(result)
|
| 60 |
+
|
| 61 |
+
if analysis.get('status') == 'success':
|
| 62 |
+
print(f"Video: {analysis['video_info']['title']}")
|
| 63 |
+
print(f"Frames analyzed: {analysis['frames_extracted']}")
|
| 64 |
+
|
| 65 |
+
for frame in analysis['frame_analyses']:
|
| 66 |
+
timestamp = frame['timestamp_formatted']
|
| 67 |
+
description = frame['description']
|
| 68 |
+
print(f" {timestamp}: {description}")
|
| 69 |
+
"""
|
| 70 |
+
|
| 71 |
+
print(example_code)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def demo_agent_integration():
|
| 75 |
+
"""Show how these tools integrate with the agent."""
|
| 76 |
+
print("🤖 Agent Integration")
|
| 77 |
+
print("-" * 40)
|
| 78 |
+
|
| 79 |
+
example_queries = [
|
| 80 |
+
"Get information about this YouTube video: https://www.youtube.com/watch?v=dQw4w9WgXcQ",
|
| 81 |
+
"Analyze this video and describe what happens in it: [YouTube URL]",
|
| 82 |
+
"Extract 5 frames from this tutorial video every 30 seconds: [YouTube URL]",
|
| 83 |
+
"What is the duration and title of this video: [YouTube URL]",
|
| 84 |
+
"Describe the visual content of this educational video: [YouTube URL]",
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
print("Example queries the agent can now handle:")
|
| 88 |
+
print()
|
| 89 |
+
for i, query in enumerate(example_queries, 1):
|
| 90 |
+
print(f"{i}. {query}")
|
| 91 |
+
|
| 92 |
+
print("\n" + "=" * 50)
|
| 93 |
+
print("The agent now has YouTube video analysis capabilities!")
|
| 94 |
+
print("Users can ask questions about YouTube videos and get:")
|
| 95 |
+
print("• Video metadata (title, duration, uploader)")
|
| 96 |
+
print("• Frame-by-frame visual analysis")
|
| 97 |
+
print("• Content summaries and descriptions")
|
| 98 |
+
print("• Timestamp-based scene analysis")
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
if __name__ == "__main__":
|
| 102 |
+
print("YouTube Video Analysis Tools - Demo")
|
| 103 |
+
print("=" * 50)
|
| 104 |
+
|
| 105 |
+
# Demo 1: Video info
|
| 106 |
+
demo_video_info()
|
| 107 |
+
|
| 108 |
+
# Demo 2: Frame analysis explanation
|
| 109 |
+
demo_frame_analysis()
|
| 110 |
+
|
| 111 |
+
# Demo 3: Agent integration
|
| 112 |
+
demo_agent_integration()
|
tests/test_transcription_tools.py
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for audio transcription tools
|
| 4 |
+
|
| 5 |
+
This script tests the audio transcription functionality.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
import io
|
| 11 |
+
import wave
|
| 12 |
+
import struct
|
| 13 |
+
import unittest
|
| 14 |
+
from unittest.mock import Mock, patch, MagicMock
|
| 15 |
+
|
| 16 |
+
# Add the parent directory to the path to import from tools
|
| 17 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 18 |
+
|
| 19 |
+
# Import the transcription tool directly to avoid YouTube tool dependencies
|
| 20 |
+
from tools.transcription_tools import transcribe_audio
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TestTranscriptionTools(unittest.TestCase):
|
| 24 |
+
"""Test cases for transcription tools."""
|
| 25 |
+
|
| 26 |
+
def setUp(self):
|
| 27 |
+
"""Set up test fixtures."""
|
| 28 |
+
# Create a simple WAV file in memory for testing
|
| 29 |
+
self.sample_audio_bytes = self._create_test_wav_bytes()
|
| 30 |
+
|
| 31 |
+
def _create_test_wav_bytes(self):
|
| 32 |
+
"""Create a simple WAV file as bytes for testing."""
|
| 33 |
+
# Create a simple sine wave WAV file
|
| 34 |
+
sample_rate = 44100
|
| 35 |
+
duration = 1 # 1 second
|
| 36 |
+
frequency = 440 # A4 note
|
| 37 |
+
|
| 38 |
+
# Generate sine wave samples
|
| 39 |
+
samples = []
|
| 40 |
+
for i in range(sample_rate * duration):
|
| 41 |
+
sample = int(
|
| 42 |
+
32767
|
| 43 |
+
* 0.3
|
| 44 |
+
* (1.0 if (i // (sample_rate // frequency // 2)) % 2 == 0 else -1.0)
|
| 45 |
+
)
|
| 46 |
+
samples.append(sample)
|
| 47 |
+
|
| 48 |
+
# Create WAV file in memory
|
| 49 |
+
wav_buffer = io.BytesIO()
|
| 50 |
+
with wave.open(wav_buffer, "wb") as wav_file:
|
| 51 |
+
wav_file.setnchannels(1) # Mono
|
| 52 |
+
wav_file.setsampwidth(2) # 2 bytes per sample
|
| 53 |
+
wav_file.setframerate(sample_rate)
|
| 54 |
+
wav_file.writeframes(struct.pack("<" + "h" * len(samples), *samples))
|
| 55 |
+
|
| 56 |
+
return wav_buffer.getvalue()
|
| 57 |
+
|
| 58 |
+
@patch("tools.transcription_tools.SpeechToTextTool")
|
| 59 |
+
def test_transcribe_audio_success(self, mock_speech_tool_class):
|
| 60 |
+
"""Test successful audio transcription."""
|
| 61 |
+
# Setup mock
|
| 62 |
+
mock_speech_tool = Mock()
|
| 63 |
+
mock_speech_tool.transcribe.return_value = (
|
| 64 |
+
"Hello, this is a test transcription."
|
| 65 |
+
)
|
| 66 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 67 |
+
|
| 68 |
+
# Test transcription
|
| 69 |
+
result = transcribe_audio(self.sample_audio_bytes)
|
| 70 |
+
|
| 71 |
+
# Assertions
|
| 72 |
+
self.assertEqual(result, "Hello, this is a test transcription.")
|
| 73 |
+
mock_speech_tool_class.assert_called_once()
|
| 74 |
+
mock_speech_tool.transcribe.assert_called_once_with(self.sample_audio_bytes)
|
| 75 |
+
|
| 76 |
+
@patch("tools.transcription_tools.SpeechToTextTool")
|
| 77 |
+
def test_transcribe_audio_empty_bytes(self, mock_speech_tool_class):
|
| 78 |
+
"""Test transcription with empty audio bytes."""
|
| 79 |
+
# Setup mock
|
| 80 |
+
mock_speech_tool = Mock()
|
| 81 |
+
mock_speech_tool.transcribe.return_value = ""
|
| 82 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 83 |
+
|
| 84 |
+
# Test transcription with empty bytes
|
| 85 |
+
result = transcribe_audio(b"")
|
| 86 |
+
|
| 87 |
+
# Assertions
|
| 88 |
+
self.assertEqual(result, "")
|
| 89 |
+
mock_speech_tool.transcribe.assert_called_once_with(b"")
|
| 90 |
+
|
| 91 |
+
@patch("tools.transcription_tools.SpeechToTextTool")
|
| 92 |
+
def test_transcribe_audio_tool_exception(self, mock_speech_tool_class):
|
| 93 |
+
"""Test transcription when SpeechToTextTool raises an exception."""
|
| 94 |
+
# Setup mock to raise exception
|
| 95 |
+
mock_speech_tool = Mock()
|
| 96 |
+
mock_speech_tool.transcribe.side_effect = Exception(
|
| 97 |
+
"Transcription service unavailable"
|
| 98 |
+
)
|
| 99 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 100 |
+
|
| 101 |
+
# Test that our function re-raises with a more descriptive message
|
| 102 |
+
with self.assertRaises(Exception) as context:
|
| 103 |
+
transcribe_audio(self.sample_audio_bytes)
|
| 104 |
+
|
| 105 |
+
self.assertIn("Failed to transcribe audio", str(context.exception))
|
| 106 |
+
self.assertIn("Transcription service unavailable", str(context.exception))
|
| 107 |
+
|
| 108 |
+
@patch("tools.transcription_tools.SpeechToTextTool")
|
| 109 |
+
def test_transcribe_audio_invalid_format(self, mock_speech_tool_class):
|
| 110 |
+
"""Test transcription with invalid audio format."""
|
| 111 |
+
# Setup mock to raise exception for invalid format
|
| 112 |
+
mock_speech_tool = Mock()
|
| 113 |
+
mock_speech_tool.transcribe.side_effect = Exception("Invalid audio format")
|
| 114 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 115 |
+
|
| 116 |
+
# Test with invalid audio data
|
| 117 |
+
invalid_audio = b"This is not audio data"
|
| 118 |
+
|
| 119 |
+
with self.assertRaises(Exception) as context:
|
| 120 |
+
transcribe_audio(invalid_audio)
|
| 121 |
+
|
| 122 |
+
self.assertIn("Failed to transcribe audio", str(context.exception))
|
| 123 |
+
self.assertIn("Invalid audio format", str(context.exception))
|
| 124 |
+
|
| 125 |
+
def test_transcribe_audio_function_signature(self):
|
| 126 |
+
"""Test that the function has the expected signature and documentation."""
|
| 127 |
+
# Check function exists and is callable
|
| 128 |
+
self.assertTrue(callable(transcribe_audio))
|
| 129 |
+
|
| 130 |
+
# Note: The @tool decorator may modify the function, so docstring and attributes
|
| 131 |
+
# may not be preserved in the usual way. This is expected behavior.
|
| 132 |
+
|
| 133 |
+
# Check if it's decorated as a smolagents tool (may not be detectable in all cases)
|
| 134 |
+
has_tool_attr = hasattr(transcribe_audio, "_smolagents_tool")
|
| 135 |
+
if has_tool_attr:
|
| 136 |
+
print("Function is properly decorated as a smolagents tool")
|
| 137 |
+
else:
|
| 138 |
+
print("Tool decoration may not be detectable (this is normal)")
|
| 139 |
+
|
| 140 |
+
# The function should at least be callable
|
| 141 |
+
self.assertTrue(callable(transcribe_audio))
|
| 142 |
+
|
| 143 |
+
@patch("tools.transcription_tools.SpeechToTextTool")
|
| 144 |
+
def test_transcribe_audio_with_various_formats_description(
|
| 145 |
+
self, mock_speech_tool_class
|
| 146 |
+
):
|
| 147 |
+
"""Test that transcription works with different audio formats (mocked)."""
|
| 148 |
+
# Setup mock
|
| 149 |
+
mock_speech_tool = Mock()
|
| 150 |
+
mock_speech_tool.transcribe.return_value = "Transcribed content"
|
| 151 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 152 |
+
|
| 153 |
+
# Test different "formats" (really just different byte content)
|
| 154 |
+
formats_to_test = [
|
| 155 |
+
(b"WAV_FILE_CONTENT", "WAV format"),
|
| 156 |
+
(b"MP3_FILE_CONTENT", "MP3 format"),
|
| 157 |
+
(b"OGG_FILE_CONTENT", "OGG format"),
|
| 158 |
+
]
|
| 159 |
+
|
| 160 |
+
for audio_bytes, format_name in formats_to_test:
|
| 161 |
+
with self.subTest(format=format_name):
|
| 162 |
+
result = transcribe_audio(audio_bytes)
|
| 163 |
+
self.assertEqual(result, "Transcribed content")
|
| 164 |
+
mock_speech_tool.transcribe.assert_called_with(audio_bytes)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def test_basic_functionality():
|
| 168 |
+
"""Basic integration test (without mocking)."""
|
| 169 |
+
print("Testing transcription tools basic functionality...")
|
| 170 |
+
|
| 171 |
+
try:
|
| 172 |
+
# Import the function to make sure it exists and imports work
|
| 173 |
+
from tools.transcription_tools import transcribe_audio
|
| 174 |
+
|
| 175 |
+
print("✅ Successfully imported transcribe_audio function")
|
| 176 |
+
|
| 177 |
+
# Check the function is decorated as a tool
|
| 178 |
+
if hasattr(transcribe_audio, "_smolagents_tool"):
|
| 179 |
+
print("✅ Function is properly decorated as a smolagents tool")
|
| 180 |
+
else:
|
| 181 |
+
print("⚠️ Function may not be properly decorated as a tool")
|
| 182 |
+
|
| 183 |
+
# Check docstring
|
| 184 |
+
if transcribe_audio.__doc__ and "audio_bytes" in transcribe_audio.__doc__:
|
| 185 |
+
print("✅ Function has proper documentation")
|
| 186 |
+
else:
|
| 187 |
+
print("⚠️ Function documentation may be incomplete")
|
| 188 |
+
|
| 189 |
+
return True
|
| 190 |
+
|
| 191 |
+
except ImportError as e:
|
| 192 |
+
print(f"❌ Import error: {e}")
|
| 193 |
+
return False
|
| 194 |
+
except Exception as e:
|
| 195 |
+
print(f"❌ Error: {e}")
|
| 196 |
+
return False
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
if __name__ == "__main__":
|
| 200 |
+
print("Audio Transcription Tools Test")
|
| 201 |
+
print("=" * 50)
|
| 202 |
+
|
| 203 |
+
# Run basic functionality test first
|
| 204 |
+
basic_success = test_basic_functionality()
|
| 205 |
+
print("-" * 50)
|
| 206 |
+
|
| 207 |
+
if basic_success:
|
| 208 |
+
# Run unit tests
|
| 209 |
+
print("Running unit tests...")
|
| 210 |
+
unittest.main(verbosity=2, exit=False)
|
| 211 |
+
else:
|
| 212 |
+
print("❌ Basic functionality test failed - skipping unit tests")
|
| 213 |
+
sys.exit(1)
|
tests/test_transcription_tools_standalone.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for audio transcription tools (standalone version)
|
| 4 |
+
|
| 5 |
+
This script tests the audio transcription functionality directly
|
| 6 |
+
without importing the full tools package.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
import io
|
| 12 |
+
import wave
|
| 13 |
+
import struct
|
| 14 |
+
import unittest
|
| 15 |
+
from unittest.mock import Mock, patch
|
| 16 |
+
|
| 17 |
+
# Add the parent directory to the path to import from tools
|
| 18 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_basic_functionality():
|
| 22 |
+
"""Basic integration test without importing the full tools package."""
|
| 23 |
+
print("Testing transcription tools basic functionality...")
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
# Import the function directly to make sure it exists and imports work
|
| 27 |
+
from tools.transcription_tools import transcribe_audio
|
| 28 |
+
|
| 29 |
+
print("✅ Successfully imported transcribe_audio function")
|
| 30 |
+
|
| 31 |
+
# Check the function is decorated as a tool
|
| 32 |
+
if hasattr(transcribe_audio, "_smolagents_tool"):
|
| 33 |
+
print("✅ Function is properly decorated as a smolagents tool")
|
| 34 |
+
else:
|
| 35 |
+
print("⚠️ Function may not be properly decorated as a tool")
|
| 36 |
+
|
| 37 |
+
# Check docstring
|
| 38 |
+
if transcribe_audio.__doc__ and "audio_bytes" in transcribe_audio.__doc__:
|
| 39 |
+
print("✅ Function has proper documentation")
|
| 40 |
+
else:
|
| 41 |
+
print("⚠️ Function documentation may be incomplete")
|
| 42 |
+
|
| 43 |
+
return True
|
| 44 |
+
|
| 45 |
+
except ImportError as e:
|
| 46 |
+
print(f"❌ Import error: {e}")
|
| 47 |
+
return False
|
| 48 |
+
except Exception as e:
|
| 49 |
+
print(f"❌ Error: {e}")
|
| 50 |
+
return False
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_transcribe_with_mock():
|
| 54 |
+
"""Test transcription function with mocked SpeechToTextTool."""
|
| 55 |
+
print("Testing transcription with mocked tool...")
|
| 56 |
+
|
| 57 |
+
try:
|
| 58 |
+
# Import the function
|
| 59 |
+
from tools.transcription_tools import transcribe_audio
|
| 60 |
+
|
| 61 |
+
# Create sample audio bytes (simple WAV file structure)
|
| 62 |
+
sample_rate = 44100
|
| 63 |
+
duration = 1
|
| 64 |
+
samples = []
|
| 65 |
+
for i in range(sample_rate * duration):
|
| 66 |
+
sample = int(
|
| 67 |
+
32767
|
| 68 |
+
* 0.3
|
| 69 |
+
* (1.0 if (i // (sample_rate // 440 // 2)) % 2 == 0 else -1.0)
|
| 70 |
+
)
|
| 71 |
+
samples.append(sample)
|
| 72 |
+
|
| 73 |
+
wav_buffer = io.BytesIO()
|
| 74 |
+
with wave.open(wav_buffer, "wb") as wav_file:
|
| 75 |
+
wav_file.setnchannels(1)
|
| 76 |
+
wav_file.setsampwidth(2)
|
| 77 |
+
wav_file.setframerate(sample_rate)
|
| 78 |
+
wav_file.writeframes(struct.pack("<" + "h" * len(samples), *samples))
|
| 79 |
+
|
| 80 |
+
sample_audio_bytes = wav_buffer.getvalue()
|
| 81 |
+
|
| 82 |
+
# Mock the SpeechToTextTool
|
| 83 |
+
with patch(
|
| 84 |
+
"tools.transcription_tools.SpeechToTextTool"
|
| 85 |
+
) as mock_speech_tool_class:
|
| 86 |
+
mock_speech_tool = Mock()
|
| 87 |
+
mock_speech_tool.transcribe.return_value = (
|
| 88 |
+
"Hello, this is a test transcription."
|
| 89 |
+
)
|
| 90 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 91 |
+
|
| 92 |
+
# Test transcription
|
| 93 |
+
result = transcribe_audio(sample_audio_bytes)
|
| 94 |
+
|
| 95 |
+
# Verify the result
|
| 96 |
+
if result == "Hello, this is a test transcription.":
|
| 97 |
+
print("✅ Transcription function returned expected result")
|
| 98 |
+
else:
|
| 99 |
+
print(f"⚠️ Unexpected result: {result}")
|
| 100 |
+
|
| 101 |
+
# Verify the mock was called correctly
|
| 102 |
+
mock_speech_tool_class.assert_called_once()
|
| 103 |
+
mock_speech_tool.transcribe.assert_called_once_with(sample_audio_bytes)
|
| 104 |
+
print("✅ SpeechToTextTool was called with correct parameters")
|
| 105 |
+
|
| 106 |
+
return True
|
| 107 |
+
|
| 108 |
+
except Exception as e:
|
| 109 |
+
print(f"❌ Error during mocked test: {e}")
|
| 110 |
+
return False
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def test_error_handling():
|
| 114 |
+
"""Test error handling in transcription function."""
|
| 115 |
+
print("Testing error handling...")
|
| 116 |
+
|
| 117 |
+
try:
|
| 118 |
+
from tools.transcription_tools import transcribe_audio
|
| 119 |
+
|
| 120 |
+
# Mock the SpeechToTextTool to raise an exception
|
| 121 |
+
with patch(
|
| 122 |
+
"tools.transcription_tools.SpeechToTextTool"
|
| 123 |
+
) as mock_speech_tool_class:
|
| 124 |
+
mock_speech_tool = Mock()
|
| 125 |
+
mock_speech_tool.transcribe.side_effect = Exception(
|
| 126 |
+
"Transcription service unavailable"
|
| 127 |
+
)
|
| 128 |
+
mock_speech_tool_class.return_value = mock_speech_tool
|
| 129 |
+
|
| 130 |
+
# Test that our function re-raises with a more descriptive message
|
| 131 |
+
try:
|
| 132 |
+
transcribe_audio(b"some_audio_data")
|
| 133 |
+
print("❌ Function should have raised an exception")
|
| 134 |
+
return False
|
| 135 |
+
except Exception as e:
|
| 136 |
+
if "Failed to transcribe audio" in str(
|
| 137 |
+
e
|
| 138 |
+
) and "Transcription service unavailable" in str(e):
|
| 139 |
+
print(
|
| 140 |
+
"✅ Function properly handles and re-raises exceptions with descriptive message"
|
| 141 |
+
)
|
| 142 |
+
return True
|
| 143 |
+
else:
|
| 144 |
+
print(f"⚠��� Exception message not as expected: {e}")
|
| 145 |
+
return False
|
| 146 |
+
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"❌ Error during error handling test: {e}")
|
| 149 |
+
return False
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
if __name__ == "__main__":
|
| 153 |
+
print("Audio Transcription Tools Test (Standalone)")
|
| 154 |
+
print("=" * 50)
|
| 155 |
+
|
| 156 |
+
# Run tests sequentially
|
| 157 |
+
tests = [
|
| 158 |
+
("Basic functionality", test_basic_functionality),
|
| 159 |
+
("Mocked transcription", test_transcribe_with_mock),
|
| 160 |
+
("Error handling", test_error_handling),
|
| 161 |
+
]
|
| 162 |
+
|
| 163 |
+
passed = 0
|
| 164 |
+
failed = 0
|
| 165 |
+
|
| 166 |
+
for test_name, test_func in tests:
|
| 167 |
+
print(f"\nRunning: {test_name}")
|
| 168 |
+
print("-" * 30)
|
| 169 |
+
try:
|
| 170 |
+
if test_func():
|
| 171 |
+
passed += 1
|
| 172 |
+
print(f"✅ {test_name} PASSED")
|
| 173 |
+
else:
|
| 174 |
+
failed += 1
|
| 175 |
+
print(f"❌ {test_name} FAILED")
|
| 176 |
+
except Exception as e:
|
| 177 |
+
failed += 1
|
| 178 |
+
print(f"❌ {test_name} FAILED with exception: {e}")
|
| 179 |
+
|
| 180 |
+
print("\n" + "=" * 50)
|
| 181 |
+
print(f"Test Results: {passed} passed, {failed} failed")
|
| 182 |
+
|
| 183 |
+
if failed == 0:
|
| 184 |
+
print("🎉 All tests passed!")
|
| 185 |
+
sys.exit(0)
|
| 186 |
+
else:
|
| 187 |
+
print("💥 Some tests failed!")
|
| 188 |
+
sys.exit(1)
|
tests/test_youtube_tools.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for YouTube video analysis tools
|
| 4 |
+
|
| 5 |
+
This script tests the YouTube video analysis functionality without running the full agent.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 12 |
+
|
| 13 |
+
from tools.youtube_tools import get_youtube_video_info, analyze_youtube_video
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_video_info():
|
| 17 |
+
"""Test getting video information without downloading."""
|
| 18 |
+
print("Testing video info retrieval...")
|
| 19 |
+
|
| 20 |
+
# Use a popular, short video for testing
|
| 21 |
+
test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ" # Rick Roll (classic!)
|
| 22 |
+
|
| 23 |
+
try:
|
| 24 |
+
result = get_youtube_video_info(test_url)
|
| 25 |
+
print("Video info result:")
|
| 26 |
+
print(result)
|
| 27 |
+
print("-" * 50)
|
| 28 |
+
return True
|
| 29 |
+
except Exception as e:
|
| 30 |
+
print(f"Error in video info test: {e}")
|
| 31 |
+
return False
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_video_analysis():
|
| 35 |
+
"""Test full video analysis with frame extraction."""
|
| 36 |
+
print("Testing video analysis with frame extraction...")
|
| 37 |
+
|
| 38 |
+
# Use a shorter video for testing to avoid long download times
|
| 39 |
+
test_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
|
| 40 |
+
|
| 41 |
+
try:
|
| 42 |
+
result = analyze_youtube_video(test_url, max_frames=3, interval_seconds=30.0)
|
| 43 |
+
print("Video analysis result:")
|
| 44 |
+
print(result)
|
| 45 |
+
print("-" * 50)
|
| 46 |
+
return True
|
| 47 |
+
except Exception as e:
|
| 48 |
+
print(f"Error in video analysis test: {e}")
|
| 49 |
+
return False
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
print("YouTube Video Analysis Tools Test")
|
| 54 |
+
print("=" * 50)
|
| 55 |
+
|
| 56 |
+
# Test 1: Video info
|
| 57 |
+
info_success = test_video_info()
|
| 58 |
+
|
| 59 |
+
# Test 2: Full analysis (commented out for now due to potential long execution time)
|
| 60 |
+
print(
|
| 61 |
+
"Skipping full video analysis test - uncomment in test_video_analysis() to run"
|
| 62 |
+
)
|
| 63 |
+
analysis_success = True # test_video_analysis()
|
| 64 |
+
|
| 65 |
+
print("=" * 50)
|
| 66 |
+
if info_success and analysis_success:
|
| 67 |
+
print("✅ All tests passed!")
|
| 68 |
+
else:
|
| 69 |
+
print("❌ Some tests failed!")
|
| 70 |
+
sys.exit(1)
|
tools/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tools package for the Agents Course Final Assignment
|
| 3 |
+
|
| 4 |
+
This package contains custom tools for the agent, including YouTube video analysis
|
| 5 |
+
and audio transcription capabilities.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from .youtube_tools import analyze_youtube_video, get_youtube_video_info
|
| 9 |
+
from .transcription_tools import transcribe_audio
|
| 10 |
+
|
| 11 |
+
__all__ = ["analyze_youtube_video", "get_youtube_video_info", "transcribe_audio"]
|
tools/transcription_tools.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio transcription tools for the Agents Course Final Assignment
|
| 3 |
+
|
| 4 |
+
This module provides tools for transcribing audio files to text.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from smolagents import SpeechToTextTool, tool
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@tool
|
| 11 |
+
def transcribe_audio(audio_bytes: bytes) -> str:
|
| 12 |
+
"""
|
| 13 |
+
Given an audio file (bytes), return the transcription (text).
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
audio_bytes: Raw bytes of the audio file to transcribe. Can be the full contents
|
| 17 |
+
of a WAV/MP3/OGG file or other common audio container. The function should
|
| 18 |
+
accept bytes and return the recognized text as a string.
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
str: The transcribed text from the audio file.
|
| 22 |
+
|
| 23 |
+
Raises:
|
| 24 |
+
Exception: If transcription fails due to invalid audio format or other errors.
|
| 25 |
+
"""
|
| 26 |
+
try:
|
| 27 |
+
speech_tool = SpeechToTextTool()
|
| 28 |
+
transcription = speech_tool.transcribe(audio_bytes)
|
| 29 |
+
return transcription
|
| 30 |
+
except Exception as e:
|
| 31 |
+
raise Exception(f"Failed to transcribe audio: {str(e)}")
|
tools/youtube_video_analyzer.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
YouTube Video Frame Analysis Tool
|
| 3 |
+
|
| 4 |
+
This tool uses yt-dlp to download YouTube videos and extract frames for image analysis.
|
| 5 |
+
It can analyze frames at specified intervals and provide descriptions of video content.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import tempfile
|
| 10 |
+
import subprocess
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from typing import List, Dict, Any, Optional
|
| 13 |
+
import json
|
| 14 |
+
import cv2
|
| 15 |
+
import numpy as np
|
| 16 |
+
from PIL import Image
|
| 17 |
+
import base64
|
| 18 |
+
import io
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def extract_video_frames(
|
| 22 |
+
video_url: str,
|
| 23 |
+
max_frames: int = 10,
|
| 24 |
+
interval_seconds: float = 30.0,
|
| 25 |
+
frame_width: int = 512,
|
| 26 |
+
) -> List[str]:
|
| 27 |
+
"""
|
| 28 |
+
Extract frames from a YouTube video at specified intervals.
|
| 29 |
+
|
| 30 |
+
Args:
|
| 31 |
+
video_url: YouTube video URL
|
| 32 |
+
max_frames: Maximum number of frames to extract
|
| 33 |
+
interval_seconds: Interval between frame extractions in seconds
|
| 34 |
+
frame_width: Width to resize frames (maintains aspect ratio)
|
| 35 |
+
|
| 36 |
+
Returns:
|
| 37 |
+
List of base64-encoded frame images
|
| 38 |
+
"""
|
| 39 |
+
frames = []
|
| 40 |
+
|
| 41 |
+
with tempfile.TemporaryDirectory() as temp_dir:
|
| 42 |
+
# Download video using yt-dlp
|
| 43 |
+
video_path = os.path.join(temp_dir, "video.%(ext)s")
|
| 44 |
+
|
| 45 |
+
ydl_opts = {
|
| 46 |
+
"format": "best[height<=720]/best", # Limit quality to reduce download time
|
| 47 |
+
"outtmpl": video_path,
|
| 48 |
+
"quiet": True,
|
| 49 |
+
"no_warnings": True,
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
try:
|
| 53 |
+
import yt_dlp
|
| 54 |
+
|
| 55 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 56 |
+
ydl.download([video_url])
|
| 57 |
+
|
| 58 |
+
# Find the downloaded video file
|
| 59 |
+
video_files = list(Path(temp_dir).glob("video.*"))
|
| 60 |
+
if not video_files:
|
| 61 |
+
raise Exception("No video file found after download")
|
| 62 |
+
|
| 63 |
+
actual_video_path = str(video_files[0])
|
| 64 |
+
|
| 65 |
+
# Extract frames using OpenCV
|
| 66 |
+
cap = cv2.VideoCapture(actual_video_path)
|
| 67 |
+
|
| 68 |
+
if not cap.isOpened():
|
| 69 |
+
raise Exception("Could not open video file")
|
| 70 |
+
|
| 71 |
+
fps = cap.get(cv2.CAP_PROP_FPS)
|
| 72 |
+
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
| 73 |
+
duration = total_frames / fps if fps > 0 else 0
|
| 74 |
+
|
| 75 |
+
frame_interval = int(fps * interval_seconds)
|
| 76 |
+
frame_count = 0
|
| 77 |
+
extracted_count = 0
|
| 78 |
+
|
| 79 |
+
while extracted_count < max_frames:
|
| 80 |
+
ret, frame = cap.read()
|
| 81 |
+
if not ret:
|
| 82 |
+
break
|
| 83 |
+
|
| 84 |
+
if frame_count % frame_interval == 0:
|
| 85 |
+
# Resize frame
|
| 86 |
+
height, width = frame.shape[:2]
|
| 87 |
+
aspect_ratio = width / height
|
| 88 |
+
new_width = frame_width
|
| 89 |
+
new_height = int(frame_width / aspect_ratio)
|
| 90 |
+
|
| 91 |
+
resized_frame = cv2.resize(frame, (new_width, new_height))
|
| 92 |
+
|
| 93 |
+
# Convert BGR to RGB
|
| 94 |
+
rgb_frame = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
|
| 95 |
+
|
| 96 |
+
# Convert to PIL Image
|
| 97 |
+
pil_image = Image.fromarray(rgb_frame)
|
| 98 |
+
|
| 99 |
+
# Convert to base64
|
| 100 |
+
buffer = io.BytesIO()
|
| 101 |
+
pil_image.save(buffer, format="JPEG", quality=85)
|
| 102 |
+
img_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
|
| 103 |
+
|
| 104 |
+
timestamp = frame_count / fps
|
| 105 |
+
frames.append(
|
| 106 |
+
{
|
| 107 |
+
"timestamp": timestamp,
|
| 108 |
+
"image_base64": img_base64,
|
| 109 |
+
"frame_number": frame_count,
|
| 110 |
+
}
|
| 111 |
+
)
|
| 112 |
+
extracted_count += 1
|
| 113 |
+
|
| 114 |
+
frame_count += 1
|
| 115 |
+
|
| 116 |
+
cap.release()
|
| 117 |
+
|
| 118 |
+
except Exception as e:
|
| 119 |
+
raise Exception(f"Error processing video: {str(e)}")
|
| 120 |
+
|
| 121 |
+
return frames
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def analyze_frame_with_description(
|
| 125 |
+
frame_data: Dict[str, Any],
|
| 126 |
+
analysis_prompt: str = "Describe what you see in this image in detail.",
|
| 127 |
+
) -> Dict[str, Any]:
|
| 128 |
+
"""
|
| 129 |
+
Analyze a single frame using image analysis.
|
| 130 |
+
|
| 131 |
+
Args:
|
| 132 |
+
frame_data: Dictionary containing frame information and base64 image
|
| 133 |
+
analysis_prompt: Prompt for image analysis
|
| 134 |
+
|
| 135 |
+
Returns:
|
| 136 |
+
Dictionary with analysis results
|
| 137 |
+
"""
|
| 138 |
+
try:
|
| 139 |
+
# For now, return a placeholder analysis
|
| 140 |
+
# In a real implementation, this would use an image analysis model
|
| 141 |
+
analysis = {
|
| 142 |
+
"timestamp": frame_data["timestamp"],
|
| 143 |
+
"frame_number": frame_data["frame_number"],
|
| 144 |
+
"description": f"Frame at {frame_data['timestamp']:.1f}s - Image analysis would be performed here",
|
| 145 |
+
"image_base64": (
|
| 146 |
+
frame_data["image_base64"][:100] + "..."
|
| 147 |
+
if len(frame_data["image_base64"]) > 100
|
| 148 |
+
else frame_data["image_base64"]
|
| 149 |
+
),
|
| 150 |
+
}
|
| 151 |
+
return analysis
|
| 152 |
+
except Exception as e:
|
| 153 |
+
return {
|
| 154 |
+
"timestamp": frame_data.get("timestamp", 0),
|
| 155 |
+
"frame_number": frame_data.get("frame_number", 0),
|
| 156 |
+
"error": str(e),
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def get_video_metadata(video_url: str) -> Dict[str, Any]:
|
| 161 |
+
"""
|
| 162 |
+
Get metadata for a YouTube video without downloading it.
|
| 163 |
+
|
| 164 |
+
Args:
|
| 165 |
+
video_url: YouTube video URL
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
Dictionary containing video metadata
|
| 169 |
+
"""
|
| 170 |
+
try:
|
| 171 |
+
import yt_dlp
|
| 172 |
+
|
| 173 |
+
ydl_opts = {
|
| 174 |
+
"quiet": True,
|
| 175 |
+
"no_warnings": True,
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
| 179 |
+
info = ydl.extract_info(video_url, download=False)
|
| 180 |
+
|
| 181 |
+
metadata = {
|
| 182 |
+
"title": info.get("title", "Unknown"),
|
| 183 |
+
"duration": info.get("duration", 0),
|
| 184 |
+
"uploader": info.get("uploader", "Unknown"),
|
| 185 |
+
"view_count": info.get("view_count", 0),
|
| 186 |
+
"upload_date": info.get("upload_date", "Unknown"),
|
| 187 |
+
"description": (
|
| 188 |
+
info.get("description", "")[:500] + "..."
|
| 189 |
+
if info.get("description", "")
|
| 190 |
+
else "No description"
|
| 191 |
+
),
|
| 192 |
+
"width": info.get("width", 0),
|
| 193 |
+
"height": info.get("height", 0),
|
| 194 |
+
}
|
| 195 |
+
|
| 196 |
+
return metadata
|
| 197 |
+
|
| 198 |
+
except Exception as e:
|
| 199 |
+
return {"error": f"Failed to get video metadata: {str(e)}"}
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def analyze_youtube_video_frames(
|
| 203 |
+
video_url: str,
|
| 204 |
+
max_frames: int = 8,
|
| 205 |
+
interval_seconds: float = 30.0,
|
| 206 |
+
include_metadata: bool = True,
|
| 207 |
+
analysis_prompt: str = "Describe what you see in this image.",
|
| 208 |
+
) -> Dict[str, Any]:
|
| 209 |
+
"""
|
| 210 |
+
Complete pipeline to analyze frames from a YouTube video.
|
| 211 |
+
|
| 212 |
+
Args:
|
| 213 |
+
video_url: YouTube video URL
|
| 214 |
+
max_frames: Maximum number of frames to extract and analyze
|
| 215 |
+
interval_seconds: Interval between frame extractions
|
| 216 |
+
include_metadata: Whether to include video metadata
|
| 217 |
+
analysis_prompt: Prompt for frame analysis
|
| 218 |
+
|
| 219 |
+
Returns:
|
| 220 |
+
Dictionary with complete analysis results
|
| 221 |
+
"""
|
| 222 |
+
results = {
|
| 223 |
+
"video_url": video_url,
|
| 224 |
+
"extraction_settings": {
|
| 225 |
+
"max_frames": max_frames,
|
| 226 |
+
"interval_seconds": interval_seconds,
|
| 227 |
+
},
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
try:
|
| 231 |
+
# Get video metadata
|
| 232 |
+
if include_metadata:
|
| 233 |
+
print("Fetching video metadata...")
|
| 234 |
+
results["metadata"] = get_video_metadata(video_url)
|
| 235 |
+
|
| 236 |
+
# Extract frames
|
| 237 |
+
print(f"Extracting up to {max_frames} frames from video...")
|
| 238 |
+
frames = extract_video_frames(video_url, max_frames, interval_seconds)
|
| 239 |
+
|
| 240 |
+
if not frames:
|
| 241 |
+
results["error"] = "No frames could be extracted from the video"
|
| 242 |
+
return results
|
| 243 |
+
|
| 244 |
+
print(f"Successfully extracted {len(frames)} frames")
|
| 245 |
+
|
| 246 |
+
# Analyze each frame
|
| 247 |
+
print("Analyzing extracted frames...")
|
| 248 |
+
frame_analyses = []
|
| 249 |
+
for i, frame_data in enumerate(frames):
|
| 250 |
+
print(f"Analyzing frame {i+1}/{len(frames)}...")
|
| 251 |
+
analysis = analyze_frame_with_description(frame_data, analysis_prompt)
|
| 252 |
+
frame_analyses.append(analysis)
|
| 253 |
+
|
| 254 |
+
results["frames_analyzed"] = len(frame_analyses)
|
| 255 |
+
results["frame_analyses"] = frame_analyses
|
| 256 |
+
results["success"] = True
|
| 257 |
+
|
| 258 |
+
# Generate summary
|
| 259 |
+
if results.get("metadata"):
|
| 260 |
+
duration_str = f"{results['metadata']['duration']//60}:{results['metadata']['duration']%60:02d}"
|
| 261 |
+
results["summary"] = (
|
| 262 |
+
f"Analyzed {len(frame_analyses)} frames from '{results['metadata']['title']}' "
|
| 263 |
+
f"(Duration: {duration_str}) at {interval_seconds}s intervals."
|
| 264 |
+
)
|
| 265 |
+
else:
|
| 266 |
+
results["summary"] = (
|
| 267 |
+
f"Analyzed {len(frame_analyses)} frames from video at {interval_seconds}s intervals."
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
except Exception as e:
|
| 271 |
+
results["error"] = f"Video analysis failed: {str(e)}"
|
| 272 |
+
results["success"] = False
|
| 273 |
+
|
| 274 |
+
return results
|