{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "cdbd4c42-21fc-4796-8161-8b80e7b310c7", "metadata": {}, "outputs": [], "source": [ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", "from langchain_chroma import Chroma\n", "from langchain_huggingface import HuggingFaceEmbeddings\n", "from bytez import Bytez\n", "from youtube_transcript_api import YouTubeTranscriptApi\n", "import gradio as gr\n", "from dotenv import load_dotenv\n", "import os\n", "from urllib.parse import urlparse, parse_qs" ] }, { "cell_type": "code", "execution_count": 2, "id": "d1943b98-7f49-43fb-81cc-cae13781b4b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_dotenv(\"secrets.env\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "e8ad246a-8136-4916-a086-eb28ad34a958", "metadata": {}, "outputs": [], "source": [ "api_key = os.getenv(\"BYTEZ_API_KEY\")\n", "sdk = Bytez(api_key)" ] }, { "cell_type": "code", "execution_count": 4, "id": "6d9a1500-148a-4ff3-a1ec-81e1faaa3bfc", "metadata": {}, "outputs": [], "source": [ "#toy function\n", "def video_id_extractor(link):\n", " if \"watch?v=\" in link:\n", " return link[32:43]\n", " else:\n", " return link[17:28]" ] }, { "cell_type": "code", "execution_count": 5, "id": "313a5521-41b6-4453-a26b-b2142f7992c0", "metadata": {}, "outputs": [], "source": [ "#production ready function\n", "def video_id_extractor(link):\n", " parsed_url = urlparse(link)\n", " \n", " if \"youtube.com\" in parsed_url.netloc:\n", " return parse_qs(parsed_url.query).get(\"v\", [None])[0]\n", " \n", " elif \"youtu.be\" in parsed_url.netloc:\n", " return parsed_url.path.lstrip(\"/\")\n", " \n", " return None" ] }, { "cell_type": "code", "execution_count": 6, "id": "ef1b022a-b9b9-481d-bbc8-df9cae7137b1", "metadata": {}, "outputs": [], "source": [ "def generate_transcript(video_id):\n", " trans = YouTubeTranscriptApi()\n", " try:\n", " transcript_raw = trans.fetch(video_id = video_id)\n", " except Exception:\n", " return None\n", " transcript = \"\"\n", " for i in transcript_raw.snippets:\n", " transcript += f\" {i.text}\"\n", " return transcript" ] }, { "cell_type": "code", "execution_count": 7, "id": "052cfe69-ff3e-40f5-b574-00f1df69446d", "metadata": {}, "outputs": [], "source": [ "def create_and_save_vs(trans):\n", " try:\n", " splitter = RecursiveCharacterTextSplitter(chunk_size = 100, chunk_overlap = 50)\n", " docs = splitter.split_text(trans)\n", " embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')\n", " vector_store_db = Chroma.from_texts(docs, embeddings, persist_directory='chroma_db')\n", " except Exception:\n", " return None\n", " return vector_store_db" ] }, { "cell_type": "code", "execution_count": 19, "id": "b6e02c40-008d-46af-8728-96f8f359cc75", "metadata": {}, "outputs": [], "source": [ "def generate_summary(trans):\n", " try: \n", " model = sdk.model(\"openai/gpt-4o\")\n", " if len(trans.split(\" \")) > 90000:\n", " trans = trans.split(\" \")[0:85000]\n", " trans = \" \".join(trans)\n", " except Exception:\n", " return None\n", " Inp = [{\"role\": \"system\", \"content\": \"You are a youtube transcipt sammurizer. Sammurize the transcript under 100 words\"}, {\"role\":\"user\", \"content\":trans}]\n", " trails = 4\n", " failed = True\n", " time_to_sleep = 3\n", " while failed and trails > 0:\n", " res = model.run(Inp)\n", " if type(res) == list and len(res) == 3:\n", " failed = False\n", " trails -= 1\n", " return res[0][\"content\"]\n", " else:\n", " time.sleep(time_to_sleep)\n", " time_to_sleep = time_to_sleep **2\n", " trails -= 1\n", " return None\n", " " ] }, { "cell_type": "code", "execution_count": 10, "id": "07a21fd9-7c21-4349-9b0b-eccc84a6c243", "metadata": {}, "outputs": [], "source": [ "def setter(link):\n", " yield gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \"\", \"\"\n", " video_id = video_id_extractor(link)\n", " if not video_id:\n", " yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), \"\", \"\"\n", " transcript = generate_transcript(video_id)\n", " if not transcript:\n", " yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), \"\", \"\"\n", " vectorstore = create_and_save_vs(transcript)\n", " if not vectorstore:\n", " yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), \"\", \"\"\n", " summary = generate_summary(transcript)\n", " if not summary:\n", " yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), \"\", \"\"\n", " yield gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), summary, vectorstore" ] }, { "cell_type": "code", "execution_count": 11, "id": "6acf3a97-a4a6-4e56-aa77-578dfdb0a4e8", "metadata": {}, "outputs": [], "source": [ "def execute(vec, query):\n", " try:\n", " res = vec.similarity_search(query, k=3)\n", " result = \"\"\n", " for i in res:\n", " result += f\"\\n{i.page_content}\"\n", " model = sdk.model(\"openai/gpt-4o\")\n", " inp = [{\"role\": \"system\", \"content\": \"You are a helpful assistant - you will be asked a query and provided with a context. You have to answer that query based on the provided context - do not make things up. Do not reveal the whole context, answer as like you already knew the context\"}, {\"role\":\"user\", \"content\":f\"query: {query} | context: {result}\"}]\n", " res = model.run(inp)\n", " return res[0]['content'], gr.update(visible=True), gr.update(visible=False)\n", " except Exception:\n", " return \"\", gr.update(visible=False), gr.update(visible=True)" ] }, { "cell_type": "code", "execution_count": 21, "id": "0c0ad086-1de1-4a88-92cf-ad442ef9cb0f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "* Running on local URL: http://127.0.0.1:7865\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "Transform lengthy videos into concise knowledge
\n", "