Agents_Course_Final_Assignment

Sleeping

App Files Files Community

Gary Simmons committed 17 days ago

Commit

eec60f5

1 Parent(s): c829b8b

add file fetching and parsing utilities for GAIA validation tasks and update requirements.txt

Browse files

Files changed (3) hide show

app.py +81 -27
libs/questionHelper/file_tools.py +490 -0
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ import time
 import threading
 import random
 from litellm import RateLimitError
-import os
 from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
@@ -16,6 +15,7 @@ from smolagents import (
     SpeechToTextTool,
     LiteLLMModel,
 )
 from libs.chess.chess_tools import analyze_chess_image, analyze_chess_position
 from libs.transcription.transcription_tools import transcribe_audio
 from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_info
@@ -24,6 +24,8 @@ from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 # --- Basic Agent Definition ---
@@ -194,7 +196,7 @@ class BasicAgent:
                 analyze_youtube_video,
                 get_youtube_video_info,
                 analyze_chess_position,
-                analyze_chess_image
             ],
             model=model,
             max_steps=20,
@@ -246,7 +248,7 @@ class BasicAgent:
             return f"AGENT ERROR: {e}"
 def run_and_submit_all(profile: gr.OAuthProfile | None):
@@ -255,10 +257,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
-    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
     if profile:
-        username= f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -286,16 +288,16 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
@@ -310,13 +312,62 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
         except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
@@ -398,20 +449,19 @@ with gr.Blocks() as demo:
     run_button = gr.Button("Run Evaluation & Submit All Answers")
-    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
     # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
-    )
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
     if space_host_startup:
         print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -419,14 +469,18 @@ if __name__ == "__main__":
     else:
         print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
-    if space_id_startup: # Print repo URLs if SPACE_ID is found
         print(f"✅ SPACE_ID found: {space_id_startup}")
         print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
     else:
-        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
-    print("-"*(60 + len(" App Starting ")) + "\n")
     print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)

 import threading
 import random
 from litellm import RateLimitError
 from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
     SpeechToTextTool,
     LiteLLMModel,
 )
+from libs.questionHelper.file_tools import fetch_task_files
 from libs.chess.chess_tools import analyze_chess_image, analyze_chess_position
 from libs.transcription.transcription_tools import transcribe_audio
 from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_info
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+FILES_AVAILABLE_PREFIX = "FILES_AVAILABLE: "
+FILES_AVAILABLE_SUFFIX = "\n\n"
 # --- Basic Agent Definition ---
                 analyze_youtube_video,
                 get_youtube_video_info,
                 analyze_chess_position,
+                analyze_chess_image,
             ],
             model=model,
             max_steps=20,
             return f"AGENT ERROR: {e}"
+CACHE_DIR = "cache/gaia_validation"
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
+    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code
     if profile:
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
+        # Fetch any associated files from GAIA validation (if present) and prepend a brief summary to the question
         try:
+            try:
+                file_results = fetch_task_files(
+                    task_id, dest_dir=CACHE_DIR, transcribe_mp3=False
+                )
+            except Exception as e:
+                print(f"Warning: failed to fetch files for {task_id}: {e}")
+                file_results = {}
+            # Build a compact file summary for the agent prompt
+            file_summaries = []
+            for ext, info in (file_results or {}).items():
+                status = info.get("status")
+                path = info.get("path")
+                if status == "ok" and path:
+                    file_summaries.append(f"{ext}=OK@{path}")
+                else:
+                    file_summaries.append(f"{ext}={status}")
+            files_note = (
+                ""
+                if not file_summaries
+                else (
+                    FILES_AVAILABLE_PREFIX
+                    + "; ".join(file_summaries)
+                    + FILES_AVAILABLE_SUFFIX
+                )
+            )
+            prompt_with_files = files_note + question_text
+            submitted_answer = agent(prompt_with_files)
+            answers_payload.append(
+                {"task_id": task_id, "submitted_answer": submitted_answer}
+            )
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": submitted_answer,
+                }
+            )
         except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            error_answer = f"AGENT ERROR: {e}"
+            answers_payload.append(
+                {"task_id": task_id, "submitted_answer": error_answer}
+            )
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": error_answer,
+                }
+            )
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
     run_button = gr.Button("Run Evaluation & Submit All Answers")
+    status_output = gr.Textbox(
+        label="Run Status / Submission Result", lines=5, interactive=False
+    )
     # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 if __name__ == "__main__":
+    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
+    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup
     if space_host_startup:
         print(f"✅ SPACE_HOST found: {space_host_startup}")
     else:
         print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
+    if space_id_startup:  # Print repo URLs if SPACE_ID is found
         print(f"✅ SPACE_ID found: {space_id_startup}")
         print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+        print(
+            f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
+        )
     else:
+        print(
+            "ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
+        )
+    print("-" * (60 + len(" App Starting ")) + "\n")
     print("Launching Gradio Interface for Basic Agent Evaluation...")
+    demo.launch(debug=True, share=False)

libs/questionHelper/file_tools.py ADDED Viewed

	@@ -0,0 +1,490 @@

+"""
+file_tools.py
+Helpers to fetch files from the GAIA validation folder on Hugging Face by task_id
+and to normalize/parse the common file types found there.
+Public API:
+  fetch_task_files(task_id, dest_dir='cache/gaia_validation', transcribe_mp3=False, mp3_model='small', image_ocr=False)
+Returns a dict mapping extension -> { status: 'ok'|'miss'|'error', path: str|None, content: parsed-object-or-None }
+Supported extensions (with lazy imports): txt, py, xlsx, mp3, pdf, jpg, png, pdb, csv, zip, docx, jsonld
+This module uses lazy imports for optional heavy dependencies and provides informative errors
+when a handler is requested but the dependency is not installed.
+"""
+from pathlib import Path
+from typing import Dict, Any
+import requests
+import json
+import io
+import zipfile
+# Base URL that fetch_task_files joins with "<task_id>.<ext>" to build each download URL.
+HF_BASE_RESOLVE = (
+    "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation"
+)
+# Request headers sent by _download with every GET.
+HEADERS = {"User-Agent": "gaia-task-fetcher/1.0"}
+# full list of extensions we expect to encounter
+# fetch_task_files probes each of these, in this order, for every task_id.
+EXTENSIONS = [
+    "txt",
+    "py",
+    "xlsx",
+    "mp3",
+    "pdf",
+    "jpg",
+    "png",
+    "pdb",
+    "csv",
+    "zip",
+    "docx",
+    "jsonld",
+]
+# lazy imports containers
+# Each slot starts as None and is filled by the matching _ensure_* helper on first use,
+# so importing this module does not require any of the optional packages.
+_pd = None
+_whisper = None
+_pypdf2 = None
+_pil = None
+_pytesseract = None
+_docx = None
+_rdflib = None
+# helpers for lazy import
+def _ensure_pandas():
+    """Return the pandas module, importing it on first use.
+
+    The imported module is cached in the module-level ``_pd`` slot so later
+    calls skip the import.
+
+    Raises:
+        RuntimeError: if pandas cannot be imported.
+    """
+    global _pd
+    if _pd is not None:
+        return _pd
+    try:
+        import pandas
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "pandas (and openpyxl) are required to read xlsx/csv files. Install with `pip install pandas openpyxl`"
+        ) from exc
+    _pd = pandas
+    return _pd
+def _ensure_whisper():
+    """Return the whisper module, importing it on first use.
+
+    The imported module is cached in the module-level ``_whisper`` slot.
+
+    Raises:
+        RuntimeError: if the whisper package cannot be imported.
+    """
+    global _whisper
+    if _whisper is not None:
+        return _whisper
+    try:
+        import whisper as whisper_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "whisper package not available. Install `pip install -U openai-whisper` and ensure ffmpeg is installed on PATH"
+        ) from exc
+    _whisper = whisper_module
+    return _whisper
+def _ensure_pypdf2():
+    """Return the PyPDF2 module, importing it on first use.
+
+    The imported module is cached in the module-level ``_pypdf2`` slot.
+
+    Raises:
+        RuntimeError: if PyPDF2 cannot be imported.
+    """
+    global _pypdf2
+    if _pypdf2 is not None:
+        return _pypdf2
+    try:
+        import PyPDF2 as pdf_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "PyPDF2 required to read pdf files. Install with `pip install PyPDF2`"
+        ) from exc
+    _pypdf2 = pdf_module
+    return _pypdf2
+def _ensure_pil():
+    """Return Pillow's ``PIL.Image`` module, importing it on first use.
+
+    Note that the cached object in ``_pil`` is the ``Image`` submodule, not the
+    top-level ``PIL`` package.
+
+    Raises:
+        RuntimeError: if Pillow cannot be imported.
+    """
+    global _pil
+    if _pil is not None:
+        return _pil
+    try:
+        from PIL import Image as image_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "Pillow required to read image files. Install with `pip install Pillow`"
+        ) from exc
+    _pil = image_module
+    return _pil
+def _ensure_pytesseract():
+    """Return the pytesseract module, importing it on first use.
+
+    The imported module is cached in the module-level ``_pytesseract`` slot.
+
+    Raises:
+        RuntimeError: if pytesseract cannot be imported.
+    """
+    global _pytesseract
+    if _pytesseract is not None:
+        return _pytesseract
+    try:
+        import pytesseract as ocr_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "pytesseract required for OCR. Install with `pip install pytesseract` and ensure tesseract binary is available on PATH"
+        ) from exc
+    _pytesseract = ocr_module
+    return _pytesseract
+def _ensure_docx():
+    """Return the ``docx`` module (python-docx), importing it on first use.
+
+    The imported module is cached in the module-level ``_docx`` slot.
+
+    Raises:
+        RuntimeError: if the ``docx`` module cannot be imported.
+    """
+    global _docx
+    if _docx is not None:
+        return _docx
+    try:
+        import docx as docx_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        raise RuntimeError(
+            "python-docx required to read .docx files. Install with `pip install python-docx`"
+        ) from exc
+    _docx = docx_module
+    return _docx
+def _ensure_rdflib():
+    """Return the rdflib module, importing it on first use.
+
+    The imported module is cached in the module-level ``_rdflib`` slot.
+
+    Raises:
+        RuntimeError: if rdflib cannot be imported.
+    """
+    global _rdflib
+    if _rdflib is not None:
+        return _rdflib
+    try:
+        import rdflib as rdf_module
+    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
+        # Plain json is enough to load JSON-LD; rdflib is only needed for expansion.
+        raise RuntimeError(
+            "rdflib required for advanced jsonld handling. Install with `pip install rdflib`"
+        ) from exc
+    _rdflib = rdf_module
+    return _rdflib
+def _download(url: str, dest_path: Path, timeout=30) -> bool:
+    """Download ``url`` to ``dest_path``.
+
+    The body is streamed into a sibling ``<name>.part`` file and moved into
+    place only after the whole body has been written. An interrupted transfer
+    therefore never leaves a truncated file at ``dest_path``, which matters
+    because callers treat an existing non-empty file as a cache hit.
+
+    Args:
+        url: URL to fetch.
+        dest_path: final location of the downloaded file; parent directories
+            are created as needed.
+        timeout: timeout in seconds passed to ``requests.get``.
+
+    Returns:
+        True if the file was downloaded, False on 404 or any other non-200 status.
+
+    Raises:
+        requests.RequestException: on connection or transfer errors.
+        OSError: if the file cannot be written.
+    """
+    # With stream=True the connection stays open until the body is consumed or
+    # the response is closed; the context manager releases it on every path,
+    # including the early return for non-200 responses.
+    with requests.get(url, headers=HEADERS, timeout=timeout, stream=True) as resp:
+        if resp.status_code != 200:
+            return False
+        dest_path.parent.mkdir(parents=True, exist_ok=True)
+        partial_path = dest_path.with_name(dest_path.name + ".part")
+        try:
+            with open(partial_path, "wb") as f:
+                for chunk in resp.iter_content(1024 * 64):
+                    if chunk:
+                        f.write(chunk)
+            partial_path.replace(dest_path)
+        finally:
+            # After a successful replace() the partial file no longer exists;
+            # on any failure this removes the incomplete download.
+            if partial_path.exists():
+                partial_path.unlink()
+    return True
+# simple readers/parsers
+def _read_txt(path: Path) -> str:
+    """Return the file's text decoded as UTF-8, replacing undecodable bytes."""
+    with path.open("r", encoding="utf-8", errors="replace") as handle:
+        return handle.read()
+def _read_py(path: Path) -> str:
+    """Return the Python source file's text decoded as UTF-8, replacing undecodable bytes."""
+    with path.open("r", encoding="utf-8", errors="replace") as handle:
+        return handle.read()
+def _read_xlsx(path: Path):
+    """Read every sheet of an Excel workbook.
+
+    Tries the openpyxl engine first; if that fails, prints a warning and
+    retries with pandas' default engine selection.
+
+    Returns:
+        Whatever ``pandas.read_excel(..., sheet_name=None)`` returns, i.e. a
+        dict mapping sheet name to DataFrame.
+
+    Raises:
+        RuntimeError: if pandas is unavailable or both read attempts fail.
+    """
+    pandas = _ensure_pandas()
+    try:
+        sheets = pandas.read_excel(path, sheet_name=None, engine="openpyxl")
+    except Exception as first_error:
+        print(f"Warning: Failed to read {path} with openpyxl engine: {first_error}. Falling back to default engine.")
+        try:
+            sheets = pandas.read_excel(path, sheet_name=None)
+        except Exception as second_error:
+            raise RuntimeError(f"Failed to read {path} with both openpyxl and default engine: {second_error}") from second_error
+    return sheets
+def _read_csv(path: Path):
+    """Parse a CSV file with ``pandas.read_csv`` using its default options.
+
+    Raises:
+        RuntimeError: if pandas is unavailable.
+    """
+    return _ensure_pandas().read_csv(path)
+def _read_pdf(path: Path) -> str:
+    """Extract the text of every page of a PDF.
+
+    Pages whose extraction raises, or that yield no text, are skipped. The
+    remaining page texts are joined with a blank line between them.
+
+    Raises:
+        RuntimeError: if PyPDF2 is unavailable, or (prefixed with
+            ``pdf-read-error``) if the file cannot be opened or parsed.
+    """
+    pdf_lib = _ensure_pypdf2()
+
+    def _page_text(page):
+        # A single unreadable page must not abort the whole document.
+        try:
+            return page.extract_text()
+        except Exception:
+            return None
+
+    try:
+        with open(path, "rb") as stream:
+            extracted = [_page_text(page) for page in pdf_lib.PdfReader(stream).pages]
+    except Exception as exc:
+        raise RuntimeError(f"pdf-read-error: {exc}") from exc
+    return "\n\n".join(chunk for chunk in extracted if chunk)
+def _read_image(path: Path, ocr=False) -> Dict[str, Any]:
+    """Open an image and collect basic metadata, optionally running OCR.
+
+    Args:
+        path: image file to open with Pillow.
+        ocr: when true, run pytesseract on the image.
+
+    Returns:
+        A dict with ``format``, ``mode``, ``size`` and ``exif`` (None if EXIF
+        lookup fails). With ``ocr=True`` it also holds either ``ocr_text`` or,
+        if OCR raised, ``ocr_error`` with the error message.
+
+    Raises:
+        RuntimeError: if Pillow (or, with ``ocr=True``, pytesseract) is not installed.
+        Exception: whatever ``Image.open`` raises for a missing or unreadable file.
+    """
+    Image = _ensure_pil()
+    info = {}
+    img = Image.open(path)
+    # Image.open keeps the underlying file open; close it explicitly so that
+    # processing many tasks does not leak file handles.
+    try:
+        info["format"] = img.format
+        info["mode"] = img.mode
+        info["size"] = img.size
+        # basic EXIF when available
+        try:
+            info["exif"] = img.getexif()
+        except Exception:
+            info["exif"] = None
+        if ocr:
+            pytesseract = _ensure_pytesseract()
+            # An OCR failure is reported in the result rather than raised, so the
+            # metadata gathered above is still returned.
+            try:
+                info["ocr_text"] = pytesseract.image_to_string(img)
+            except Exception as e:
+                info["ocr_error"] = str(e)
+    finally:
+        img.close()
+    return info
+def _read_pdb(path: Path) -> str:
+    """Return a PDB (molecular structure) file as text.
+
+    PDB files are plain text, so the content is decoded as UTF-8 with
+    undecodable bytes replaced; no structural parsing is done here.
+    """
+    return path.read_text(encoding="utf-8", errors="replace")
+def _read_docx(path: Path) -> str:
+    """Return the text of a .docx file's paragraphs, one paragraph per line.
+
+    Only ``document.paragraphs`` is read.
+
+    Raises:
+        RuntimeError: if python-docx is unavailable.
+    """
+    document = _ensure_docx().Document(path)
+    return "\n".join(paragraph.text for paragraph in document.paragraphs)
+def _read_jsonld(path: Path) -> Any:
+    """Load a JSON-LD file as plain JSON and return the decoded structure.
+
+    No JSON-LD expansion or normalisation is performed; the raw data is
+    returned as parsed by the ``json`` module.
+
+    Raises:
+        json.JSONDecodeError: if the file is not valid JSON.
+    """
+    with path.open("r", encoding="utf-8", errors="replace") as handle:
+        return json.load(handle)
+def _handle_zip(path: Path) -> Dict[str, Any]:
+    """Describe a zip archive without extracting it.
+
+    Returns:
+        A dict with ``names`` (the archive's member names) and ``size`` (the
+        archive's size on disk in bytes).
+
+    Raises:
+        OSError: if the file cannot be stat'ed (e.g. it does not exist).
+        RuntimeError: prefixed with ``zip-read-error`` if the archive cannot be read.
+    """
+    # The stat happens outside the try block, so a missing file surfaces as
+    # OSError rather than being wrapped.
+    archive_size = path.stat().st_size
+    try:
+        with zipfile.ZipFile(path, "r") as archive:
+            member_names = archive.namelist()
+    except Exception as exc:
+        raise RuntimeError(f"zip-read-error: {exc}") from exc
+    return {"names": member_names, "size": archive_size}
+def _transcribe_mp3(path: Path, model_name="small") -> str:
+    """Transcribe an audio file with Whisper and return the recognised text.
+
+    The model named ``model_name`` is loaded on every call.
+
+    Returns:
+        The ``text`` field of the transcription result, or "" if it is absent.
+
+    Raises:
+        RuntimeError: if the whisper package is unavailable.
+    """
+    transcription = _ensure_whisper().load_model(model_name).transcribe(str(path))
+    return transcription.get("text", "")
+def fetch_task_files(
+    task_id: str,
+    dest_dir: str = "cache/gaia_validation",
+    transcribe_mp3: bool = False,
+    mp3_model: str = "small",
+    image_ocr: bool = False,
+) -> Dict[str, Any]:
+    """Fetch the candidate files for ``task_id`` and parse each by extension.
+
+    For every extension in ``EXTENSIONS`` the file ``<task_id>.<ext>`` is looked
+    up under ``<dest_dir>/<task_id>/``; a non-empty local copy is reused,
+    otherwise it is downloaded from ``HF_BASE_RESOLVE``.
+
+    Args:
+        task_id: task identifier, used as both directory and file stem.
+        dest_dir: root of the local cache directory.
+        transcribe_mp3: when true, transcribe mp3 files; otherwise their
+            content is left as None.
+        mp3_model: Whisper model name used for transcription.
+        image_ocr: when true, run OCR on jpg/png files.
+
+    Returns:
+        ``{ext: {"status": "ok"|"miss"|"error", "path": str|None, "content": ...}}``.
+        On ``error`` the content is a message string describing the failure.
+    """
+
+    def _parse_image(image_path):
+        return _read_image(image_path, ocr=image_ocr)
+
+    def _parse_mp3(audio_path):
+        return _transcribe_mp3(audio_path, model_name=mp3_model)
+
+    # ext -> (parser, error label). A label of None means a parser failure is
+    # not handled per-type and is reported by the generic handler below.
+    parsers = {
+        "txt": (_read_txt, None),
+        "py": (_read_py, None),
+        "pdb": (_read_pdb, None),
+        "xlsx": (_read_xlsx, "xlsx-read-error"),
+        "csv": (_read_csv, "csv-read-error"),
+        "pdf": (_read_pdf, "pdf-read-error"),
+        "jpg": (_parse_image, "image-read-error"),
+        "png": (_parse_image, "image-read-error"),
+        "zip": (_handle_zip, "zip-read-error"),
+        "docx": (_read_docx, "docx-read-error"),
+        "jsonld": (_read_jsonld, "jsonld-read-error"),
+    }
+    if transcribe_mp3:
+        # Without this entry an mp3 is reported as "ok" with no content.
+        parsers["mp3"] = (_parse_mp3, "mp3-transcribe-error")
+
+    results = {}
+    task_dir = Path(dest_dir) / task_id
+    task_dir.mkdir(parents=True, exist_ok=True)
+    for ext in EXTENSIONS:
+        filename = f"{task_id}.{ext}"
+        local_path = task_dir / filename
+        try:
+            # A non-empty file already on disk counts as a cache hit.
+            cached = local_path.exists() and local_path.stat().st_size > 0
+            if not cached and not _download(f"{HF_BASE_RESOLVE}/{filename}", local_path):
+                results[ext] = {"status": "miss", "path": None, "content": None}
+                continue
+            record = {"status": "ok", "path": str(local_path), "content": None}
+            parser, error_label = parsers.get(ext, (None, None))
+            if parser is not None:
+                if error_label is None:
+                    record["content"] = parser(local_path)
+                else:
+                    try:
+                        record["content"] = parser(local_path)
+                    except Exception as exc:
+                        record["status"] = "error"
+                        record["content"] = f"{error_label}: {exc}"
+            results[ext] = record
+        except Exception as exc:
+            # Download failures and unlabelled parser failures end up here.
+            results[ext] = {
+                "status": "error",
+                "path": str(local_path),
+                "content": f"exception: {exc}",
+            }
+    return results
+if __name__ == "__main__":
+    # Command-line entry point: fetch and summarise the files for one task_id,
+    # or (with --test-image-ocr) run a mocked self-test of _read_image.
+    import argparse
+    parser = argparse.ArgumentParser(
+        description="Fetch GAIA validation files for a task_id and parse common file types."
+    )
+    parser.add_argument("task_id")
+    parser.add_argument("--dest", default="cache/gaia_validation")
+    parser.add_argument("--transcribe-mp3", action="store_true")
+    parser.add_argument("--mp3-model", default="small")
+    parser.add_argument(
+        "--image-ocr",
+        action="store_true",
+        help="Run OCR on images (requires pytesseract + tesseract)",
+    )
+    parser.add_argument(
+        "--test-image-ocr",
+        action="store_true",
+        help="Run test for image OCR error/success handling",
+    )
+    args = parser.parse_args()
+    if args.test_image_ocr:
+        def test_image_ocr_handling():
+            """Check that _read_image reports OCR success and OCR failure."""
+            from unittest.mock import patch, MagicMock
+            # Patch the helpers on the module that is actually executing. When this
+            # file runs as a script its functions live in "__main__", so patching
+            # "libs.questionHelper.file_tools" would modify a different module
+            # object and _read_image would still call the real helpers.
+            with patch(f"{__name__}._ensure_pil") as mock_pil, \
+                 patch(f"{__name__}._ensure_pytesseract") as mock_tess:
+                mock_img = MagicMock()
+                mock_img.format = "JPEG"
+                mock_img.mode = "RGB"
+                mock_img.size = (100, 100)
+                mock_img.getexif.return_value = {"dummy": "exif"}
+                mock_pil.return_value.open.return_value = mock_img
+                # Success case
+                mock_tess.return_value.image_to_string.return_value = "Extracted text"
+                result = _read_image(Path("dummy.jpg"), ocr=True)
+                assert result["ocr_text"] == "Extracted text"
+                print("OCR success test passed:", result)
+                # Error case
+                mock_tess.return_value.image_to_string.side_effect = Exception("OCR failed")
+                result = _read_image(Path("dummy.jpg"), ocr=True)
+                assert "ocr_error" in result and result["ocr_error"] == "OCR failed"
+                print("OCR error test passed:", result)
+        test_image_ocr_handling()
+    else:
+        results = fetch_task_files(
+            args.task_id,
+            dest_dir=args.dest,
+            transcribe_mp3=args.transcribe_mp3,
+            mp3_model=args.mp3_model,
+            image_ocr=args.image_ocr,
+        )
+        printable = {}
+        for k, v in results.items():
+            c = v.get("content")
+            # For pandas DataFrames, provide summary
+            try:
+                if hasattr(c, "shape") or (
+                    isinstance(c, dict)
+                    and all(hasattr(df, "shape") for df in c.values())
+                ):
+                    # if it's a dict of DataFrames (xlsx, multiple sheets), summarize
+                    if isinstance(c, dict):
+                        printable[k] = {
+                            **v,
+                            "content": {
+                                s: {
+                                    "shape": getattr(df, "shape", None),
+                                    "columns": (
+                                        list(df.columns)
+                                        if hasattr(df, "columns")
+                                        else None
+                                    ),
+                                }
+                                for s, df in c.items()
+                            },
+                        }
+                        continue
+                    else:
+                        printable[k] = {
+                            **v,
+                            "content": {
+                                "type": "DataFrame",
+                                "shape": getattr(c, "shape", None),
+                                "columns": (
+                                    list(c.columns) if hasattr(c, "columns") else None
+                                ),
+                            },
+                        }
+                        continue
+            except Exception:
+                # Best effort: if summarising fails, fall through and print the
+                # entry through the generic path below.
+                pass
+            # truncate long strings
+            if isinstance(c, str) and len(c) > 1000:
+                printable[k] = {**v, "content": c[:1000] + "...(truncated)"}
+            else:
+                printable[k] = v
+        # default=str keeps non-JSON values (e.g. image EXIF objects) printable.
+        print(json.dumps(printable, indent=2, default=str))

requirements.txt CHANGED Viewed

@@ -16,4 +16,7 @@ torch
 transformers
 opencv-python
 python-chess>=1.9.0
-pytesseract

 transformers
 opencv-python
 python-chess>=1.9.0
+pytesseract
+rdflib
+python-docx
+PyPDF2