Gary Simmons committed on
Commit
eec60f5
·
1 Parent(s): c829b8b

add file fetching and parsing utilities for GAIA validation tasks and update requirements.txt

Browse files
Files changed (3) hide show
  1. app.py +81 -27
  2. libs/questionHelper/file_tools.py +490 -0
  3. requirements.txt +4 -1
app.py CHANGED
@@ -6,7 +6,6 @@ import time
6
  import threading
7
  import random
8
  from litellm import RateLimitError
9
- import os
10
  from smolagents import (
11
  CodeAgent,
12
  DuckDuckGoSearchTool,
@@ -16,6 +15,7 @@ from smolagents import (
16
  SpeechToTextTool,
17
  LiteLLMModel,
18
  )
 
19
  from libs.chess.chess_tools import analyze_chess_image, analyze_chess_position
20
  from libs.transcription.transcription_tools import transcribe_audio
21
  from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_info
@@ -24,6 +24,8 @@ from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_
24
  # (Keep Constants as is)
25
  # --- Constants ---
26
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
27
 
28
 
29
  # --- Basic Agent Definition ---
@@ -194,7 +196,7 @@ class BasicAgent:
194
  analyze_youtube_video,
195
  get_youtube_video_info,
196
  analyze_chess_position,
197
- analyze_chess_image
198
  ],
199
  model=model,
200
  max_steps=20,
@@ -246,7 +248,7 @@ class BasicAgent:
246
  return f"AGENT ERROR: {e}"
247
 
248
 
249
-
250
 
251
 
252
  def run_and_submit_all(profile: gr.OAuthProfile | None):
@@ -255,10 +257,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
255
  and displays the results.
256
  """
257
  # --- Determine HF Space Runtime URL and Repo URL ---
258
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
259
 
260
  if profile:
261
- username= f"{profile.username}"
262
  print(f"User logged in: {username}")
263
  else:
264
  print("User not logged in.")
@@ -286,16 +288,16 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
286
  response.raise_for_status()
287
  questions_data = response.json()
288
  if not questions_data:
289
- print("Fetched questions list is empty.")
290
- return "Fetched questions list is empty or invalid format.", None
291
  print(f"Fetched {len(questions_data)} questions.")
292
  except requests.exceptions.RequestException as e:
293
  print(f"Error fetching questions: {e}")
294
  return f"Error fetching questions: {e}", None
295
  except requests.exceptions.JSONDecodeError as e:
296
- print(f"Error decoding JSON response from questions endpoint: {e}")
297
- print(f"Response text: {response.text[:500]}")
298
- return f"Error decoding server response for questions: {e}", None
299
  except Exception as e:
300
  print(f"An unexpected error occurred fetching questions: {e}")
301
  return f"An unexpected error occurred fetching questions: {e}", None
@@ -310,13 +312,62 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
310
  if not task_id or question_text is None:
311
  print(f"Skipping item with missing task_id or question: {item}")
312
  continue
 
313
  try:
314
- submitted_answer = agent(question_text)
315
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
316
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  except Exception as e:
318
- print(f"Error running agent on task {task_id}: {e}")
319
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
 
 
 
 
 
 
320
 
321
  if not answers_payload:
322
  print("Agent did not produce any answers to submit.")
@@ -398,20 +449,19 @@ with gr.Blocks() as demo:
398
 
399
  run_button = gr.Button("Run Evaluation & Submit All Answers")
400
 
401
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
 
402
  # Removed max_rows=10 from DataFrame constructor
403
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
404
 
405
- run_button.click(
406
- fn=run_and_submit_all,
407
- outputs=[status_output, results_table]
408
- )
409
 
410
  if __name__ == "__main__":
411
- print("\n" + "-"*30 + " App Starting " + "-"*30)
412
  # Check for SPACE_HOST and SPACE_ID at startup for information
413
  space_host_startup = os.getenv("SPACE_HOST")
414
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
415
 
416
  if space_host_startup:
417
  print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -419,14 +469,18 @@ if __name__ == "__main__":
419
  else:
420
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
421
 
422
- if space_id_startup: # Print repo URLs if SPACE_ID is found
423
  print(f"✅ SPACE_ID found: {space_id_startup}")
424
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
425
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
 
 
426
  else:
427
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
 
 
428
 
429
- print("-"*(60 + len(" App Starting ")) + "\n")
430
 
431
  print("Launching Gradio Interface for Basic Agent Evaluation...")
432
- demo.launch(debug=True, share=False)
 
6
  import threading
7
  import random
8
  from litellm import RateLimitError
 
9
  from smolagents import (
10
  CodeAgent,
11
  DuckDuckGoSearchTool,
 
15
  SpeechToTextTool,
16
  LiteLLMModel,
17
  )
18
+ from libs.questionHelper.file_tools import fetch_task_files
19
  from libs.chess.chess_tools import analyze_chess_image, analyze_chess_position
20
  from libs.transcription.transcription_tools import transcribe_audio
21
  from libs.youtube.youtube_tools import analyze_youtube_video, get_youtube_video_info
 
24
  # (Keep Constants as is)
25
  # --- Constants ---
26
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
27
+ FILES_AVAILABLE_PREFIX = "FILES_AVAILABLE: "
28
+ FILES_AVAILABLE_SUFFIX = "\n\n"
29
 
30
 
31
  # --- Basic Agent Definition ---
 
196
  analyze_youtube_video,
197
  get_youtube_video_info,
198
  analyze_chess_position,
199
+ analyze_chess_image,
200
  ],
201
  model=model,
202
  max_steps=20,
 
248
  return f"AGENT ERROR: {e}"
249
 
250
 
251
+ CACHE_DIR = "cache/gaia_validation"
252
 
253
 
254
  def run_and_submit_all(profile: gr.OAuthProfile | None):
 
257
  and displays the results.
258
  """
259
  # --- Determine HF Space Runtime URL and Repo URL ---
260
+ space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
261
 
262
  if profile:
263
+ username = f"{profile.username}"
264
  print(f"User logged in: {username}")
265
  else:
266
  print("User not logged in.")
 
288
  response.raise_for_status()
289
  questions_data = response.json()
290
  if not questions_data:
291
+ print("Fetched questions list is empty.")
292
+ return "Fetched questions list is empty or invalid format.", None
293
  print(f"Fetched {len(questions_data)} questions.")
294
  except requests.exceptions.RequestException as e:
295
  print(f"Error fetching questions: {e}")
296
  return f"Error fetching questions: {e}", None
297
  except requests.exceptions.JSONDecodeError as e:
298
+ print(f"Error decoding JSON response from questions endpoint: {e}")
299
+ print(f"Response text: {response.text[:500]}")
300
+ return f"Error decoding server response for questions: {e}", None
301
  except Exception as e:
302
  print(f"An unexpected error occurred fetching questions: {e}")
303
  return f"An unexpected error occurred fetching questions: {e}", None
 
312
  if not task_id or question_text is None:
313
  print(f"Skipping item with missing task_id or question: {item}")
314
  continue
315
+ # Fetch any associated files from GAIA validation (if present) and prepend a brief summary to the question
316
  try:
317
+ try:
318
+ file_results = fetch_task_files(
319
+ task_id, dest_dir=CACHE_DIR, transcribe_mp3=False
320
+ )
321
+ except Exception as e:
322
+ print(f"Warning: failed to fetch files for {task_id}: {e}")
323
+ file_results = {}
324
+
325
+ # Build a compact file summary for the agent prompt
326
+ file_summaries = []
327
+ for ext, info in (file_results or {}).items():
328
+ status = info.get("status")
329
+ path = info.get("path")
330
+ if status == "ok" and path:
331
+ file_summaries.append(f"{ext}=OK@{path}")
332
+ else:
333
+ file_summaries.append(f"{ext}={status}")
334
+
335
+ files_note = (
336
+ ""
337
+ if not file_summaries
338
+ else (
339
+ FILES_AVAILABLE_PREFIX
340
+ + "; ".join(file_summaries)
341
+ + FILES_AVAILABLE_SUFFIX
342
+ )
343
+ )
344
+
345
+ prompt_with_files = files_note + question_text
346
+
347
+ submitted_answer = agent(prompt_with_files)
348
+ answers_payload.append(
349
+ {"task_id": task_id, "submitted_answer": submitted_answer}
350
+ )
351
+ results_log.append(
352
+ {
353
+ "Task ID": task_id,
354
+ "Question": question_text,
355
+ "Submitted Answer": submitted_answer,
356
+ }
357
+ )
358
  except Exception as e:
359
+ print(f"Error running agent on task {task_id}: {e}")
360
+ error_answer = f"AGENT ERROR: {e}"
361
+ answers_payload.append(
362
+ {"task_id": task_id, "submitted_answer": error_answer}
363
+ )
364
+ results_log.append(
365
+ {
366
+ "Task ID": task_id,
367
+ "Question": question_text,
368
+ "Submitted Answer": error_answer,
369
+ }
370
+ )
371
 
372
  if not answers_payload:
373
  print("Agent did not produce any answers to submit.")
 
449
 
450
  run_button = gr.Button("Run Evaluation & Submit All Answers")
451
 
452
+ status_output = gr.Textbox(
453
+ label="Run Status / Submission Result", lines=5, interactive=False
454
+ )
455
  # Removed max_rows=10 from DataFrame constructor
456
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
457
 
458
+ run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
 
459
 
460
  if __name__ == "__main__":
461
+ print("\n" + "-" * 30 + " App Starting " + "-" * 30)
462
  # Check for SPACE_HOST and SPACE_ID at startup for information
463
  space_host_startup = os.getenv("SPACE_HOST")
464
+ space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
465
 
466
  if space_host_startup:
467
  print(f"✅ SPACE_HOST found: {space_host_startup}")
 
469
  else:
470
  print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
471
 
472
+ if space_id_startup: # Print repo URLs if SPACE_ID is found
473
  print(f"✅ SPACE_ID found: {space_id_startup}")
474
  print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
475
+ print(
476
+ f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
477
+ )
478
  else:
479
+ print(
480
+ "ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
481
+ )
482
 
483
+ print("-" * (60 + len(" App Starting ")) + "\n")
484
 
485
  print("Launching Gradio Interface for Basic Agent Evaluation...")
486
+ demo.launch(debug=True, share=False)
libs/questionHelper/file_tools.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ file_tools.py
3
+
4
+ Helpers to fetch files from the GAIA validation folder on Hugging Face by task_id
5
+ and to normalize/parse the common file types found there.
6
+
7
+ Public API:
8
+ fetch_task_files(task_id, dest_dir='cache/gaia_validation', transcribe_mp3=False, mp3_model='small', image_ocr=False)
9
+
10
+ Returns a dict mapping extension -> { status: 'ok'|'miss'|'error', path: str|None, content: parsed-object-or-None }
11
+
12
+ Supported extensions (with lazy imports): txt, py, xlsx, mp3, pdf, jpg, png, pdb, csv, zip, docx, jsonld
13
+
14
+ This module uses lazy imports for optional heavy dependencies and provides informative errors
15
+ when a handler is requested but the dependency is not installed.
16
+
17
+ """
18
+
19
+ from pathlib import Path
20
+ from typing import Dict, Any
21
+ import requests
22
+ import json
23
+ import io
24
+ import zipfile
25
+
26
+ HF_BASE_RESOLVE = (
27
+ "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation"
28
+ )
29
+ HEADERS = {"User-Agent": "gaia-task-fetcher/1.0"}
30
+ # full list of extensions we expect to encounter
31
+ EXTENSIONS = [
32
+ "txt",
33
+ "py",
34
+ "xlsx",
35
+ "mp3",
36
+ "pdf",
37
+ "jpg",
38
+ "png",
39
+ "pdb",
40
+ "csv",
41
+ "zip",
42
+ "docx",
43
+ "jsonld",
44
+ ]
45
+
46
+ # lazy imports containers
47
+ _pd = None
48
+ _whisper = None
49
+ _pypdf2 = None
50
+ _pil = None
51
+ _pytesseract = None
52
+ _docx = None
53
+ _rdflib = None
54
+
55
+
56
+ # helpers for lazy import
57
def _ensure_pandas():
    """Return the pandas module, importing it on first use.

    The module is cached in the module-level ``_pd`` slot so later calls
    return immediately.

    Raises:
        RuntimeError: If pandas cannot be imported (chained to the ImportError).
    """
    global _pd
    if _pd is not None:
        return _pd
    try:
        import pandas as pandas_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "pandas (and openpyxl) are required to read xlsx/csv files. Install with `pip install pandas openpyxl`"
        ) from exc
    _pd = pandas_module
    return _pd
69
+
70
+
71
def _ensure_whisper():
    """Return the whisper module, importing it on first use.

    The module is cached in the module-level ``_whisper`` slot.

    Raises:
        RuntimeError: If whisper cannot be imported (chained to the ImportError).
    """
    global _whisper
    if _whisper is not None:
        return _whisper
    try:
        import whisper as whisper_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "whisper package not available. Install `pip install -U openai-whisper` and ensure ffmpeg is installed on PATH"
        ) from exc
    _whisper = whisper_module
    return _whisper
83
+
84
+
85
def _ensure_pypdf2():
    """Return the PyPDF2 module, importing it on first use.

    The module is cached in the module-level ``_pypdf2`` slot.

    Raises:
        RuntimeError: If PyPDF2 cannot be imported (chained to the ImportError).
    """
    global _pypdf2
    if _pypdf2 is not None:
        return _pypdf2
    try:
        import PyPDF2 as pypdf2_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "PyPDF2 required to read pdf files. Install with `pip install PyPDF2`"
        ) from exc
    _pypdf2 = pypdf2_module
    return _pypdf2
97
+
98
+
99
def _ensure_pil():
    """Return Pillow's ``PIL.Image`` module, importing it on first use.

    Note that the cached object (module-level ``_pil``) is the ``Image``
    submodule, not the top-level ``PIL`` package.

    Raises:
        RuntimeError: If Pillow cannot be imported (chained to the ImportError).
    """
    global _pil
    if _pil is not None:
        return _pil
    try:
        from PIL import Image as image_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "Pillow required to read image files. Install with `pip install Pillow`"
        ) from exc
    _pil = image_module
    return _pil
111
+
112
+
113
def _ensure_pytesseract():
    """Return the pytesseract module, importing it on first use.

    The module is cached in the module-level ``_pytesseract`` slot.

    Raises:
        RuntimeError: If pytesseract cannot be imported (chained to the
            ImportError).
    """
    global _pytesseract
    if _pytesseract is not None:
        return _pytesseract
    try:
        import pytesseract as pytesseract_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "pytesseract required for OCR. Install with `pip install pytesseract` and ensure tesseract binary is available on PATH"
        ) from exc
    _pytesseract = pytesseract_module
    return _pytesseract
125
+
126
+
127
def _ensure_docx():
    """Return the ``docx`` module (python-docx), importing it on first use.

    The module is cached in the module-level ``_docx`` slot.

    Raises:
        RuntimeError: If ``docx`` cannot be imported (chained to the
            ImportError).
    """
    global _docx
    if _docx is not None:
        return _docx
    try:
        import docx as docx_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        raise RuntimeError(
            "python-docx required to read .docx files. Install with `pip install python-docx`"
        ) from exc
    _docx = docx_module
    return _docx
139
+
140
+
141
def _ensure_rdflib():
    """Return the rdflib module, importing it on first use.

    The module is cached in the module-level ``_rdflib`` slot.

    Raises:
        RuntimeError: If rdflib cannot be imported (chained to the ImportError).
    """
    global _rdflib
    if _rdflib is not None:
        return _rdflib
    try:
        import rdflib as rdflib_module
    except ImportError as exc:  # ModuleNotFoundError is a subclass of ImportError
        # Plain json is enough to load JSON-LD; rdflib is only needed for expansion.
        raise RuntimeError(
            "rdflib required for advanced jsonld handling. Install with `pip install rdflib`"
        ) from exc
    _rdflib = rdflib_module
    return _rdflib
154
+
155
+
156
def _download(url: str, dest_path: Path, timeout=30) -> bool:
    """Stream ``url`` to ``dest_path``.

    Args:
        url: Address to fetch with an HTTP GET (sent with ``HEADERS``).
        dest_path: Final location of the downloaded file; parent directories
            are created as needed.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the response status was 200 and the body was written in full,
        False for any other status code (nothing is written in that case).

    Raises:
        requests.exceptions.RequestException: On connection/transfer failure.
        OSError: If the file cannot be written.
    """
    # The context manager releases the streamed connection on every exit path;
    # with stream=True the connection otherwise stays checked out.
    with requests.get(url, headers=HEADERS, timeout=timeout, stream=True) as resp:
        if resp.status_code != 200:
            return False
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        # Write to a sibling temp file and rename once complete, so an
        # interrupted transfer never leaves a truncated file at dest_path
        # that a later "exists and non-empty" cache check would accept.
        tmp_path = dest_path.with_name(dest_path.name + ".part")
        try:
            with open(tmp_path, "wb") as f:
                for chunk in resp.iter_content(1024 * 64):
                    if chunk:
                        f.write(chunk)
            tmp_path.replace(dest_path)
        finally:
            # No-op after a successful rename; removes the partial file on failure.
            tmp_path.unlink(missing_ok=True)
    return True
167
+
168
+
169
+ # simple readers/parsers
170
+
171
+
172
def _read_txt(path: Path) -> str:
    """Return the contents of ``path`` decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced instead of raising.
    """
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        return handle.read()
174
+
175
+
176
def _read_py(path: Path) -> str:
    """Return the Python source in ``path`` as text (the code is not executed).

    The file is decoded as UTF-8; undecodable bytes are replaced instead of
    raising.
    """
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        return handle.read()
178
+
179
+
180
def _read_xlsx(path: Path):
    """Load every sheet of the workbook at ``path``.

    ``sheet_name=None`` asks pandas for all sheets, so the result is a mapping
    of sheet name to DataFrame. The openpyxl engine is tried first; if that
    fails a warning is printed and pandas' default engine is tried.

    Raises:
        RuntimeError: If pandas is unavailable, or if both read attempts fail.
    """
    pandas_mod = _ensure_pandas()
    try:
        workbook = pandas_mod.read_excel(path, sheet_name=None, engine="openpyxl")
    except Exception as openpyxl_err:
        print(f"Warning: Failed to read {path} with openpyxl engine: {openpyxl_err}. Falling back to default engine.")
        # The retry stays inside this handler so the second failure keeps the
        # first one as its exception context.
        try:
            workbook = pandas_mod.read_excel(path, sheet_name=None)
        except Exception as default_err:
            raise RuntimeError(
                f"Failed to read {path} with both openpyxl and default engine: {default_err}"
            ) from default_err
    return workbook
192
+
193
+
194
def _read_csv(path: Path):
    """Parse the CSV file at ``path`` with ``pandas.read_csv`` defaults.

    Raises:
        RuntimeError: If pandas is unavailable.
    """
    pandas_mod = _ensure_pandas()
    frame = pandas_mod.read_csv(path)
    return frame
197
+
198
+
199
def _read_pdf(path: Path) -> str:
    """Extract the text of every page in the PDF at ``path``.

    Pages whose extraction raises, or that yield no text, are skipped. The
    remaining page texts are joined with a blank line between them.

    Raises:
        RuntimeError: If PyPDF2 is unavailable, or if the file cannot be
            opened or parsed as a PDF.
    """
    pdf_lib = _ensure_pypdf2()

    def _page_text(page):
        # Best effort: one unreadable page must not abort the whole document.
        try:
            return page.extract_text()
        except Exception:
            return None

    try:
        with open(path, "rb") as stream:
            extracted = [_page_text(pg) for pg in pdf_lib.PdfReader(stream).pages]
    except Exception as err:
        raise RuntimeError(f"pdf-read-error: {err}") from err
    return "\n\n".join(chunk for chunk in extracted if chunk)
215
+
216
+
217
def _read_image(path: Path, ocr=False) -> Dict[str, Any]:
    """Collect basic metadata (and optionally OCR text) for an image file.

    Args:
        path: Image file to open with Pillow.
        ocr: When true, also run ``pytesseract.image_to_string`` on the image.

    Returns:
        A dict with ``format``, ``mode``, ``size`` and ``exif`` (``None`` if
        EXIF retrieval raised). With ``ocr=True`` it additionally holds either
        ``ocr_text`` or, if OCR raised, ``ocr_error`` with the error message.

    Raises:
        RuntimeError: If Pillow (or pytesseract when ``ocr`` is set) is not
            installed.
        Exception: Whatever ``Image.open`` raises for a missing or unreadable
            file.
    """
    Image = _ensure_pil()
    info: Dict[str, Any] = {}
    # Image.open keeps the underlying file open; the context manager guarantees
    # the handle is released once all metadata/OCR work is done.
    with Image.open(path) as img:
        info["format"] = img.format
        info["mode"] = img.mode
        info["size"] = img.size
        # basic EXIF when available
        try:
            info["exif"] = img.getexif()
        except Exception:
            info["exif"] = None
        if ocr:
            pytesseract = _ensure_pytesseract()
            # OCR failures are reported in the result instead of raised so the
            # metadata gathered above is still returned.
            try:
                info["ocr_text"] = pytesseract.image_to_string(img)
            except Exception as e:
                info["ocr_error"] = str(e)
    return info
237
+
238
+
239
def _read_pdb(path: Path) -> str:
    """Return the raw text of a PDB (molecular structure) file.

    PDB files are plain text, so the content is returned as-is, decoded as
    UTF-8 with undecodable bytes replaced; no structural parsing is done.
    """
    return path.read_text(encoding="utf-8", errors="replace")
242
+
243
+
244
def _read_docx(path: Path) -> str:
    """Return the text of a .docx file, one paragraph per line.

    Only ``document.paragraphs`` is read; paragraph texts are joined with
    newlines.

    Raises:
        RuntimeError: If python-docx is unavailable.
    """
    docx_mod = _ensure_docx()
    loaded = docx_mod.Document(path)
    return "\n".join(paragraph.text for paragraph in loaded.paragraphs)
251
+
252
+
253
def _read_jsonld(path: Path) -> Any:
    """Load a JSON-LD file as plain JSON and return the decoded structure.

    No JSON-LD expansion or normalization is applied; the raw parsed data is
    returned. The file is decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with path.open("r", encoding="utf-8", errors="replace") as handle:
        return json.load(handle)
258
+
259
+
260
def _handle_zip(path: Path) -> Dict[str, Any]:
    """Describe a zip archive without extracting it.

    Returns:
        ``{"names": [...member names...], "size": <archive size in bytes>}``.

    Raises:
        RuntimeError: If the archive cannot be opened or listed.
    """
    archive_size = path.stat().st_size
    try:
        with zipfile.ZipFile(path, "r") as archive:
            members = archive.namelist()
    except Exception as err:
        raise RuntimeError(f"zip-read-error: {err}") from err
    return {"names": members, "size": archive_size}
269
+
270
+
271
def _transcribe_mp3(path: Path, model_name="small") -> str:
    """Transcribe an audio file with Whisper and return the recognized text.

    The model named ``model_name`` is loaded on every call. If the
    transcription result has no ``"text"`` entry, an empty string is returned.

    Raises:
        RuntimeError: If the whisper package is unavailable.
    """
    whisper_mod = _ensure_whisper()
    transcription = whisper_mod.load_model(model_name).transcribe(str(path))
    return transcription.get("text", "")
276
+
277
+
278
def fetch_task_files(
    task_id: str,
    dest_dir: str = "cache/gaia_validation",
    transcribe_mp3: bool = False,
    mp3_model: str = "small",
    image_ocr: bool = False,
) -> Dict[str, Any]:
    """Fetch the candidate files for ``task_id`` and parse each by extension.

    For every extension in ``EXTENSIONS`` the file ``<task_id>.<ext>`` is
    looked up under ``<dest_dir>/<task_id>/`` (a non-empty existing file is
    reused) and otherwise downloaded from ``HF_BASE_RESOLVE``.

    Args:
        task_id: Task identifier; used as a directory and file name, so it
            must not be empty, ``.``/``..``, or contain path separators.
        dest_dir: Root directory of the local download cache.
        transcribe_mp3: When true, mp3 files are transcribed; otherwise their
            content is left as ``None``.
        mp3_model: Whisper model name used for transcription.
        image_ocr: When true, OCR is run on jpg/png files.

    Returns:
        ``{ext: {"status": "ok"|"miss"|"error", "path": str|None,
        "content": parsed object, error string, or None}}``.

    Raises:
        ValueError: If ``task_id`` is not usable as a single path component.
    """
    # task_id comes from a remote service and is interpolated into a filesystem
    # path; refuse anything that could escape dest_dir.
    if not task_id or task_id in (".", "..") or "/" in task_id or "\\" in task_id:
        raise ValueError(f"invalid task_id: {task_id!r}")

    def _image(p):
        return _read_image(p, ocr=image_ocr)

    # ext -> (reader, error label). A failure in one of these readers is
    # reported as "<label>: <error>" while the downloaded path stays available.
    guarded = {
        "xlsx": (_read_xlsx, "xlsx-read-error"),
        "csv": (_read_csv, "csv-read-error"),
        "pdf": (_read_pdf, "pdf-read-error"),
        "jpg": (_image, "image-read-error"),
        "png": (_image, "image-read-error"),
        "zip": (_handle_zip, "zip-read-error"),
        "docx": (_read_docx, "docx-read-error"),
        "jsonld": (_read_jsonld, "jsonld-read-error"),
    }
    if transcribe_mp3:
        guarded["mp3"] = (
            lambda p: _transcribe_mp3(p, model_name=mp3_model),
            "mp3-transcribe-error",
        )
    # Plain-text readers: a failure here falls through to the generic
    # "exception: ..." entry produced by the outer handler.
    plain = {"txt": _read_txt, "py": _read_py, "pdb": _read_pdb}

    out = {}
    base = Path(dest_dir) / task_id
    base.mkdir(parents=True, exist_ok=True)

    for ext in EXTENSIONS:
        filename = f"{task_id}.{ext}"
        dest = base / filename
        try:
            cached = dest.exists() and dest.stat().st_size > 0
            if not cached and not _download(f"{HF_BASE_RESOLVE}/{filename}", dest):
                out[ext] = {"status": "miss", "path": None, "content": None}
                continue

            entry = {"status": "ok", "path": str(dest), "content": None}
            if ext in guarded:
                reader, label = guarded[ext]
                try:
                    entry["content"] = reader(dest)
                except Exception as e:
                    entry["status"] = "error"
                    entry["content"] = f"{label}: {e}"
            elif ext in plain:
                entry["content"] = plain[ext](dest)
            # Anything else (e.g. mp3 without transcription) keeps content=None.
            out[ext] = entry
        except Exception as e:
            # Download errors and unguarded reader errors end up here; one bad
            # extension must not stop the remaining ones from being processed.
            out[ext] = {
                "status": "error",
                "path": str(dest),
                "content": f"exception: {e}",
            }
    return out
379
+
380
+
381
if __name__ == "__main__":
    # Command-line entry point: either fetch/parse the files of one task and
    # print a JSON summary, or run a small mocked self-test of the OCR path.
    import argparse
    import json
    import sys

    parser = argparse.ArgumentParser(
        description="Fetch GAIA validation files for a task_id and parse common file types."
    )
    # Optional at parse time so the self-test can run without a dummy task_id;
    # it is still enforced below when actually fetching.
    parser.add_argument("task_id", nargs="?")
    parser.add_argument("--dest", default="cache/gaia_validation")
    parser.add_argument("--transcribe-mp3", action="store_true")
    parser.add_argument("--mp3-model", default="small")
    parser.add_argument(
        "--image-ocr",
        action="store_true",
        help="Run OCR on images (requires pytesseract + tesseract)",
    )
    parser.add_argument(
        "--test-image-ocr",
        action="store_true",
        help="Run test for image OCR error/success handling",
    )
    args = parser.parse_args()

    if args.test_image_ocr:

        def test_image_ocr_handling():
            from unittest.mock import patch, MagicMock

            # Patch the module object that is actually executing. When this
            # file runs as a script, _read_image resolves its helpers in
            # "__main__", so patching the package-qualified import path would
            # leave the real helpers in place.
            this_module = sys.modules[__name__]
            with patch.object(this_module, "_ensure_pil") as mock_pil, patch.object(
                this_module, "_ensure_pytesseract"
            ) as mock_tess:
                mock_img = MagicMock()
                mock_img.format = "JPEG"
                mock_img.mode = "RGB"
                mock_img.size = (100, 100)
                mock_img.getexif.return_value = {"dummy": "exif"}
                # Behave the same whether or not the image is used as a
                # context manager.
                mock_img.__enter__.return_value = mock_img

                mock_pil.return_value.open.return_value = mock_img

                # Success case
                mock_tess.return_value.image_to_string.return_value = "Extracted text"
                result = _read_image(Path("dummy.jpg"), ocr=True)
                assert result["ocr_text"] == "Extracted text"
                print("OCR success test passed:", result)

                # Error case
                mock_tess.return_value.image_to_string.side_effect = Exception("OCR failed")
                result = _read_image(Path("dummy.jpg"), ocr=True)
                assert "ocr_error" in result and result["ocr_error"] == "OCR failed"
                print("OCR error test passed:", result)

        test_image_ocr_handling()
    else:
        if args.task_id is None:
            parser.error("task_id is required unless --test-image-ocr is given")

        results = fetch_task_files(
            args.task_id,
            dest_dir=args.dest,
            transcribe_mp3=args.transcribe_mp3,
            mp3_model=args.mp3_model,
            image_ocr=args.image_ocr,
        )

        def _frame_summary(df):
            # Shape/columns only: full DataFrames are too large to print.
            return {
                "shape": getattr(df, "shape", None),
                "columns": list(df.columns) if hasattr(df, "columns") else None,
            }

        printable = {}
        for k, v in results.items():
            c = v.get("content")
            # For pandas DataFrames, provide summary
            try:
                if isinstance(c, dict) and all(
                    hasattr(df, "shape") for df in c.values()
                ):
                    # dict of DataFrames (xlsx, multiple sheets): summarize each sheet
                    printable[k] = {
                        **v,
                        "content": {s: _frame_summary(df) for s, df in c.items()},
                    }
                    continue
                if hasattr(c, "shape"):
                    printable[k] = {
                        **v,
                        "content": {"type": "DataFrame", **_frame_summary(c)},
                    }
                    continue
            except Exception:
                # Summaries are best-effort; fall back to the generic output below.
                pass
            # truncate long strings
            if isinstance(c, str) and len(c) > 1000:
                printable[k] = {**v, "content": c[:1000] + "...(truncated)"}
            else:
                printable[k] = v
        print(json.dumps(printable, indent=2, default=str))
requirements.txt CHANGED
@@ -16,4 +16,7 @@ torch
16
  transformers
17
  opencv-python
18
  python-chess>=1.9.0
19
- pytesseract
 
 
 
 
16
  transformers
17
  opencv-python
18
  python-chess>=1.9.0
19
+ pytesseract
20
+ rdflib
21
+ python-docx
22
+ PyPDF2