bradnow committed on
Commit
c95a856
·
1 Parent(s): 96ae1f0

Improve large CSV handling to preserve memory. Fix the turns counting with images attached

Browse files
Files changed (2) hide show
  1. app.py +14 -2
  2. log_chat.py +37 -20
app.py CHANGED
@@ -9,6 +9,17 @@ import mimetypes
9
  import copy
10
  import os
11
 
 
 
 
 
 
 
 
 
 
 
 
12
  from theme import apriel
13
  from utils import COMMUNITY_POSTFIX_URL, get_model_config, check_format, models_config, \
14
  logged_event_handler, DEBUG_MODE, DEBUG_MODEL, log_debug, log_info, log_error, log_warning
@@ -177,8 +188,9 @@ def run_chat_inference(history, message, state):
177
  return history, INPUT_ENABLED, SEND_BUTTON_ENABLED, STOP_BUTTON_DISABLED, BUTTON_ENABLED, state
178
 
179
  chat_start_count = chat_start_count + 1
180
- user_messages_count = sum(1 for item in history if isinstance(item, dict) and item.get("role") == "user")
181
- log_info(f"chat_start_count: {chat_start_count}, turns: {user_messages_count}, model: {model_name}")
 
182
 
183
  is_reasoning = model_config.get("REASONING")
184
 
 
9
  import copy
10
  import os
11
 
12
+ # Workaround for PyCharm debugger + uvicorn compatibility error:
13
+ # TypeError: _patch_asyncio.<locals>.run() got an unexpected keyword argument 'loop_factory'
14
+ DEBUG = False
15
+ if DEBUG is True: # or sys.gettrace() is not None: # Debugger is attached
16
+ import asyncio
17
+ _original_run = asyncio.run
18
+ def _patched_run(main, **kwargs):
19
+ kwargs.pop('loop_factory', None) # Remove unsupported arg
20
+ return _original_run(main, **kwargs)
21
+ asyncio.run = _patched_run
22
+
23
  from theme import apriel
24
  from utils import COMMUNITY_POSTFIX_URL, get_model_config, check_format, models_config, \
25
  logged_event_handler, DEBUG_MODE, DEBUG_MODEL, log_debug, log_info, log_error, log_warning
 
188
  return history, INPUT_ENABLED, SEND_BUTTON_ENABLED, STOP_BUTTON_DISABLED, BUTTON_ENABLED, state
189
 
190
  chat_start_count = chat_start_count + 1
191
+ user_messages_count = sum(1 for item in history if isinstance(item, dict) and item.get("role") == "user"
192
+ and isinstance(item.get("content"), str))
193
+ log_info(f"chat_start_count: {chat_start_count}, turns: {user_messages_count + 1}, model: {model_name}")
194
 
195
  is_reasoning = model_config.get("REASONING")
196
 
log_chat.py CHANGED
@@ -68,7 +68,8 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
68
  if len(messages) != len(history):
69
  log_warning("log_chat() --> Some messages in history are missing 'role' or 'content' keys.")
70
 
71
- user_messages_count = sum(1 for item in messages if isinstance(item, dict) and item.get("role") == "user")
 
72
 
73
  # These must match the keys in the new row
74
  expected_headers = ["timestamp", "chat_id", "turns", "prompt", "messages", "model", "session_id", "info"]
@@ -89,6 +90,8 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
89
  max_retries = 3
90
  retry_count = 0
91
  file_exists = False
 
 
92
  while retry_count < max_retries:
93
  try:
94
  csv_path = hf_hub_download(
@@ -97,9 +100,13 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
97
  repo_type="dataset",
98
  token=HF_TOKEN # Only needed if not already logged in
99
  )
100
- pd.read_csv(csv_path)
 
101
  file_exists = True
102
- log_debug(f"log_chat() --> Downloaded existing CSV with {len(pd.read_csv(csv_path))} rows")
 
 
 
103
  break # Success, exit the loop
104
  except Exception as e:
105
  retry_count += 1
@@ -115,16 +122,12 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
115
  log_timer.add_step(f"Downloaded existing CSV (attempts: {retry_count + 1})")
116
 
117
  # Handle the case where the CSV file does not exist or is invalid
118
- if file_exists and len(pd.read_csv(csv_path)) == 0:
119
- log_warning(f"log_chat() --> CSV {csv_path} exists but is empty, will create a new one.")
120
- dump_hub_csv()
121
- file_exists = False
122
- elif file_exists:
123
- # Check that the headers match our standard headers of "timestamp", "chat_id", "turns", ...
124
- existing_headers = pd.read_csv(csv_path).columns.tolist()
125
  if set(existing_headers) != set(expected_headers):
126
  log_warning(f"log_chat() --> CSV {csv_path} has unexpected headers: {existing_headers}. "
127
- f"\nExpected {existing_headers} "
128
  f"Will create a new one.")
129
  dump_hub_csv()
130
  file_exists = False
@@ -134,15 +137,29 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
134
  # Write out the new row to the CSV file (append isn't working in HF container, so recreate each time)
135
  log_debug(f"log_chat() --> Writing CSV file, file_exists={file_exists}")
136
  try:
137
- with open(CSV_FILENAME, "w", newline="\n") as f:
138
- writer = csv.DictWriter(f, fieldnames=new_row.keys())
139
- writer.writeheader() # Always write the header
140
- if file_exists:
141
- for _, row in pd.read_csv(csv_path).iterrows():
142
- writer.writerow(row.to_dict()) # Write existing rows
143
- writer.writerow(new_row) # Write the new row
144
-
145
- log_debug("log_chat() --> Wrote out CSV with new row")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  # dump_local_csv()
147
  except Exception as e:
148
  log_error(f"log_chat() --> Error writing to CSV: {e}")
 
68
  if len(messages) != len(history):
69
  log_warning("log_chat() --> Some messages in history are missing 'role' or 'content' keys.")
70
 
71
+ user_messages_count = sum(1 for item in messages if isinstance(item, dict) and item.get("role") == "user"
72
+ and isinstance(item.get("content"), str))
73
 
74
  # These must match the keys in the new row
75
  expected_headers = ["timestamp", "chat_id", "turns", "prompt", "messages", "model", "session_id", "info"]
 
90
  max_retries = 3
91
  retry_count = 0
92
  file_exists = False
93
+ csv_path = None
94
+ row_count = 0
95
  while retry_count < max_retries:
96
  try:
97
  csv_path = hf_hub_download(
 
100
  repo_type="dataset",
101
  token=HF_TOKEN # Only needed if not already logged in
102
  )
103
+ # Only read first row to check if file is valid and get row count efficiently
104
+ df_check = pd.read_csv(csv_path, nrows=1)
105
  file_exists = True
106
+ # Get row count without loading entire file into memory
107
+ with open(csv_path, 'r') as f:
108
+ row_count = sum(1 for _ in f) - 1 # Subtract header
109
+ log_info(f"log_chat() --> Downloaded existing CSV with ~{row_count} rows")
110
  break # Success, exit the loop
111
  except Exception as e:
112
  retry_count += 1
 
122
  log_timer.add_step(f"Downloaded existing CSV (attempts: {retry_count + 1})")
123
 
124
  # Handle the case where the CSV file does not exist or is invalid
125
+ if file_exists:
126
+ # Check that the headers match our standard headers (only read first row)
127
+ existing_headers = pd.read_csv(csv_path, nrows=0).columns.tolist()
 
 
 
 
128
  if set(existing_headers) != set(expected_headers):
129
  log_warning(f"log_chat() --> CSV {csv_path} has unexpected headers: {existing_headers}. "
130
+ f"\nExpected {expected_headers} "
131
  f"Will create a new one.")
132
  dump_hub_csv()
133
  file_exists = False
 
137
  # Write out the new row to the CSV file (append isn't working in HF container, so recreate each time)
138
  log_debug(f"log_chat() --> Writing CSV file, file_exists={file_exists}")
139
  try:
140
+ if file_exists:
141
+ # Append mode: copy existing file and append new row
142
+ # Use chunked reading to avoid loading entire file into memory
143
+ with open(CSV_FILENAME, "w", newline="\n") as f_out:
144
+ writer = csv.DictWriter(f_out, fieldnames=expected_headers)
145
+ writer.writeheader()
146
+
147
+ # Stream copy existing rows in chunks to minimize memory usage
148
+ chunk_size = 1000
149
+ for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
150
+ for _, row in chunk.iterrows():
151
+ writer.writerow(row.to_dict())
152
+
153
+ # Append new row
154
+ writer.writerow(new_row)
155
+ else:
156
+ # Create new file with just the new row
157
+ with open(CSV_FILENAME, "w", newline="\n") as f:
158
+ writer = csv.DictWriter(f, fieldnames=expected_headers)
159
+ writer.writeheader()
160
+ writer.writerow(new_row)
161
+
162
+ log_debug(f"log_chat() --> Wrote out CSV with new row, new row_count={row_count + 1}")
163
  # dump_local_csv()
164
  except Exception as e:
165
  log_error(f"log_chat() --> Error writing to CSV: {e}")