bradnow committed on
Commit
c95a856
·
1 Parent(s): 96ae1f0

Improve large CSV handling to preserve memory. Fix the turns counting with images attached

Browse files
Files changed (2) hide show
  1. app.py +14 -2
  2. log_chat.py +37 -20
app.py CHANGED
@@ -9,6 +9,17 @@ import mimetypes
9
  import copy
10
  import os
11
 
 
 
 
 
 
 
 
 
 
 
 
12
  from theme import apriel
13
  from utils import COMMUNITY_POSTFIX_URL, get_model_config, check_format, models_config, \
14
  logged_event_handler, DEBUG_MODE, DEBUG_MODEL, log_debug, log_info, log_error, log_warning
@@ -177,8 +188,9 @@ def run_chat_inference(history, message, state):
177
  return history, INPUT_ENABLED, SEND_BUTTON_ENABLED, STOP_BUTTON_DISABLED, BUTTON_ENABLED, state
178
 
179
  chat_start_count = chat_start_count + 1
180
- user_messages_count = sum(1 for item in history if isinstance(item, dict) and item.get("role") == "user")
181
- log_info(f"chat_start_count: {chat_start_count}, turns: {user_messages_count}, model: {model_name}")
 
182
 
183
  is_reasoning = model_config.get("REASONING")
184
 
 
9
  import copy
10
  import os
11
 
12
+ # Workaround for PyCharm debugger + uvicorn compatibility error:
13
+ # TypeError: _patch_asyncio.<locals>.run() got an unexpected keyword argument 'loop_factory'
14
+ DEBUG = False
15
+ if DEBUG is True: # or sys.gettrace() is not None: # Debugger is attached
16
+ import asyncio
17
+ _original_run = asyncio.run
18
+ def _patched_run(main, **kwargs):
19
+ kwargs.pop('loop_factory', None) # Remove unsupported arg
20
+ return _original_run(main, **kwargs)
21
+ asyncio.run = _patched_run
22
+
23
  from theme import apriel
24
  from utils import COMMUNITY_POSTFIX_URL, get_model_config, check_format, models_config, \
25
  logged_event_handler, DEBUG_MODE, DEBUG_MODEL, log_debug, log_info, log_error, log_warning
 
188
  return history, INPUT_ENABLED, SEND_BUTTON_ENABLED, STOP_BUTTON_DISABLED, BUTTON_ENABLED, state
189
 
190
  chat_start_count = chat_start_count + 1
191
+ user_messages_count = sum(1 for item in history if isinstance(item, dict) and item.get("role") == "user"
192
+ and isinstance(item.get("content"), str))
193
+ log_info(f"chat_start_count: {chat_start_count}, turns: {user_messages_count + 1}, model: {model_name}")
194
 
195
  is_reasoning = model_config.get("REASONING")
196
 
log_chat.py CHANGED
@@ -68,7 +68,8 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
68
  if len(messages) != len(history):
69
  log_warning("log_chat() --> Some messages in history are missing 'role' or 'content' keys.")
70
 
71
- user_messages_count = sum(1 for item in messages if isinstance(item, dict) and item.get("role") == "user")
 
72
 
73
  # These must match the keys in the new row
74
  expected_headers = ["timestamp", "chat_id", "turns", "prompt", "messages", "model", "session_id", "info"]
@@ -89,6 +90,8 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
89
  max_retries = 3
90
  retry_count = 0
91
  file_exists = False
 
 
92
  while retry_count < max_retries:
93
  try:
94
  csv_path = hf_hub_download(
@@ -97,9 +100,13 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
97
  repo_type="dataset",
98
  token=HF_TOKEN # Only needed if not already logged in
99
  )
100
- pd.read_csv(csv_path)
 
101
  file_exists = True
102
- log_debug(f"log_chat() --> Downloaded existing CSV with {len(pd.read_csv(csv_path))} rows")
 
 
 
103
  break # Success, exit the loop
104
  except Exception as e:
105
  retry_count += 1
@@ -115,16 +122,12 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
115
  log_timer.add_step(f"Downloaded existing CSV (attempts: {retry_count + 1})")
116
 
117
  # Handle the case where the CSV file does not exist or is invalid
118
- if file_exists and len(pd.read_csv(csv_path)) == 0:
119
- log_warning(f"log_chat() --> CSV {csv_path} exists but is empty, will create a new one.")
120
- dump_hub_csv()
121
- file_exists = False
122
- elif file_exists:
123
- # Check that the headers match our standard headers of "timestamp", "chat_id", "turns", ...
124
- existing_headers = pd.read_csv(csv_path).columns.tolist()
125
  if set(existing_headers) != set(expected_headers):
126
  log_warning(f"log_chat() --> CSV {csv_path} has unexpected headers: {existing_headers}. "
127
- f"\nExpected {existing_headers} "
128
  f"Will create a new one.")
129
  dump_hub_csv()
130
  file_exists = False
@@ -134,15 +137,29 @@ def _log_chat(chat_id: str, session_id: str, model_name: str, prompt: str, histo
134
  # Write out the new row to the CSV file (append isn't working in HF container, so recreate each time)
135
  log_debug(f"log_chat() --> Writing CSV file, file_exists={file_exists}")
136
  try:
137
- with open(CSV_FILENAME, "w", newline="\n") as f:
138
- writer = csv.DictWriter(f, fieldnames=new_row.keys())
139
- writer.writeheader() # Always write the header
140
- if file_exists:
141
- for _, row in pd.read_csv(csv_path).iterrows():
142
- writer.writerow(row.to_dict()) # Write existing rows
143
- writer.writerow(new_row) # Write the new row
144
-
145
- log_debug("log_chat() --> Wrote out CSV with new row")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  # dump_local_csv()
147
  except Exception as e:
148
  log_error(f"log_chat() --> Error writing to CSV: {e}")
 
68
  if len(messages) != len(history):
69
  log_warning("log_chat() --> Some messages in history are missing 'role' or 'content' keys.")
70
 
71
+ user_messages_count = sum(1 for item in messages if isinstance(item, dict) and item.get("role") == "user"
72
+ and isinstance(item.get("content"), str))
73
 
74
  # These must match the keys in the new row
75
  expected_headers = ["timestamp", "chat_id", "turns", "prompt", "messages", "model", "session_id", "info"]
 
90
  max_retries = 3
91
  retry_count = 0
92
  file_exists = False
93
+ csv_path = None
94
+ row_count = 0
95
  while retry_count < max_retries:
96
  try:
97
  csv_path = hf_hub_download(
 
100
  repo_type="dataset",
101
  token=HF_TOKEN # Only needed if not already logged in
102
  )
103
+ # Only read first row to check if file is valid and get row count efficiently
104
+ df_check = pd.read_csv(csv_path, nrows=1)
105
  file_exists = True
106
+ # Get row count without loading entire file into memory
107
+ with open(csv_path, 'r') as f:
108
+ row_count = sum(1 for _ in f) - 1 # Subtract header
109
+ log_info(f"log_chat() --> Downloaded existing CSV with ~{row_count} rows")
110
  break # Success, exit the loop
111
  except Exception as e:
112
  retry_count += 1
 
122
  log_timer.add_step(f"Downloaded existing CSV (attempts: {retry_count + 1})")
123
 
124
  # Handle the case where the CSV file does not exist or is invalid
125
+ if file_exists:
126
+ # Check that the headers match our standard headers (only read first row)
127
+ existing_headers = pd.read_csv(csv_path, nrows=0).columns.tolist()
 
 
 
 
128
  if set(existing_headers) != set(expected_headers):
129
  log_warning(f"log_chat() --> CSV {csv_path} has unexpected headers: {existing_headers}. "
130
+ f"\nExpected {expected_headers} "
131
  f"Will create a new one.")
132
  dump_hub_csv()
133
  file_exists = False
 
137
  # Write out the new row to the CSV file (append isn't working in HF container, so recreate each time)
138
  log_debug(f"log_chat() --> Writing CSV file, file_exists={file_exists}")
139
  try:
140
+ if file_exists:
141
+ # Append mode: copy existing file and append new row
142
+ # Use chunked reading to avoid loading entire file into memory
143
+ with open(CSV_FILENAME, "w", newline="\n") as f_out:
144
+ writer = csv.DictWriter(f_out, fieldnames=expected_headers)
145
+ writer.writeheader()
146
+
147
+ # Stream copy existing rows in chunks to minimize memory usage
148
+ chunk_size = 1000
149
+ for chunk in pd.read_csv(csv_path, chunksize=chunk_size):
150
+ for _, row in chunk.iterrows():
151
+ writer.writerow(row.to_dict())
152
+
153
+ # Append new row
154
+ writer.writerow(new_row)
155
+ else:
156
+ # Create new file with just the new row
157
+ with open(CSV_FILENAME, "w", newline="\n") as f:
158
+ writer = csv.DictWriter(f, fieldnames=expected_headers)
159
+ writer.writeheader()
160
+ writer.writerow(new_row)
161
+
162
+ log_debug(f"log_chat() --> Wrote out CSV with new row, new row_count={row_count + 1}")
163
  # dump_local_csv()
164
  except Exception as e:
165
  log_error(f"log_chat() --> Error writing to CSV: {e}")