fuvty committed
Commit f926ba4 · 1 Parent(s): 0282280

[debug] zeroGPU

Files changed (1)
  1. app.py +35 -19
app.py CHANGED
@@ -7,8 +7,8 @@ This creates a web interface to compare three inference modes simultaneously:
 3. C2C: Rosetta model with projectors

 ZeroGPU Support:
-- Models are loaded to CUDA at startup
-- @spaces.GPU decorator handles GPU allocation automatically for each inference
+- Models are loaded to CPU at startup
+- @spaces.GPU decorator moves models to GPU on-demand for each inference
 - Works seamlessly on both ZeroGPU and regular GPU environments
 """

@@ -51,15 +51,16 @@ class ModelManager:
             c2c_checkpoint_path: Path to C2C checkpoint directory
             device: Device to use (cuda, cpu, or auto)
         """
-        # For ZeroGPU, models should be loaded to CUDA directly
-        # The @spaces.GPU decorator handles GPU allocation automatically
+        # For ZeroGPU, load models to CPU and move to GPU in decorated functions
         if device == "auto":
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if ZEROGPU_AVAILABLE:
+                self.device = torch.device("cpu")
+                print("ZeroGPU detected: Loading models to CPU (will move to GPU on-demand)")
+            else:
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             self.device = torch.device(device)
         print(f"Using device: {self.device}")
-        if ZEROGPU_AVAILABLE:
-            print("ZeroGPU detected: Models will be loaded to CUDA (decorator handles allocation)")

         # Model configurations
         self.single_model_name = single_model_name
@@ -70,8 +71,8 @@ class ModelManager:
         # T2T prompt configurations
         self.t2t_background_prompt = "Briefly describe the most useful background to answer the question:\n\n{question}"
         self.t2t_answer_prompt = "Based on the background, answer the question:\n\n{question}"  # Format for second round question
-        self.t2t_context_max_tokens = 512
-        self.t2t_answer_max_tokens = 512
+        self.t2t_context_max_tokens = 256
+        self.t2t_answer_max_tokens = 256

         # Generation configuration (shared across all models)
         # To enable sampling: set use_sampling=True and adjust temperature/top_p/top_k
@@ -220,12 +221,16 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
-        # @spaces.GPU decorator handles GPU allocation automatically
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE and self.single_model.device.type != "cuda":
+            self.single_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.single_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.single_tokenizer(text, return_tensors="pt").to(self.device)
+        inputs = self.single_tokenizer(text, return_tensors="pt").to(device)

         # Setup streamer
         streamer = TextIteratorStreamer(
@@ -255,7 +260,14 @@ class ModelManager:
     @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
-        # @spaces.GPU decorator handles GPU allocation automatically
+        # Move models to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE:
+            if self.t2t_model.context_model.device.type != "cuda":
+                self.t2t_model.context_model.to(device)
+            if self.t2t_model.answer_model.device.type != "cuda":
+                self.t2t_model.answer_model.to(device)
+

         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
@@ -271,7 +283,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(self.device)
+        ).to(device)

         generation_kwargs = {
             'input_ids': inputs,
@@ -320,7 +332,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(self.device)
+        ).to(device)

         generation_kwargs = {
             'input_ids': inputs,
@@ -341,12 +353,16 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from C2C model with streaming."""
-        # @spaces.GPU decorator handles GPU allocation automatically
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE and self.c2c_model.device.type != "cuda":
+            self.c2c_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.c2c_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(self.device)
+        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(device)

         # Setup streamer
         streamer = TextIteratorStreamer(
@@ -359,12 +375,12 @@ class ModelManager:
         full_length = inputs.input_ids.shape[1]
         instruction_index = torch.tensor([1, 0], dtype=torch.long).repeat(
             full_length - 1, 1
-        ).unsqueeze(0).to(self.device)
+        ).unsqueeze(0).to(device)
         label_index = torch.tensor([-1, 0], dtype=torch.long).repeat(
             1, 1
-        ).unsqueeze(0).to(self.device)
+        ).unsqueeze(0).to(device)
         position_ids = inputs.attention_mask.long().cumsum(-1) - 1 if inputs.attention_mask is not None else \
-            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(device)
+            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(device)

         # Generation parameters
         generation_kwargs = {
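
For context, a minimal sketch of the ZeroGPU pattern this commit switches to: weights are loaded on CPU at startup and only moved to CUDA inside a @spaces.GPU-decorated call. The model name and the generate() helper below are illustrative placeholders under that assumption, not the app's actual classes or checkpoints.

# Illustrative sketch of the ZeroGPU pattern adopted here (assumed names, not the app's code).
import torch
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen3-0.6B"  # placeholder checkpoint

# Startup: ZeroGPU Spaces expose no GPU outside decorated functions, so load on CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

@spaces.GPU(duration=60)  # GPU is allocated only while this call runs
def generate(prompt: str) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if model.device.type != device.type:
        model.to(device)  # move weights on-demand, mirroring the change in this commit
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)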