[debug] zeroGPU
app.py CHANGED
@@ -7,8 +7,8 @@ This creates a web interface to compare three inference modes simultaneously:
 3. C2C: Rosetta model with projectors
 
 ZeroGPU Support:
-- Models are loaded to
-- @spaces.GPU decorator
+- Models are loaded to CPU at startup
+- @spaces.GPU decorator moves models to GPU on-demand for each inference
 - Works seamlessly on both ZeroGPU and regular GPU environments
 """
 
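The revised docstring describes the ZeroGPU pattern this commit adopts: load models on CPU at startup and let the @spaces.GPU decorator attach a GPU only for the duration of each decorated call. A minimal sketch of that pattern, assuming the Hugging Face spaces package; the model name and generation settings below are illustrative placeholders, not taken from this Space:

# Minimal ZeroGPU sketch (assumption: the Space's real code differs; model name is a placeholder).
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen3-0.6B"  # placeholder

# Load on CPU at startup; no GPU is attached yet on ZeroGPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

@spaces.GPU(duration=60)  # a GPU is attached only while this function runs
def generate(prompt: str) -> str:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # move weights onto the on-demand GPU
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)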
@@ -51,15 +51,16 @@ class ModelManager:
             c2c_checkpoint_path: Path to C2C checkpoint directory
             device: Device to use (cuda, cpu, or auto)
         """
-        # For ZeroGPU, models
-        # The @spaces.GPU decorator handles GPU allocation automatically
+        # For ZeroGPU, load models to CPU and move to GPU in decorated functions
         if device == "auto":
-            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            if ZEROGPU_AVAILABLE:
+                self.device = torch.device("cpu")
+                print("ZeroGPU detected: Loading models to CPU (will move to GPU on-demand)")
+            else:
+                self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         else:
             self.device = torch.device(device)
         print(f"Using device: {self.device}")
-        if ZEROGPU_AVAILABLE:
-            print("ZeroGPU detected: Models will be loaded to CUDA (decorator handles allocation)")
 
         # Model configurations
         self.single_model_name = single_model_name
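The new device-selection branch keys off ZEROGPU_AVAILABLE, whose definition sits outside the hunks shown here. A hypothetical sketch of how such a flag is commonly derived; checking for the spaces package and the SPACES_ZERO_GPU environment variable is an assumption, not something this diff shows:

# Hypothetical definition of ZEROGPU_AVAILABLE; the real one is not part of this diff.
import os

try:
    import spaces  # available on Hugging Face Spaces
    ZEROGPU_AVAILABLE = bool(os.environ.get("SPACES_ZERO_GPU"))
except ImportError:
    spaces = None
    ZEROGPU_AVAILABLE = False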
@@ -70,8 +71,8 @@ class ModelManager:
         # T2T prompt configurations
         self.t2t_background_prompt = "Briefly describe the most useful background to answer the question:\n\n{question}"
         self.t2t_answer_prompt = "Based on the background, answer the question:\n\n{question}"  # Format for second round question
-        self.t2t_context_max_tokens =
-        self.t2t_answer_max_tokens =
+        self.t2t_context_max_tokens = 256
+        self.t2t_answer_max_tokens = 256
 
         # Generation configuration (shared across all models)
         # To enable sampling: set use_sampling=True and adjust temperature/top_p/top_k
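These prompt templates and 256-token budgets drive a two-round exchange: round one asks for useful background, round two answers conditioned on that background. An illustrative sketch of that flow; the _chat helper, the t2t_tokenizer attribute, and the way the background is spliced into round two are assumptions, and the Space's real streaming implementation differs:

# Illustrative two-round T2T flow using the prompts and budgets configured above.
def _chat(model, tokenizer, messages, max_new_tokens):
    # One chat-templated generation round (non-streaming for brevity).
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    new_tokens = output_ids[0, inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

def t2t_two_stage(manager, question):
    # Round 1: ask the context model for useful background.
    background = _chat(
        manager.t2t_model.context_model, manager.t2t_tokenizer,
        [{"role": "user", "content": manager.t2t_background_prompt.format(question=question)}],
        manager.t2t_context_max_tokens,
    )
    # Round 2: answer with the background prepended to the answer prompt (an assumption).
    answer = _chat(
        manager.t2t_model.answer_model, manager.t2t_tokenizer,
        [{"role": "user", "content": background + "\n\n" + manager.t2t_answer_prompt.format(question=question)}],
        manager.t2t_answer_max_tokens,
    )
    return background, answer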
@@ -220,12 +221,16 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_single(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from single model with streaming."""
-        #
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE and self.single_model.device.type != "cuda":
+            self.single_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.single_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.single_tokenizer(text, return_tensors="pt").to(
+        inputs = self.single_tokenizer(text, return_tensors="pt").to(device)
 
         # Setup streamer
         streamer = TextIteratorStreamer(
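generate_single streams tokens through transformers' TextIteratorStreamer: generate() runs in a background thread while the streamer is consumed and partial text is yielded. A simplified, standalone version of that wiring (generation arguments here are minimal placeholders):

# Simplified streaming wiring of the kind generate_single relies on.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, user_input, max_new_tokens=256):
    device = next(model.parameters()).device
    messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
    )
    inputs = tokenizer(text, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    # generate() blocks, so run it in a background thread and consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial
    thread.join()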
@@ -255,7 +260,14 @@ class ModelManager:
     @spaces.GPU(duration=90)
     def generate_t2t(self, user_input: str) -> Generator[tuple[str, str], None, None]:
         """Generate response from T2T model with streaming (returns context, answer)."""
-        #
+        # Move models to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE:
+            if self.t2t_model.context_model.device.type != "cuda":
+                self.t2t_model.context_model.to(device)
+            if self.t2t_model.answer_model.device.type != "cuda":
+                self.t2t_model.answer_model.to(device)
+
 
         # Stage 1: Context generation
         context_streamer = TextIteratorStreamer(
@@ -271,7 +283,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(
+        ).to(device)
 
         generation_kwargs = {
             'input_ids': inputs,
@@ -320,7 +332,7 @@ class ModelManager:
             add_generation_prompt=True,
             return_tensors="pt",
             enable_thinking=False
-        ).to(
+        ).to(device)
 
         generation_kwargs = {
             'input_ids': inputs,
@@ -341,12 +353,16 @@ class ModelManager:
     @spaces.GPU(duration=60)
     def generate_c2c(self, user_input: str) -> Generator[str, None, None]:
         """Generate response from C2C model with streaming."""
-        #
+        # Move model to GPU for ZeroGPU
+        device = torch.device("cuda" if torch.cuda.is_available() else self.device)
+        if ZEROGPU_AVAILABLE and self.c2c_model.device.type != "cuda":
+            self.c2c_model.to(device)
+
         messages = [{"role": "system", "content": ""}, {"role": "user", "content": user_input}]
         text = self.c2c_tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
         )
-        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(
+        inputs = self.c2c_tokenizer(text, return_tensors="pt").to(device)
 
         # Setup streamer
         streamer = TextIteratorStreamer(
@@ -359,12 +375,12 @@ class ModelManager:
         full_length = inputs.input_ids.shape[1]
         instruction_index = torch.tensor([1, 0], dtype=torch.long).repeat(
             full_length - 1, 1
-        ).unsqueeze(0).to(
+        ).unsqueeze(0).to(device)
         label_index = torch.tensor([-1, 0], dtype=torch.long).repeat(
             1, 1
-        ).unsqueeze(0).to(
+        ).unsqueeze(0).to(device)
         position_ids = inputs.attention_mask.long().cumsum(-1) - 1 if inputs.attention_mask is not None else \
-            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(
+            torch.arange(full_length, dtype=torch.long).unsqueeze(0).to(device)
 
         # Generation parameters
         generation_kwargs = {
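A quick sanity check of the position_ids expression in the last hunk: with an all-ones attention mask, the cumsum form reduces to the torch.arange fallback.

# Toy check: cumsum(-1) - 1 over an all-ones mask equals torch.arange.
import torch

attention_mask = torch.ones(1, 5, dtype=torch.long)
from_mask = attention_mask.long().cumsum(-1) - 1            # tensor([[0, 1, 2, 3, 4]])
fallback = torch.arange(5, dtype=torch.long).unsqueeze(0)   # tensor([[0, 1, 2, 3, 4]])
assert torch.equal(from_mask, fallback)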