EdgeTAM

Running on Zero

App Files Files Community

chongzhou committed on May 17

Commit

3e65e00

1 Parent(s): 4277d6f

offload_state_to_cpu

Browse files

Files changed (1) hide show

app.py +116 -119

app.py CHANGED Viewed

@@ -237,15 +237,16 @@ def preprocess_video_in(
     input_points = []
     input_labels = []
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-    predictor.to("cuda")
-    if inference_state:
-        inference_state["device"] = "cuda"
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
-    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-    inference_state = predictor.init_state(video_path=video_path)
     return [
         gr.update(open=False),  # video_in_drawer
@@ -270,72 +271,68 @@ def segment_with_points(
     inference_state,
     evt: gr.SelectData,
 ):
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-    predictor.to("cuda")
-    if inference_state:
-        inference_state["device"] = "cuda"
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
-    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-    input_points.append(evt.index)
-    print(f"TRACKING INPUT POINT: {input_points}")
-    if point_type == "include":
-        input_labels.append(1)
-    elif point_type == "exclude":
-        input_labels.append(0)
-    print(f"TRACKING INPUT LABEL: {input_labels}")
-    # Open the image and get its dimensions
-    transparent_background = Image.fromarray(first_frame).convert("RGBA")
-    w, h = transparent_background.size
-    # Define the circle radius as a fraction of the smaller dimension
-    fraction = 0.01  # You can adjust this value as needed
-    radius = int(fraction * min(w, h))
-    # Create a transparent layer to draw on
-    transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
-    for index, track in enumerate(input_points):
-        if input_labels[index] == 1:
-            cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
-        else:
-            cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
-    # Convert the transparent layer back to an image
-    transparent_layer = Image.fromarray(transparent_layer, "RGBA")
-    selected_point_map = Image.alpha_composite(
-        transparent_background, transparent_layer
-    )
-    # Let's add a positive click at (x, y) = (210, 350) to get started
-    points = np.array(input_points, dtype=np.float32)
-    # for labels, `1` means positive click and `0` means negative click
-    labels = np.array(input_labels, dtype=np.int32)
-    _, _, out_mask_logits = predictor.add_new_points(
-        inference_state=inference_state,
-        frame_idx=0,
-        obj_id=OBJ_ID,
-        points=points,
-        labels=labels,
-    )
-    mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
-    first_frame_output = Image.alpha_composite(transparent_background, mask_image)
-    torch.cuda.empty_cache()
-    return (
-        selected_point_map,
-        first_frame_output,
-        first_frame,
-        all_frames,
-        input_points,
-        input_labels,
-        inference_state,
-    )
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
@@ -362,64 +359,64 @@ def propagate_to_all(
     input_labels,
     inference_state,
 ):
-    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cpu")
-    predictor.to("cuda")
-    if inference_state:
-        inference_state["device"] = "cuda"
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
-    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-    if len(input_points) == 0 or video_in is None or inference_state is None:
-        return None
-    # run propagation throughout the video and collect the results in a dict
-    video_segments = {}  # video_segments contains the per-frame segmentation results
-    print("starting propagate_in_video")
-    for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
-        inference_state
-    ):
-        video_segments[out_frame_idx] = {
-            out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
-            for i, out_obj_id in enumerate(out_obj_ids)
-        }
-    # obtain the segmentation results every few frames
-    vis_frame_stride = 1
-    output_frames = []
-    for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
-        transparent_background = Image.fromarray(all_frames[out_frame_idx]).convert(
-            "RGBA"
         )
-        out_mask = video_segments[out_frame_idx][OBJ_ID]
-        mask_image = show_mask(out_mask)
-        output_frame = Image.alpha_composite(transparent_background, mask_image)
-        output_frame = np.array(output_frame)
-        output_frames.append(output_frame)
-    torch.cuda.empty_cache()
-    # Create a video clip from the image sequence
-    original_fps = get_video_fps(video_in)
-    fps = original_fps  # Frames per second
-    clip = ImageSequenceClip(output_frames, fps=fps)
-    # Write the result to a file
-    unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
-    final_vid_output_path = f"output_video_{unique_id}.mp4"
-    final_vid_output_path = os.path.join(tempfile.gettempdir(), final_vid_output_path)
-    # Write the result to a file
-    clip.write_videofile(final_vid_output_path, codec="libx264")
-    return (
-        gr.update(value=final_vid_output_path),
-        first_frame,
-        all_frames,
-        input_points,
-        input_labels,
-        inference_state,
-    )
 def update_ui():

     input_points = []
     input_labels = []
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        inference_state = predictor.init_state(
+            offload_video_to_cpu=True,
+            offload_state_to_cpu=True,
+            video_path=video_path,
+        )
     return [
         gr.update(open=False),  # video_in_drawer
     inference_state,
     evt: gr.SelectData,
 ):
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        input_points.append(evt.index)
+        print(f"TRACKING INPUT POINT: {input_points}")
+        if point_type == "include":
+            input_labels.append(1)
+        elif point_type == "exclude":
+            input_labels.append(0)
+        print(f"TRACKING INPUT LABEL: {input_labels}")
+        # Open the image and get its dimensions
+        transparent_background = Image.fromarray(first_frame).convert("RGBA")
+        w, h = transparent_background.size
+        # Define the circle radius as a fraction of the smaller dimension
+        fraction = 0.01  # You can adjust this value as needed
+        radius = int(fraction * min(w, h))
+        # Create a transparent layer to draw on
+        transparent_layer = np.zeros((h, w, 4), dtype=np.uint8)
+        for index, track in enumerate(input_points):
+            if input_labels[index] == 1:
+                cv2.circle(transparent_layer, track, radius, (0, 255, 0, 255), -1)
+            else:
+                cv2.circle(transparent_layer, track, radius, (255, 0, 0, 255), -1)
+        # Convert the transparent layer back to an image
+        transparent_layer = Image.fromarray(transparent_layer, "RGBA")
+        selected_point_map = Image.alpha_composite(
+            transparent_background, transparent_layer
+        )
+        # Let's add a positive click at (x, y) = (210, 350) to get started
+        points = np.array(input_points, dtype=np.float32)
+        # for labels, `1` means positive click and `0` means negative click
+        labels = np.array(input_labels, dtype=np.int32)
+        _, _, out_mask_logits = predictor.add_new_points(
+            inference_state=inference_state,
+            frame_idx=0,
+            obj_id=OBJ_ID,
+            points=points,
+            labels=labels,
+        )
+        mask_image = show_mask((out_mask_logits[0] > 0.0).cpu().numpy())
+        first_frame_output = Image.alpha_composite(transparent_background, mask_image)
+        torch.cuda.empty_cache()
+        return (
+            selected_point_map,
+            first_frame_output,
+            first_frame,
+            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+        )
 def show_mask(mask, obj_id=None, random_color=False, convert_to_image=True):
     input_labels,
     inference_state,
 ):
+    predictor = build_sam2_video_predictor(model_cfg, sam2_checkpoint, device="cuda")
     if torch.cuda.get_device_properties(0).major >= 8:
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
+    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
+        if len(input_points) == 0 or video_in is None or inference_state is None:
+            return None
+        # run propagation throughout the video and collect the results in a dict
+        video_segments = (
+            {}
+        )  # video_segments contains the per-frame segmentation results
+        print("starting propagate_in_video")
+        for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+            inference_state
+        ):
+            video_segments[out_frame_idx] = {
+                out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                for i, out_obj_id in enumerate(out_obj_ids)
+            }
+        # obtain the segmentation results every few frames
+        vis_frame_stride = 1
+        output_frames = []
+        for out_frame_idx in range(0, len(video_segments), vis_frame_stride):
+            transparent_background = Image.fromarray(all_frames[out_frame_idx]).convert(
+                "RGBA"
+            )
+            out_mask = video_segments[out_frame_idx][OBJ_ID]
+            mask_image = show_mask(out_mask)
+            output_frame = Image.alpha_composite(transparent_background, mask_image)
+            output_frame = np.array(output_frame)
+            output_frames.append(output_frame)
+        torch.cuda.empty_cache()
+        # Create a video clip from the image sequence
+        original_fps = get_video_fps(video_in)
+        fps = original_fps  # Frames per second
+        clip = ImageSequenceClip(output_frames, fps=fps)
+        # Write the result to a file
+        unique_id = datetime.now().strftime("%Y%m%d%H%M%S")
+        final_vid_output_path = f"output_video_{unique_id}.mp4"
+        final_vid_output_path = os.path.join(
+            tempfile.gettempdir(), final_vid_output_path
         )
+        # Write the result to a file
+        clip.write_videofile(final_vid_output_path, codec="libx264")
+        return (
+            gr.update(value=final_vid_output_path),
+            first_frame,
+            all_frames,
+            input_points,
+            input_labels,
+            inference_state,
+        )
 def update_ui():