| #!/usr/bin/env python3 | |
| """ | |
| Copyright (c) 2022, salesforce.com, inc. | |
| All rights reserved. | |
| SPDX-License-Identifier: BSD-3-Clause | |
| For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause | |
| """ | |
| import numbers | |
| import random | |
| from torchvision.transforms import ( | |
| RandomCrop, | |
| RandomResizedCrop, | |
| ) | |
| import video_llama.processors.functional_video as F | |
# Public API of this module: the video-clip transform classes exported below.
__all__ = [
    "RandomCropVideo",
    "RandomResizedCropVideo",
    "CenterCropVideo",
    "NormalizeVideo",
    "ToTensorVideo",
    "RandomHorizontalFlipVideo",
]
class RandomCropVideo(RandomCrop):
    """Crop a video clip at a random spatial location.

    Args:
        size (int or sequence): desired output size ``(height, width)``.
            An int produces a square crop of ``(size, size)``.
    """

    def __init__(self, size):
        # NOTE: super().__init__ is deliberately not called; only the
        # static RandomCrop.get_params is reused, so the parent's
        # padding-related state is unnecessary.
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            # Validate the sequence explicitly (previously any object was
            # accepted unchecked), mirroring RandomResizedCropVideo.
            if len(size) != 2:
                raise ValueError(
                    f"size should be tuple (height, width), instead got {size}"
                )
            self.size = tuple(size)

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        Returns:
            torch.tensor: randomly cropped video clip of size (C, T, OH, OW).
        """
        i, j, h, w = self.get_params(clip, self.size)
        return F.crop(clip, i, j, h, w)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size})"
class RandomResizedCropVideo(RandomResizedCrop):
    """Crop a random area of a video clip and resize it to a target size.

    Args:
        size (int or sequence): target output size ``(height, width)``; an
            int yields a square ``(size, size)``.
        scale (tuple): lower/upper bounds of the crop's area fraction.
        ratio (tuple): lower/upper bounds of the crop's aspect ratio.
        interpolation_mode (str): interpolation used by the resize op.
    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation_mode="bilinear",
    ):
        # NOTE: super().__init__ is intentionally skipped; only the static
        # RandomResizedCrop.get_params is reused.
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            # BUGFIX: previously only `tuple` was accepted here, so a list
            # such as [224, 224] fell through to the scalar branch and
            # produced an invalid nested size. Accept any 2-sequence.
            if len(size) != 2:
                raise ValueError(
                    f"size should be tuple (height, width), instead got {size}"
                )
            self.size = tuple(size)
        self.interpolation_mode = interpolation_mode
        self.scale = scale
        self.ratio = ratio

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        Returns:
            torch.tensor: randomly cropped and resized video clip of size
            (C, T, *self.size).
        """
        i, j, h, w = self.get_params(clip, self.scale, self.ratio)
        return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}, scale={self.scale}, ratio={self.ratio})"
class CenterCropVideo:
    """Crop the spatial center of a video clip.

    Args:
        crop_size (int or sequence): output size ``(height, width)``; an int
            yields a square ``(crop_size, crop_size)``.
    """

    def __init__(self, crop_size):
        if isinstance(crop_size, numbers.Number):
            side = int(crop_size)
            self.crop_size = (side, side)
        else:
            self.crop_size = crop_size

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
        Returns:
            torch.tensor: centrally cropped clip of size
            (C, T, crop_size, crop_size).
        """
        return F.center_crop(clip, self.crop_size)

    def __repr__(self) -> str:
        return f"{type(self).__name__}(crop_size={self.crop_size})"
class NormalizeVideo:
    """Normalize a video clip channel-wise: subtract the mean, divide by the
    standard deviation.

    Args:
        mean (3-tuple): per-channel RGB mean.
        std (3-tuple): per-channel RGB standard deviation.
        inplace (bool): whether to normalize the clip in place.
    """

    def __init__(self, mean, std, inplace=False):
        self.mean = mean
        self.std = std
        self.inplace = inplace

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
        """
        return F.normalize(clip, self.mean, self.std, self.inplace)

    def __repr__(self) -> str:
        cls = type(self).__name__
        return f"{cls}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
class ToTensorVideo:
    """Convert a uint8 clip to float, scale values by 1/255, and permute the
    clip's dimensions from (T, H, W, C) to (C, T, H, W)."""

    def __init__(self):
        pass

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)
        Return:
            clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
        """
        return F.to_tensor(clip)

    def __repr__(self) -> str:
        return type(self).__name__
class RandomHorizontalFlipVideo:
    """Horizontally flip a video clip with a given probability.

    Args:
        p (float): probability of flipping the clip. Defaults to 0.5.
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (C, T, H, W)
        Return:
            clip (torch.tensor): Size is (C, T, H, W)
        """
        should_flip = random.random() < self.p
        return F.hflip(clip) if should_flip else clip

    def __repr__(self) -> str:
        return f"{type(self).__name__}(p={self.p})"