stefanosgikas committed on
Commit fb56b04 · verified · 1 Parent(s): 333dbc8

Upload 2 files
architecture/efficientvit.py ADDED
import torch
import itertools

from timm.models.vision_transformer import trunc_normal_
from timm.models.layers import SqueezeExcite
from timm.models.registry import register_model

class Conv2d_BN(torch.nn.Sequential):
    def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1,
                 groups=1, bn_weight_init=1, resolution=-10000):
        super().__init__()
        self.add_module('c', torch.nn.Conv2d(
            a, b, ks, stride, pad, dilation, groups, bias=False))
        self.add_module('bn', torch.nn.BatchNorm2d(b))
        torch.nn.init.constant_(self.bn.weight, bn_weight_init)
        torch.nn.init.constant_(self.bn.bias, 0)

    @torch.no_grad()
    def fuse(self):
        # Fold the BatchNorm statistics into the convolution weights and bias.
        c, bn = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
        w = c.weight * w[:, None, None, None]
        b = bn.bias - bn.running_mean * bn.weight / \
            (bn.running_var + bn.eps) ** 0.5
        m = torch.nn.Conv2d(w.size(1) * self.c.groups, w.size(0), w.shape[2:],
                            stride=self.c.stride, padding=self.c.padding,
                            dilation=self.c.dilation, groups=self.c.groups)
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m

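# Illustrative sketch (assumption, not part of the original upload): in eval mode the fused
# Conv2d produced by Conv2d_BN.fuse() should match the Conv+BN pair it replaces.
def _demo_conv2d_bn_fuse():
    m = Conv2d_BN(8, 16, ks=3, stride=1, pad=1).eval()
    x = torch.randn(2, 8, 14, 14)
    fused = m.fuse()
    # The fused convolution matches the original module up to numerical precision.
    print(torch.allclose(m(x), fused(x), atol=1e-5))
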
class BN_Linear(torch.nn.Sequential):
    def __init__(self, a, b, bias=True, std=0.02):
        super().__init__()
        self.add_module('bn', torch.nn.BatchNorm1d(a))
        self.add_module('l', torch.nn.Linear(a, b, bias=bias))
        trunc_normal_(self.l.weight, std=std)
        if bias:
            torch.nn.init.constant_(self.l.bias, 0)

    @torch.no_grad()
    def fuse(self):
        bn, l = self._modules.values()
        w = bn.weight / (bn.running_var + bn.eps) ** 0.5
        b = bn.bias - self.bn.running_mean * \
            self.bn.weight / (bn.running_var + bn.eps) ** 0.5
        w = l.weight * w[None, :]
        if l.bias is None:
            b = b @ self.l.weight.T
        else:
            b = (l.weight @ b[:, None]).view(-1) + self.l.bias
        m = torch.nn.Linear(w.size(1), w.size(0))
        m.weight.data.copy_(w)
        m.bias.data.copy_(b)
        return m

class PatchMerging(torch.nn.Module):
    def __init__(self, dim, out_dim, input_resolution):
        super().__init__()
        hid_dim = int(dim * 4)
        self.conv1 = Conv2d_BN(dim, hid_dim, 1, 1, 0, resolution=input_resolution)
        self.act = torch.nn.ReLU()
        self.conv2 = Conv2d_BN(hid_dim, hid_dim, 3, 2, 1, groups=hid_dim, resolution=input_resolution)
        self.se = SqueezeExcite(hid_dim, .25)
        self.conv3 = Conv2d_BN(hid_dim, out_dim, 1, 1, 0, resolution=input_resolution // 2)

    def forward(self, x):
        x = self.conv3(self.se(self.act(self.conv2(self.act(self.conv1(x))))))
        return x

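# Illustrative sketch (assumption, not part of the original upload): PatchMerging halves the
# spatial resolution while mapping dim -> out_dim channels.
def _demo_patch_merging():
    pm = PatchMerging(dim=64, out_dim=128, input_resolution=14).eval()
    x = torch.randn(1, 64, 14, 14)
    print(pm(x).shape)  # expected: torch.Size([1, 128, 7, 7])
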
class Residual(torch.nn.Module):
    def __init__(self, m, drop=0.):
        super().__init__()
        self.m = m
        self.drop = drop

    def forward(self, x):
        if self.training and self.drop > 0:
            # Stochastic depth: randomly drop the residual branch per sample during training.
            return x + self.m(x) * torch.rand(x.size(0), 1, 1, 1,
                                              device=x.device).ge_(self.drop).div(1 - self.drop).detach()
        else:
            return x + self.m(x)

class FFN(torch.nn.Module):
    def __init__(self, ed, h, resolution):
        super().__init__()
        self.pw1 = Conv2d_BN(ed, h, resolution=resolution)
        self.act = torch.nn.ReLU()
        self.pw2 = Conv2d_BN(h, ed, bn_weight_init=0, resolution=resolution)

    def forward(self, x):
        x = self.pw2(self.act(self.pw1(x)))
        return x

class CascadedGroupAttention(torch.nn.Module):
    r""" Cascaded Group Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution, corresponding to the window size.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14,
                 kernels=[5, 5, 5, 5],):
        super().__init__()
        self.num_heads = num_heads
        self.scale = key_dim ** -0.5
        self.key_dim = key_dim
        self.d = int(attn_ratio * key_dim)
        self.attn_ratio = attn_ratio

        qkvs = []
        dws = []
        for i in range(num_heads):
            qkvs.append(Conv2d_BN(dim // (num_heads), self.key_dim * 2 + self.d, resolution=resolution))
            dws.append(Conv2d_BN(self.key_dim, self.key_dim, kernels[i], 1, kernels[i] // 2, groups=self.key_dim, resolution=resolution))
        self.qkvs = torch.nn.ModuleList(qkvs)
        self.dws = torch.nn.ModuleList(dws)
        self.proj = torch.nn.Sequential(torch.nn.ReLU(), Conv2d_BN(
            self.d * num_heads, dim, bn_weight_init=0, resolution=resolution))

        points = list(itertools.product(range(resolution), range(resolution)))
        N = len(points)
        attention_offsets = {}
        idxs = []
        for p1 in points:
            for p2 in points:
                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
                if offset not in attention_offsets:
                    attention_offsets[offset] = len(attention_offsets)
                idxs.append(attention_offsets[offset])
        self.attention_biases = torch.nn.Parameter(
            torch.zeros(num_heads, len(attention_offsets)))
        self.register_buffer('attention_bias_idxs',
                             torch.LongTensor(idxs).view(N, N))

    @torch.no_grad()
    def train(self, mode=True):
        super().train(mode)
        if mode and hasattr(self, 'ab'):
            del self.ab
        else:
            self.ab = self.attention_biases[:, self.attention_bias_idxs]

    def forward(self, x):  # x (B,C,H,W)
        B, C, H, W = x.shape
        trainingab = self.attention_biases[:, self.attention_bias_idxs]
        feats_in = x.chunk(len(self.qkvs), dim=1)
        feats_out = []
        feat = feats_in[0]
        for i, qkv in enumerate(self.qkvs):
            if i > 0:  # add the previous output to the input
                feat = feat + feats_in[i]
            feat = qkv(feat)
            q, k, v = feat.view(B, -1, H, W).split([self.key_dim, self.key_dim, self.d], dim=1)  # B, C/h, H, W
            q = self.dws[i](q)
            q, k, v = q.flatten(2), k.flatten(2), v.flatten(2)  # B, C/h, N
            attn = (
                (q.transpose(-2, -1) @ k) * self.scale
                +
                (trainingab[i] if self.training else self.ab[i])
            )
            attn = attn.softmax(dim=-1)  # BNN
            feat = (v @ attn.transpose(-2, -1)).view(B, self.d, H, W)  # BCHW
            feats_out.append(feat)
        x = self.proj(torch.cat(feats_out, 1))
        return x

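# Illustrative sketch (assumption, not part of the original upload): cascaded group attention
# splits the channels evenly across heads and preserves the spatial shape of its input.
def _demo_cascaded_group_attention():
    attn = CascadedGroupAttention(dim=64, key_dim=16, num_heads=4, resolution=7,
                                  kernels=[5, 5, 5, 5]).eval()
    x = torch.randn(1, 64, 7, 7)
    print(attn(x).shape)  # expected: torch.Size([1, 64, 7, 7])
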
class LocalWindowAttention(torch.nn.Module):
    r""" Local Window Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, dim, key_dim, num_heads=8,
                 attn_ratio=4,
                 resolution=14,
                 window_resolution=7,
                 kernels=[5, 5, 5, 5],):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.resolution = resolution
        assert window_resolution > 0, 'window_size must be greater than 0'
        self.window_resolution = window_resolution

        window_resolution = min(window_resolution, resolution)
        self.attn = CascadedGroupAttention(dim, key_dim, num_heads,
                                           attn_ratio=attn_ratio,
                                           resolution=window_resolution,
                                           kernels=kernels,)

    def forward(self, x):
        H = W = self.resolution
        B, C, H_, W_ = x.shape
        # Only check this for classification models
        assert H == H_ and W == W_, 'input feature has wrong size, expect {}, got {}'.format((H, W), (H_, W_))

        if H <= self.window_resolution and W <= self.window_resolution:
            x = self.attn(x)
        else:
            x = x.permute(0, 2, 3, 1)
            pad_b = (self.window_resolution - H %
                     self.window_resolution) % self.window_resolution
            pad_r = (self.window_resolution - W %
                     self.window_resolution) % self.window_resolution
            padding = pad_b > 0 or pad_r > 0

            if padding:
                x = torch.nn.functional.pad(x, (0, 0, 0, pad_r, 0, pad_b))

            pH, pW = H + pad_b, W + pad_r
            nH = pH // self.window_resolution
            nW = pW // self.window_resolution
            # window partition, BHWC -> B(nHh)(nWw)C -> BnHnWhwC -> (BnHnW)hwC -> (BnHnW)Chw
            x = x.view(B, nH, self.window_resolution, nW, self.window_resolution, C).transpose(2, 3).reshape(
                B * nH * nW, self.window_resolution, self.window_resolution, C
            ).permute(0, 3, 1, 2)
            x = self.attn(x)
            # window reverse, (BnHnW)Chw -> (BnHnW)hwC -> BnHnWhwC -> B(nHh)(nWw)C -> BHWC
            x = x.permute(0, 2, 3, 1).view(B, nH, nW, self.window_resolution, self.window_resolution,
                                           C).transpose(2, 3).reshape(B, pH, pW, C)
            if padding:
                x = x[:, :H, :W].contiguous()
            x = x.permute(0, 3, 1, 2)
        return x

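# Illustrative sketch (assumption, not part of the original upload): on a 14x14 feature map
# with a 7x7 window, the attention is computed inside four windows and the input shape is restored.
def _demo_local_window_attention():
    attn = LocalWindowAttention(dim=64, key_dim=16, num_heads=4,
                                resolution=14, window_resolution=7).eval()
    x = torch.randn(1, 64, 14, 14)
    print(attn(x).shape)  # expected: torch.Size([1, 64, 14, 14])
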
class EfficientViTBlock(torch.nn.Module):
    """ A basic EfficientViT building block.

    Args:
        type (str): Type for token mixer. Default: 's' for self-attention.
        ed (int): Number of input channels.
        kd (int): Dimension for query and key in the token mixer.
        nh (int): Number of attention heads.
        ar (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    """
    def __init__(self, type,
                 ed, kd, nh=8,
                 ar=4,
                 resolution=14,
                 window_resolution=7,
                 kernels=[5, 5, 5, 5],):
        super().__init__()

        self.dw0 = Residual(Conv2d_BN(ed, ed, 3, 1, 1, groups=ed, bn_weight_init=0., resolution=resolution))
        self.ffn0 = Residual(FFN(ed, int(ed * 2), resolution))

        if type == 's':
            self.mixer = Residual(LocalWindowAttention(ed, kd, nh, attn_ratio=ar,
                                                       resolution=resolution, window_resolution=window_resolution, kernels=kernels))

        self.dw1 = Residual(Conv2d_BN(ed, ed, 3, 1, 1, groups=ed, bn_weight_init=0., resolution=resolution))
        self.ffn1 = Residual(FFN(ed, int(ed * 2), resolution))

    def forward(self, x):
        return self.ffn1(self.dw1(self.mixer(self.ffn0(self.dw0(x)))))

class EfficientViT(torch.nn.Module):
    def __init__(self, img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=1000,
                 stages=['s', 's', 's'],
                 embed_dim=[64, 128, 192],
                 key_dim=[16, 16, 16],
                 depth=[1, 2, 3],
                 num_heads=[4, 4, 4],
                 window_size=[7, 7, 7],
                 kernels=[5, 5, 5, 5],
                 down_ops=[['subsample', 2], ['subsample', 2], ['']],
                 distillation=False,):
        super().__init__()

        resolution = img_size
        # Patch embedding
        self.patch_embed = torch.nn.Sequential(Conv2d_BN(in_chans, embed_dim[0] // 8, 3, 2, 1, resolution=resolution), torch.nn.ReLU(),
                                               Conv2d_BN(embed_dim[0] // 8, embed_dim[0] // 4, 3, 2, 1, resolution=resolution // 2), torch.nn.ReLU(),
                                               Conv2d_BN(embed_dim[0] // 4, embed_dim[0] // 2, 3, 2, 1, resolution=resolution // 4), torch.nn.ReLU(),
                                               Conv2d_BN(embed_dim[0] // 2, embed_dim[0], 3, 2, 1, resolution=resolution // 8))

        resolution = img_size // patch_size
        attn_ratio = [embed_dim[i] / (key_dim[i] * num_heads[i]) for i in range(len(embed_dim))]
        self.blocks1 = []
        self.blocks2 = []
        self.blocks3 = []

        # Build EfficientViT blocks
        for i, (stg, ed, kd, dpth, nh, ar, wd, do) in enumerate(
                zip(stages, embed_dim, key_dim, depth, num_heads, attn_ratio, window_size, down_ops)):
            for d in range(dpth):
                eval('self.blocks' + str(i + 1)).append(EfficientViTBlock(stg, ed, kd, nh, ar, resolution, wd, kernels))
            if do[0] == 'subsample':
                # Build EfficientViT downsample block
                # ('Subsample' stride)
                blk = eval('self.blocks' + str(i + 2))
                resolution_ = (resolution - 1) // do[1] + 1
                blk.append(torch.nn.Sequential(Residual(Conv2d_BN(embed_dim[i], embed_dim[i], 3, 1, 1, groups=embed_dim[i], resolution=resolution)),
                                               Residual(FFN(embed_dim[i], int(embed_dim[i] * 2), resolution)),))
                blk.append(PatchMerging(*embed_dim[i:i + 2], resolution))
                resolution = resolution_
                blk.append(torch.nn.Sequential(Residual(Conv2d_BN(embed_dim[i + 1], embed_dim[i + 1], 3, 1, 1, groups=embed_dim[i + 1], resolution=resolution)),
                                               Residual(FFN(embed_dim[i + 1], int(embed_dim[i + 1] * 2), resolution)),))
        self.blocks1 = torch.nn.Sequential(*self.blocks1)
        self.blocks2 = torch.nn.Sequential(*self.blocks2)
        self.blocks3 = torch.nn.Sequential(*self.blocks3)

        # Classification head
        self.head = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
        self.distillation = distillation
        if distillation:
            self.head_dist = BN_Linear(embed_dim[-1], num_classes) if num_classes > 0 else torch.nn.Identity()

    @torch.jit.ignore
    def no_weight_decay(self):
        return {x for x in self.state_dict().keys() if 'attention_biases' in x}

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.blocks1(x)
        x = self.blocks2(x)
        x = self.blocks3(x)
        x = torch.nn.functional.adaptive_avg_pool2d(x, 1).flatten(1)
        if self.distillation:
            x = self.head(x), self.head_dist(x)
            if not self.training:
                x = (x[0] + x[1]) / 2
        else:
            x = self.head(x)
        return x

# Model configuration dictionaries. The registered factory functions below reuse these names;
# the dicts are captured as default arguments before the names are rebound.
EfficientViT_d = {
    'img_size': 224,
    'patch_size': 16,
    'embed_dim': [96, 144, 400],   # 192, 288, 384
    'depth': [1, 3, 4],            # 1, 3, 4 ----------------- [1, 1, 2]
    'num_heads': [3, 3, 4],        # 3, 3, 4
    'window_size': [7, 7, 7],
    'kernels': [7, 5, 3, 3],
}

EfficientViT_w = {
    'img_size': 224,
    'patch_size': 16,
    'embed_dim': [192, 288, 96],   # 400 192
    'depth': [1, 1, 1],            # 1, 3, 4 ----------------- [1, 1, 2]
    'num_heads': [3, 3, 4],        # 3, 3, 4
    'window_size': [7, 7, 7],
    'kernels': [7, 5, 3, 3],
}

@register_model
def EfficientViT_d(num_classes=5, pretrained=False, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_d):
    model = EfficientViT(num_classes=num_classes, distillation=distillation, **model_cfg)
    if fuse:
        replace_batchnorm(model)
    return model


@register_model
def EfficientViT_w(num_classes=5, pretrained=False, distillation=False, fuse=False, pretrained_cfg=None, model_cfg=EfficientViT_w):
    model = EfficientViT(num_classes=num_classes, distillation=distillation, **model_cfg)
    if fuse:
        replace_batchnorm(model)
    return model

def replace_batchnorm(net):
    # Recursively fuse Conv+BN pairs and strip any remaining BatchNorm layers for inference.
    for child_name, child in net.named_children():
        if hasattr(child, 'fuse'):
            setattr(net, child_name, child.fuse())
        elif isinstance(child, torch.nn.BatchNorm2d):
            setattr(net, child_name, torch.nn.Identity())
        else:
            replace_batchnorm(child)
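
# Illustrative usage sketch (assumption, not part of the original upload): build the
# registered 5-class model, run a dummy batch, then fold BatchNorm for inference.
if __name__ == "__main__":
    net = EfficientViT_d(num_classes=5).eval()   # eval() so BatchNorm1d accepts a batch of one
    dummy = torch.randn(1, 3, 224, 224)
    print(net(dummy).shape)   # expected: torch.Size([1, 5])
    replace_batchnorm(net)    # fuse Conv+BN pairs in place
    print(net(dummy).shape)   # still torch.Size([1, 5])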
architecture/spectformer.py ADDED
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial

from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model
from timm.models.vision_transformer import _cfg
import math
import numpy as np
from pytorch_wavelets import DWTForward, DWTInverse  # (or import DWT, IDWT)

class SpectralGatingNetwork(nn.Module):
    def __init__(self, dim):
        super().__init__()
        # These learnable complex weights must match the spatial size of the stage that uses
        # them: h = H and w = W // 2 + 1 because of the one-sided rfft2.
        if dim == 64:   # 96 for large model, 64 for small and base model
            self.h = 56  # H
            self.w = 29  # (W/2)+1
            self.complex_weight = nn.Parameter(torch.randn(self.h, self.w, dim, 2, dtype=torch.float32) * 0.02)
        if dim == 128:
            self.h = 28  # H
            self.w = 15  # (W/2)+1, this is due to rfft2
            self.complex_weight = nn.Parameter(torch.randn(self.h, self.w, dim, 2, dtype=torch.float32) * 0.02)
        if dim == 96:   # 96 for large model, 64 for small and base model
            self.h = 56  # H
            self.w = 29  # (W/2)+1
            self.complex_weight = nn.Parameter(torch.randn(self.h, self.w, dim, 2, dtype=torch.float32) * 0.02)
        if dim == 192:
            self.h = 28  # H
            self.w = 15  # (W/2)+1, this is due to rfft2
            self.complex_weight = nn.Parameter(torch.randn(self.h, self.w, dim, 2, dtype=torch.float32) * 0.02)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.view(B, H, W, C)
        # Cast to float32: half-precision inputs make rfft2 fail with a dtype mismatch
        # against the float32 complex weights.
        x = x.to(torch.float32)
        x = torch.fft.rfft2(x, dim=(1, 2), norm='ortho')
        # Apply the learned complex-valued filter in the frequency domain.
        weight = torch.view_as_complex(self.complex_weight)
        x = x * weight
        x = torch.fft.irfft2(x, s=(H, W), dim=(1, 2), norm='ortho')
        x = x.reshape(B, N, C)  # reshape back to token form; permute is not the same as reshape/view
        return x

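# Illustrative sketch (assumption, not part of the original upload): the spectral gate is a
# token mixer that filters the 56x56 stage-1 feature map (dim 64) in the frequency domain
# and returns tokens of the same shape.
def _demo_spectral_gating():
    sgn = SpectralGatingNetwork(dim=64)
    x = torch.randn(2, 56 * 56, 64)   # B, N, C with N = H * W
    print(sgn(x, 56, 56).shape)       # expected: torch.Size([2, 3136, 64])
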
def rand_bbox(size, lam, scale=1):
    # Sample a random box for CutMix-style token mixing; `lam` controls the cut area.
    W = size[1] // scale
    H = size[2] // scale
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

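# Illustrative sketch (assumption, not part of the original upload): rand_bbox samples a
# random box inside the downscaled token grid; lam close to 1 gives a small cut region.
def _demo_rand_bbox():
    size = (2, 224, 224, 3)                   # B, H, W, C layout as indexed by rand_bbox
    print(rand_bbox(size, lam=0.8, scale=8))  # e.g. (bbx1, bby1, bbx2, bby2) within a 28x28 grid
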
class ClassAttention(nn.Module):
    def __init__(self, dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.head_dim = head_dim
        self.scale = head_dim ** -0.5
        self.kv = nn.Linear(dim, dim * 2)
        self.q = nn.Linear(dim, dim)
        self.proj = nn.Linear(dim, dim)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        B, N, C = x.shape
        kv = self.kv(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]
        q = self.q(x[:, :1, :]).reshape(B, self.num_heads, 1, self.head_dim)
        attn = ((q * self.scale) @ k.transpose(-2, -1))
        attn = attn.softmax(dim=-1)
        cls_embed = (attn @ v).transpose(1, 2).reshape(B, 1, self.head_dim * self.num_heads)
        cls_embed = self.proj(cls_embed)
        return cls_embed

class FFN(nn.Module):
    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x

class ClassBlock(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.norm2 = norm_layer(dim)
        self.attn = ClassAttention(dim, num_heads)
        self.mlp = FFN(dim, int(dim * mlp_ratio))
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        cls_embed = x[:, :1]
        cls_embed = cls_embed + self.attn(self.norm1(x))
        cls_embed = cls_embed + self.mlp(self.norm2(cls_embed))
        return torch.cat([cls_embed, x[:, 1:]], dim=1)

class PVT2FFN(nn.Module):
    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.fc2(x)
        return x

class Attention(nn.Module):
    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        B, N, C = x.shape
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        # Return the attention weights alongside the output so they can be inspected.
        return x, attn

class Block(nn.Module):
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 sr_ratio=1,
                 block_type='wave'
                 ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.norm2 = norm_layer(dim)

        if block_type == 'std_att':
            self.attn = Attention(dim, num_heads)
        else:
            self.attn = SpectralGatingNetwork(dim)
        self.mlp = PVT2FFN(in_features=dim, hidden_features=int(dim * mlp_ratio))
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    # Original forward (kept for reference), before attention maps were returned:
    # def forward(self, x, H, W):
    #     x = x + self.drop_path(self.attn(self.norm1(x), H, W))
    #     x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
    #     return x

    def forward(self, x, H, W):
        # Standard attention also returns its attention weights; the spectral gate returns a plain tensor.
        attn_output, attn_weights = self.attn(self.norm1(x), H, W) if isinstance(self.attn, Attention) else (self.attn(self.norm1(x), H, W), None)
        x = x + self.drop_path(attn_output)
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))

        # Optionally return attention weights for visualization or analysis
        return (x, attn_weights) if attn_weights is not None else x

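# Illustrative sketch (assumption, not part of the original upload): a 'wave' block returns a
# plain tensor, while a 'std_att' block also returns its attention map.
def _demo_block_types():
    # The FFT gate for dim 64 is sized for a 56x56 grid, so feed 56*56 tokens here.
    wave_blk = Block(dim=64, num_heads=2, mlp_ratio=8, block_type='wave')
    x = torch.randn(2, 56 * 56, 64)
    print(wave_blk(x, 56, 56).shape)   # expected: torch.Size([2, 3136, 64])
    # Plain self-attention works at any resolution and also returns the attention weights.
    att_blk = Block(dim=64, num_heads=2, mlp_ratio=8, block_type='std_att')
    y = torch.randn(2, 14 * 14, 64)
    out, attn = att_blk(y, 14, 14)
    print(out.shape, attn.shape)       # torch.Size([2, 196, 64]) torch.Size([2, 2, 196, 196])
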
class DownSamples(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.proj = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1)
        self.norm = nn.LayerNorm(out_channels)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x, H, W

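# Illustrative sketch (assumption, not part of the original upload): DownSamples halves H and W
# with a stride-2 convolution and returns flattened tokens plus the new spatial size.
def _demo_downsamples():
    ds = DownSamples(in_channels=64, out_channels=128)
    x = torch.randn(1, 64, 56, 56)
    tokens, H, W = ds(x)
    print(tokens.shape, H, W)  # expected: torch.Size([1, 784, 128]) 28 28
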
class Stem(nn.Module):
    def __init__(self, in_channels, stem_hidden_dim, out_channels):
        super().__init__()
        hidden_dim = stem_hidden_dim
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, hidden_dim, kernel_size=7, stride=2,
                      padding=3, bias=False),  # 112x112
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1,
                      padding=1, bias=False),  # 112x112
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(inplace=True),
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1,
                      padding=1, bias=False),  # 112x112
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU(inplace=True),
        )
        self.proj = nn.Conv2d(hidden_dim,
                              out_channels,
                              kernel_size=3,
                              stride=2,
                              padding=1)
        self.norm = nn.LayerNorm(out_channels)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        x = self.conv(x)
        x = self.proj(x)
        _, _, H, W = x.shape
        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
        return x, H, W

class SpectFormer(nn.Module):
    def __init__(self,
                 in_chans=3,
                 num_classes=1000,
                 stem_hidden_dim=32,
                 embed_dims=[64, 128, 320, 448],
                 num_heads=[2, 4, 10, 14],
                 mlp_ratios=[8, 8, 4, 4],
                 drop_path_rate=0.,
                 norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3],
                 sr_ratios=[4, 2, 1, 1],
                 num_stages=4,
                 token_label=False,
                 **kwargs
                 ):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        self.num_stages = num_stages

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0

        for i in range(num_stages):
            if i == 0:
                patch_embed = Stem(in_chans, stem_hidden_dim, embed_dims[i])
            else:
                patch_embed = DownSamples(embed_dims[i - 1], embed_dims[i])

            block = nn.ModuleList([Block(
                dim=embed_dims[i],
                num_heads=num_heads[i],
                mlp_ratio=mlp_ratios[i],
                drop_path=dpr[cur + j],
                norm_layer=norm_layer,
                sr_ratio=sr_ratios[i],
                block_type='wave' if i < 2 else 'std_att')
                for j in range(depths[i])])

            norm = norm_layer(embed_dims[i])
            cur += depths[i]

            setattr(self, f"patch_embed{i + 1}", patch_embed)
            setattr(self, f"block{i + 1}", block)
            setattr(self, f"norm{i + 1}", norm)

        post_layers = ['ca']
        self.post_network = nn.ModuleList([
            ClassBlock(
                dim=embed_dims[-1],
                num_heads=num_heads[-1],
                mlp_ratio=mlp_ratios[-1],
                norm_layer=norm_layer)
            for _ in range(len(post_layers))
        ])

        # classification head
        self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
        ##################################### token_label #####################################
        self.return_dense = token_label
        self.mix_token = token_label
        self.beta = 1.0
        self.pooling_scale = 8
        if self.return_dense:
            self.aux_head = nn.Linear(
                embed_dims[-1],
                num_classes) if num_classes > 0 else nn.Identity()
        ##################################### token_label #####################################

        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward_cls(self, x):
        B, N, C = x.shape
        cls_tokens = x.mean(dim=1, keepdim=True)
        x = torch.cat((cls_tokens, x), dim=1)
        for block in self.post_network:
            x = block(x)
        return x

    # Original forward_features (kept for reference), before attention maps were collected:
    # def forward_features(self, x):
    #     B = x.shape[0]
    #     for i in range(self.num_stages):
    #         patch_embed = getattr(self, f"patch_embed{i + 1}")
    #         block = getattr(self, f"block{i + 1}")
    #         x, H, W = patch_embed(x)
    #         for blk in block:
    #             x = blk(x, H, W)
    #         tokens = x
    #
    #         if i != self.num_stages - 1:
    #             norm = getattr(self, f"norm{i + 1}")
    #             x = norm(x)
    #             x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
    #
    #     x = self.forward_cls(x)[:, 0]
    #     norm = getattr(self, f"norm{self.num_stages}")
    #     x = norm(x)
    #     return x, tokens

    def forward_features(self, x):
        B = x.shape[0]
        attention_maps = []  # Collect attention maps if available
        tokens = None  # Initialize tokens to ensure scope coverage

        for i in range(self.num_stages):
            patch_embed = getattr(self, f"patch_embed{i + 1}")
            block = getattr(self, f"block{i + 1}")
            x, H, W = patch_embed(x)

            for blk in block:
                outputs = blk(x, H, W)
                if isinstance(outputs, tuple):
                    x, attn_weights = outputs
                    attention_maps.append(attn_weights)  # Store attention maps
                else:
                    x = outputs

            tokens = x  # Update tokens with the latest block output

            if i != self.num_stages - 1:
                norm = getattr(self, f"norm{i + 1}")
                x = norm(x)
                x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()

        x = self.forward_cls(x)[:, 0]  # Further processing for the classification token
        norm = getattr(self, f"norm{self.num_stages}")
        x = norm(x)
        return x, tokens, attention_maps

    # Original forward (kept for reference), before attention maps were collected:
    # def forward(self, x):
    #     if not self.return_dense:
    #         x, tokens = self.forward_features(x)
    #         x = self.head(x)
    #         return x, tokens
    #     else:
    #         x, H, W = self.forward_embeddings(x)
    #         # mix token, see token labeling for details.
    #         if self.mix_token and self.training:
    #             lam = np.random.beta(self.beta, self.beta)
    #             patch_h, patch_w = x.shape[1] // self.pooling_scale, x.shape[2] // self.pooling_scale
    #             bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam, scale=self.pooling_scale)
    #             temp_x = x.clone()
    #             sbbx1, sbby1, sbbx2, sbby2 = self.pooling_scale * bbx1, self.pooling_scale * bby1, \
    #                 self.pooling_scale * bbx2, self.pooling_scale * bby2
    #             temp_x[:, sbbx1:sbbx2, sbby1:sbby2, :] = x.flip(0)[:, sbbx1:sbbx2, sbby1:sbby2, :]
    #             x = temp_x
    #         else:
    #             bbx1, bby1, bbx2, bby2 = 0, 0, 0, 0
    #
    #         x = self.forward_tokens(x, H, W)
    #         x_cls = self.head(x[:, 0])
    #         x_aux = self.aux_head(
    #             x[:, 1:]
    #         )  # generate classes in all feature tokens, see token labeling
    #
    #         if not self.training:
    #             return x_cls + 0.5 * x_aux.max(1)[0]
    #
    #         if self.mix_token and self.training:  # reverse "mix token", see token labeling for details.
    #             x_aux = x_aux.reshape(x_aux.shape[0], patch_h, patch_w, x_aux.shape[-1])
    #             temp_x = x_aux.clone()
    #             temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :]
    #             x_aux = temp_x
    #             x_aux = x_aux.reshape(x_aux.shape[0], patch_h * patch_w, x_aux.shape[-1])
    #
    #         return x_cls, x_aux, (bbx1, bby1, bbx2, bby2)

    def forward(self, x):
        attention_maps = []  # Collect attention maps from all blocks

        if not self.return_dense:
            # Retrieve main output, tokens, and attention maps
            x, tokens, new_attention_maps = self.forward_features(x)
            attention_maps.extend(new_attention_maps)
            x = self.head(x)
            return x, tokens, attention_maps
        else:
            # Dense token labeling: embed first, optionally mix tokens on the spatial grid
            # (as in the reference implementation above), then run the transformer stages.
            x, H, W = self.forward_embeddings(x)

            if self.mix_token and self.training:
                lam = np.random.beta(self.beta, self.beta)
                patch_h, patch_w = x.shape[1] // self.pooling_scale, x.shape[2] // self.pooling_scale
                bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam, scale=self.pooling_scale)
                sbbx1, sbby1, sbbx2, sbby2 = self.pooling_scale * bbx1, self.pooling_scale * bby1, \
                    self.pooling_scale * bbx2, self.pooling_scale * bby2
                temp_x = x.clone()
                temp_x[:, sbbx1:sbbx2, sbby1:sbby2, :] = x.flip(0)[:, sbbx1:sbbx2, sbby1:sbby2, :]
                x = temp_x
            else:
                bbx1, bby1, bbx2, bby2 = 0, 0, 0, 0  # Default to zero if no mixing

            x, new_attention_maps = self.forward_tokens(x, H, W)
            attention_maps.extend(new_attention_maps)

            x_cls = self.head(x[:, 0])
            x_aux = self.aux_head(x[:, 1:])  # Class prediction for all feature tokens

            if not self.training:
                return x_cls + 0.5 * x_aux.max(1)[0], attention_maps

            return x_cls, x_aux, (bbx1, bby1, bbx2, bby2), attention_maps

    def forward_tokens(self, x, H, W):
        B = x.shape[0]
        attention_maps = []
        x = x.view(B, -1, x.size(-1))

        for i in range(self.num_stages):
            if i != 0:
                patch_embed = getattr(self, f"patch_embed{i + 1}")
                x, H, W = patch_embed(x)

            block = getattr(self, f"block{i + 1}")
            for blk in block:
                outputs = blk(x, H, W)
                if isinstance(outputs, tuple):
                    x, attn_weights = outputs
                    attention_maps.append(attn_weights)
                else:
                    x = outputs

            if i != self.num_stages - 1:
                norm = getattr(self, f"norm{i + 1}")
                x = norm(x)
                x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()

        x = self.forward_cls(x)
        norm = getattr(self, f"norm{self.num_stages}")
        x = norm(x)
        return x, attention_maps

    def forward_embeddings(self, x):
        patch_embed = getattr(self, f"patch_embed{0 + 1}")
        x, H, W = patch_embed(x)
        x = x.view(x.size(0), H, W, -1)
        return x, H, W

class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)
        return x

@register_model
def spectformer_t_d(pretrained=False, **kwargs):
    model = SpectFormer(
        stem_hidden_dim=32,
        embed_dims=[64, 128, 160, 400],   # 64, 128, 320, 448 ----- [64, 128, 160, 200]
        num_heads=[2, 4, 10, 16],         # 2, 4, 10, 16 ---------- [2, 4, 10, 10]
        mlp_ratios=[8, 8, 4, 4],
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        depths=[1, 2, 5, 2],              # 1, 2, 3, 1 ------------ [1, 1, 1, 1]
        sr_ratios=[4, 2, 1, 1],
        **kwargs)
    model.default_cfg = _cfg()
    return model

@register_model
def spectformer_t_w(pretrained=False, **kwargs):
    model = SpectFormer(
        stem_hidden_dim=32,
        embed_dims=[64, 128, 320, 96],    # 64, 128, 320, 448 ----- [64, 128, 160, 200]
        num_heads=[2, 4, 10, 16],         # 2, 4, 10, 16 ---------- [2, 4, 10, 10]
        mlp_ratios=[8, 8, 4, 4],
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        depths=[1, 1, 1, 1],              # 1, 2, 3, 1 ------------ [1, 1, 1, 1]
        sr_ratios=[4, 2, 1, 1],
        **kwargs)
    model.default_cfg = _cfg()
    return model
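
# Illustrative usage sketch (assumption, not part of the original upload): build the tiny
# registered variant and run a dummy batch; the forward pass returns logits, the final
# token map, and any collected attention maps.
if __name__ == "__main__":
    model = spectformer_t_d(num_classes=5).eval()
    dummy = torch.randn(1, 3, 224, 224)
    logits, tokens, attention_maps = model(dummy)
    print(logits.shape)         # expected: torch.Size([1, 5])
    print(tokens.shape)         # final-stage token map
    print(len(attention_maps))  # one entry per std_att block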