
Commit 9fbba4d

Refactor Vim backbone + Add more vision mamba configs
1 parent e7bdae0 commit 9fbba4d

File tree

3 files changed: +78 −25 lines changed


experiments/vision-mamba/run_livecell.py

+10 −4

@@ -11,14 +11,13 @@
 
 import torch_em
 from torch_em.util import segmentation
+from torch_em.model import get_vimunet_model
 from torch_em.transform.raw import standardize
 from torch_em.data.datasets import get_livecell_loader
 from torch_em.loss import DiceLoss, LossWrapper, ApplyAndRemoveMask, DiceBasedDistanceLoss
 
 from elf.evaluation import mean_segmentation_accuracy
 
-from vimunet import get_vimunet_model
-
 
 ROOT = "/scratch/usr/nimanwai"
 
@@ -128,7 +127,7 @@ def run_livecell_training(args):
     output_channels = get_output_channels(args)
 
     # the vision-mamba + decoder (UNet-based) model
-    model = get_vimunet_model(out_channels=output_channels, checkpoint=checkpoint)
+    model = get_vimunet_model(out_channels=output_channels, model_type=args.model_type, checkpoint=checkpoint)
 
     save_root = get_save_root(args)
 
@@ -160,7 +159,12 @@ def run_livecell_inference(args):
     checkpoint = os.path.join(save_root, "checkpoints", "livecell-vimunet", "best.pt")
 
     # the vision-mamba + decoder (UNet-based) model
-    model = get_vimunet_model(out_channels=output_channels, checkpoint=checkpoint)
+    model = get_vimunet_model(
+        out_channels=output_channels,
+        model_type=args.model_type,
+        with_cls_token=args.with_cls_token,
+        checkpoint=checkpoint
+    )
 
     test_image_dir = os.path.join(ROOT, "data", "livecell", "images", "livecell_test_images")
     all_test_labels = glob(os.path.join(ROOT, "data", "livecell", "annotations", "livecell_test_images", "*", "*"))
@@ -228,6 +232,8 @@ def main(args):
     parser.add_argument("-i", "--input", type=str, default=os.path.join(ROOT, "data", "livecell"))
     parser.add_argument("--iterations", type=int, default=1e4)
    parser.add_argument("-s", "--save_root", type=str, default=os.path.join(ROOT, "experiments", "vision-mamba"))
+    parser.add_argument("-m", "--model_type", type=str, default="vim_t")
+    parser.add_argument("--with_cls_token", action="store_true")
 
     parser.add_argument("--pretrained", action="store_true")
 

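The new `--model_type` and `--with_cls_token` flags select the Vision Mamba backbone size and whether a class token is used. A minimal sketch of how these flags feed into the refactored factory (the 3-channel output below is illustrative, not part of the commit):

import argparse

from torch_em.model import get_vimunet_model

parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model_type", type=str, default="vim_t")  # vim_t / vim_s / vim_b
parser.add_argument("--with_cls_token", action="store_true")
args = parser.parse_args()

# vision-mamba encoder + UNet-based decoder; checkpoint may point to a trained "best.pt"
model = get_vimunet_model(
    out_channels=3,  # illustrative choice, e.g. foreground / boundary / distance channels
    model_type=args.model_type,
    with_cls_token=args.with_cls_token,
    checkpoint=None,
)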
torch_em/model/__init__.py

+1 −0

@@ -2,3 +2,4 @@
 from .probabilistic_unet import ProbabilisticUNet
 from .unetr import UNETR
 from .vit import get_vision_transformer
+from .vim import get_vimunet_model
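Re-exporting the factory from `torch_em/model/__init__.py` lets scripts drop the local `from vimunet import get_vimunet_model` import. Assuming the Vision Mamba dependency (`vim.models_mamba`) is installed, the import now reads:

from torch_em.model import get_vimunet_model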

experiments/vision-mamba/vimunet.py → torch_em/model/vim.py (renamed)

+67 −21

@@ -6,7 +6,7 @@
 
 import torch
 
-from torch_em.model import UNETR
+from .unetr import UNETR
 
 from vim.models_mamba import VisionMamba, rms_norm_fn, RMSNorm, layer_norm_fn
 
@@ -40,7 +40,7 @@ def forward_features(self, x, inference_params=None):
         x = x + self.pos_embed
         x = self.pos_drop(x)
 
-        # mamba impl
+        # mamba implementation
         residual = None
         hidden_states = x
         for layer in self.layers:
@@ -61,7 +61,7 @@ def forward_features(self, x, inference_params=None):
             residual = residual + self.drop_path(hidden_states)
             hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
         else:
-            # Set prenorm=False here since we don't need the residual
+            # Set prenorm = False here since we don't need the residual
             fused_add_norm_fn = rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
             hidden_states = fused_add_norm_fn(
                 self.drop_path(hidden_states),
@@ -96,27 +96,73 @@ def forward(self, x, inference_params=None):
         return x  # from here, the tokens can be upsampled easily (N x H x W x C)
 
 
-def get_vimunet_model(out_channels, device=None, checkpoint=None):
+def get_vim_encoder(model_type="vim_t", with_cls_token=True):
+    if model_type == "vim_t":
+        # `vim_tiny_patch16_224_bimambav2_final_pool_mean_abs_pos_embed_rope_also_residual_with_cls_token`
+        # *has an imagenet pretrained model
+        encoder = ViM(
+            img_size=1024,
+            patch_size=16,
+            embed_dim=192,
+            depth=24,
+            rms_norm=True,
+            residual_in_fp32=True,
+            fused_add_norm=True,
+            final_pool_type='all',
+            if_abs_pos_embed=True,
+            if_rope=True,
+            if_rope_residual=True,
+            bimamba_type="v2",
+            if_cls_token=with_cls_token,
+        )
+    elif model_type == "vim_s":
+        # `vim_small_patch16_224_bimambav2_final_pool_mean_abs_pos_embed_rope_also_residual`
+        # AA: added a class token to the default models
+        encoder = ViM(
+            img_size=1024,
+            patch_size=16,
+            embed_dim=384,
+            depth=24,
+            rms_norm=True,
+            residual_in_fp32=True,
+            fused_add_norm=True,
+            final_pool_type='all',
+            if_abs_pos_embed=True,
+            if_rope=True,
+            if_rope_residual=True,
+            bimamba_type="v2",
+            if_cls_token=with_cls_token,
+        )
+    elif model_type == "vim_b":
+        # `vim_base_patch16_224_bimambav2_final_pool_mean_abs_pos_embed_rope_also_residual`
+        # AA: added a class token to the default models
+        encoder = ViM(
+            img_size=1024,
+            patch_size=16,
+            embed_dim=768,
+            depth=24,
+            rms_norm=True,
+            residual_in_fp32=True,
+            fused_add_norm=True,
+            final_pool_type='all',
+            if_abs_pos_embed=True,
+            if_rope=True,
+            if_rope_residual=True,
+            bimamba_type="v2",
+            if_cls_token=with_cls_token,
+        )
+    else:
+        raise ValueError("Choose from `vim_t` or `vim_b`")
+
+    encoder.default_cfg = _cfg()
+    return encoder
+
+
+def get_vimunet_model(out_channels, model_type="vim_t", with_cls_token=True, device=None, checkpoint=None):
     if device is None:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-    encoder = ViM(
-        img_size=1024,
-        patch_size=16,
-        embed_dim=192,
-        depth=24,
-        rms_norm=True,
-        residual_in_fp32=True,
-        fused_add_norm=True,
-        final_pool_type='all',
-        if_abs_pos_embed=True,
-        if_rope=True,
-        if_rope_residual=True,
-        bimamba_type="v2",
-        if_cls_token=True,
-    )
-
-    encoder.default_cfg = _cfg()
+    encoder = get_vim_encoder(model_type, with_cls_token)
 
     model_state = None
     if checkpoint is not None:

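After the refactor, `get_vim_encoder` builds the backbone alone and `get_vimunet_model` wraps it with the UNETR-based decoder; the three configs differ only in `embed_dim` (192 for `vim_t`, 384 for `vim_s`, 768 for `vim_b`), all with depth 24 and patch size 16. A short usage sketch, assuming the `vim` package (Vision Mamba) and its mamba kernels are installed; the single output channel is illustrative:

from torch_em.model.vim import get_vim_encoder
from torch_em.model import get_vimunet_model

# backbone only, e.g. to compare parameter counts of the new configs
encoder = get_vim_encoder(model_type="vim_s", with_cls_token=True)

# full segmentation model: ViM encoder + UNet-based decoder
model = get_vimunet_model(out_channels=1, model_type="vim_s", with_cls_token=True)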