add Dinov2 backbone (#43)

tlpss · web-flow · commit 5db1bdf61519 · 2025-01-24T15:37:52.000+01:00
* add dinoV2 with linear head

* format

* improve dino decoder and freeze encoder

* improve naming in model docs

* fix deprecated mamba setup in pytest workflow

* fix typo

* try to fix dependency hell

* dependency merde

* invalidate cached env upon changes
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -14,11 +14,14 @@ jobs:
         run: |
           sudo apt-get install fonts-freefont-ttf
       - name: install conda env with micromamba
-        uses: mamba-org/provision-with-micromamba@main
+        uses: mamba-org/setup-micromamba@v2
         with:
           channel-priority: strict
           environment-file: environment.yaml
           cache-env: true
+          # key both caches on the hashes of environment.yaml and setup.py so they are invalidated when either file changes
+          cache-environment-key: environment-${{ steps.date.outputs.date }} -${{ hashFiles('environment.yaml') }} -${{ hashFiles('setup.py') }}
+          cache-downloads-key: downloads-${{ steps.date.outputs.date }} - ${{ hashFiles('environment.yaml') }} -${{ hashFiles('setup.py') }}
       - name: Conda list
         shell: bash -l {0}
         run: conda list
diff --git a/environment.yaml b/environment.yaml
@@ -8,8 +8,9 @@ dependencies:
   - pytorch=1.13
   - pytorch-cuda=11.7
   - torchvision
+  - mkl==2024.0 # pinned to work around a bug, see https://github.com/pytorch/pytorch/issues/123097
   - pip
   - pip:
     - wandb>=0.13.7 # quick fix, gh actions failed to install wandb https://github.com/tlpss/keypoint-detection/actions/runs/3204224778/jobs/5235259475
-    - setuptools==59.5.0
+    - setuptools==70.0
     - -e .
diff --git a/keypoint_detection/data/coco_dataset.py b/keypoint_detection/data/coco_dataset.py
@@ -117,7 +117,7 @@ def __getitem__(self, index) -> Tuple[torch.Tensor, IMG_KEYPOINTS_TYPE]:
         image = self.image_to_tensor_transform(image)
         return image, keypoints
 
-    def prepare_dataset(self):
+    def prepare_dataset(self):  # noqa: C901
         """Prepares the dataset to map from COCO to (img, [keypoints for each channel])
 
         Returns:
@@ -161,7 +161,11 @@ def prepare_dataset(self):
                 for semantic_type, keypoints in keypoint_dict.items():
                     for keypoint in keypoints:
 
-                        if min(keypoint[:2]) < 0 or keypoint[0] > img_dict[img_id].width or keypoint[1] > img_dict[img_id].height:
+                        if (
+                            min(keypoint[:2]) < 0
+                            or keypoint[0] > img_dict[img_id].width
+                            or keypoint[1] > img_dict[img_id].height
+                        ):
                             print("keypoint outside of image, ignoring.")
                             continue
                         if self.is_keypoint_visible(keypoint):
diff --git a/keypoint_detection/models/backbones/backbone_factory.py b/keypoint_detection/models/backbones/backbone_factory.py
@@ -4,6 +4,7 @@
 from keypoint_detection.models.backbones.base_backbone import Backbone
 from keypoint_detection.models.backbones.convnext_unet import ConvNeXtUnet
 from keypoint_detection.models.backbones.dilated_cnn import DilatedCnn
+from keypoint_detection.models.backbones.dinov2 import DinoV2Up
 from keypoint_detection.models.backbones.maxvit_unet import MaxVitPicoUnet, MaxVitUnet
 from keypoint_detection.models.backbones.mobilenetv3 import MobileNetV3
 from keypoint_detection.models.backbones.s3k import S3K
@@ -20,6 +21,7 @@ class BackboneFactory:
         S3K,
         DilatedCnn,
         MobileNetV3,
+        DinoV2Up,
     ]
 
     @staticmethod
diff --git a/keypoint_detection/models/backbones/dinov2.py b/keypoint_detection/models/backbones/dinov2.py
@@ -0,0 +1,133 @@
+import timm
+import torch
+import torch.nn as nn
+from torchvision.models.feature_extraction import create_feature_extractor
+from torchvision.transforms import Resize
+
+from keypoint_detection.models.backbones.base_backbone import Backbone
+
+
+class UpSamplingBlock(nn.Module):
+    """
+    A very basic upsampling block (its parameters have to be learnt from scratch, so it is kept small).
+
+    x --> up ---> conv1 --> norm -> relu
+
+    """
+
+    def __init__(self, n_channels_in, n_channels_out, kernel_size):
+        super().__init__()
+
+        self.conv1 = nn.Conv2d(
+            in_channels=n_channels_in,
+            out_channels=n_channels_out,
+            kernel_size=kernel_size,
+            bias=False,
+            padding="same",
+        )
+
+        self.norm1 = nn.BatchNorm2d(n_channels_out)
+        self.relu1 = nn.ReLU()
+
+    def forward(self, x):
+        # bilinear interpolation is not deterministic, so rely on interpolate's default nearest-neighbor mode instead
+        x = nn.functional.interpolate(x, scale_factor=2.0)
+        x = self.conv1(x)
+        x = self.norm1(x)
+        x = self.relu1(x)
+
+        # second conv as in original UNet upsampling block decreases performance
+        # probably because I was using a small dataset that did not have enough data to learn the extra parameters
+        return x
+
+
+class DinoV2Up(Backbone):
+    """
+    Backbone based on a frozen DINOv2 ViT-S model and a number of conv-based upsampling blocks to go from patch-level to pixel-level features.
+    Images are resized to 518x518 before being fed to the ViT.
+
+    The DINOv2 paper considers adding both a linear layer and a full-blown DPT head to the intermediate output of the last 4 blocks of the ViT.
+
+    This model can be considered as a simpler alternative to the DPT head that also aims to increase resolution of the features.
+
+    The upsampling blocks add about 6M params, bringing the total to about 28M params.
+    Only these blocks are trained; the DINOv2 model is frozen.
+
+    Dinov2 paper: https://arxiv.org/pdf/2304.07193#page=13.87
+    DPT paper: https://arxiv.org/abs/2103.13413
+
+
+    The head is most likely not the optimal architecture. Reducing the number of params in the decoder does not work for sure.
+    Unfreezing the DINOv2 model doesn't work either (for small datasets).
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.encoder = timm.create_model(
+            "vit_small_patch14_dinov2.lvd142m",
+            pretrained=True,
+            num_classes=0,  # remove classifier nn.Linear
+        )
+
+        # model-specific input transform; NOTE: only the resize is created here, no normalization is applied by this module
+        self.img_resizer = Resize((518, 518))  # specific to DinoV2 ViT
+
+        self.feature_extractor = create_feature_extractor(
+            self.encoder, ["blocks.8", "blocks.9", "blocks.10", "blocks.11"]
+        )
+
+        # freeze the feature extractor
+        for param in self.feature_extractor.parameters():
+            param.requires_grad = False
+
+        self.upsamplingblocks = nn.ModuleList(
+            [
+                UpSamplingBlock(4 * 384, 384, 3),
+                UpSamplingBlock(384, 192, 3),
+                UpSamplingBlock(192, 96, 3),
+                UpSamplingBlock(96, 96, 3),
+            ]
+        )
+
+    def forward(self, x):
+        orig_image_shape = x.shape[-2:]
+        x = self.img_resizer(x)
+        features = self.feature_extractor(x)  # one (B, 1370, 384) tensor per extracted block
+        features = list(features.values())
+        # concatenate the features
+        features = torch.cat(features, dim=2)
+        # drop the class token
+        features = features[:, 1:]  # (B, 1369, 4*384)
+
+        # reshape to (B, 37, 37, 4*384)
+        features = features.view(features.shape[0], 37, 37, -1)
+
+        # permute to (B, 4*384, 37, 37)
+        features = features.permute(0, 3, 1, 2)
+
+        # upsample 3 times 2x to 37*8 = 296
+        for i in range(3):
+            features = self.upsamplingblocks[i](features)
+
+        # resize to 518/2 = 259
+        features = nn.functional.interpolate(features, size=(259, 259))
+        # upsample final time to 518
+        features = self.upsamplingblocks[-1](features)
+
+        # now resize to original image shape
+        features = nn.functional.interpolate(features, size=orig_image_shape)
+        return features
+
+    def get_n_channels_out(self):
+        return 96
+
+
+if __name__ == "__main__":
+    model = DinoV2Up()
+
+    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"num trainable params = {num_params/10**6:.2f} M")
+
+    x = torch.zeros((1, 3, 512, 512))
+    y = model(x)
+    print(y.shape)
diff --git a/keypoint_detection/models/detector.py b/keypoint_detection/models/detector.py
@@ -159,7 +159,7 @@ def __init__(
         # this is for later reference (e.g. checkpoint loading) and consistency.
         self.save_hyperparameters(ignore=["**kwargs", "backbone"])
 
-        self._most_recent_val_mean_ap = 0.0 # used to store the most recent validation mean AP and log it in each epoch, so that checkpoint can be chosen based on this one.
+        self._most_recent_val_mean_ap = 0.0  # used to store the most recent validation mean AP and log it in each epoch, so that checkpoint can be chosen based on this one.
 
     def forward(self, x: torch.Tensor):
         """
@@ -306,7 +306,7 @@ def log_channel_predictions_grids(self, image_grids, mode: str):
         for channel_configuration, grid in zip(self.keypoint_channel_configuration, image_grids):
             label = get_logging_label_from_channel_configuration(channel_configuration, mode)
             image_caption = "top: predicted heatmaps, bottom: gt heatmaps"
-            self.logger.experiment.log({label: wandb.Image(grid, caption=image_caption,file_type="jpg")})
+            self.logger.experiment.log({label: wandb.Image(grid, caption=image_caption, file_type="jpg")})
 
     def visualize_predicted_keypoints(self, result_dict):
         images = result_dict["input_images"]
@@ -388,7 +388,7 @@ def log_and_reset_mean_ap(self, mode: str):
         self.log(f"{mode}/meanAP", mean_ap)
         self.log(f"{mode}/meanAP/meanAP", mean_ap)
 
-        if mode== "validation":
+        if mode == "validation":
             self._most_recent_val_mean_ap = mean_ap
 
     def training_epoch_end(self, outputs):
diff --git a/keypoint_detection/models/metrics.py b/keypoint_detection/models/metrics.py
@@ -11,7 +11,6 @@
 
 import torch
 from torchmetrics import Metric
-from torchmetrics.utilities import check_forward_full_state_property
 
 
 @dataclass
@@ -239,11 +238,35 @@ def _zero_aware_division(num: float, denom: float) -> float:
         return num / denom
 
 
-if __name__ == "__main__":
-    print(
-        check_forward_full_state_property(
-            KeypointAPMetric,
-            init_args={"keypoint_threshold_distance": 2.0},
-            input_args={"detected_keypoints": [DetectedKeypoint(10, 20, 0.02)], "gt_keypoints": [Keypoint(10, 23)]},
-        )
-    )
+# if __name__ == "__main__":
+#     print(
+#         check_forward_full_state_property(
+#             KeypointAPMetric,
+#             init_args={"keypoint_threshold_distance": 2.0},
+#             input_args={"detected_keypoints": [DetectedKeypoint(10, 20, 0.02)], "gt_keypoints": [Keypoint(10, 23)]},
+#         )
+#     )
+
+
+# if __name__ == "__main__":
+#     import numpy as np
+#     from sklearn.metrics import average_precision_score, precision_recall_curve
+#     import matplotlib.pyplot as plt
+
+#     y_true = np.array([1, 1, 0, 1,0,0,0,0])
+#     y_scores = np.array([0.1, 0.4, 0.35, 0.8,0.01,0.01,0.01,0.01])
+
+#     y_true = np.random.randint(0,2,100)
+#     y_scores = np.random.rand(100)
+#     sklearn_precisions, sklearn_recalls, _ = precision_recall_curve(y_true, y_scores)
+#     sklearnAP = average_precision_score(y_true, y_scores)
+
+#     print(f"sklearn AP: {sklearnAP}")
+#     my_precisions, my_recalls  = calculate_precision_recall([ClassifiedKeypoint(None,None,y_scores[i],None,y_true[i]) for i in range(len(y_true))], sum(y_true))
+#     myAP = calculate_ap_from_pr(my_precisions, my_recalls)
+#     print(f"my AP: {myAP}")
+
+#     plt.plot(sklearn_recalls, sklearn_precisions, label=f"sklearn AP: {sklearnAP}")
+#     plt.plot(my_recalls, my_precisions, label=f"my AP: {myAP}")
+#     plt.legend()
+#     plt.savefig("test.png")
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
         "pytest",
         "pre-commit",
         "scikit-image",
-        "albumentations",
+        "albumentations<2.0",  # >=2.0 requires a higher version of pydantic than wandb currently allows
         "matplotlib",
         "pydantic>=2.0.0",  # 2.0 has breaking changes
         "fiftyone",