open-mmlab · BLUE-coconut · Aug 15, 2023 · Aug 15, 2023 · Aug 16, 2023 · Aug 16, 2023
diff --git a/projects/pixel_contrast_cross_entropy_loss/README.md b/projects/pixel_contrast_cross_entropy_loss/README.md
@@ -0,0 +1,37 @@
+# Pixel contrast cross entropy loss
+
+[Exploring Cross-Image Pixel Contrast for Semantic Segmentation](https://arxiv.org/pdf/2101.11939.pdf)
-[Exploring Cross-Image Pixel Contrast for Semantic Segmentation](https://arxiv.org/pdf/2101.11939.pdf)
+> [Exploring Cross-Image Pixel Contrast for Semantic Segmentation](https://arxiv.org/pdf/2101.11939.pdf)
-[Exploring Cross-Image Pixel Contrast for Semantic Segmentation](https://arxiv.org/pdf/2101.11939.pdf)
+> [Exploring Cross-Image Pixel Contrast for Semantic Segmentation](https://arxiv.org/pdf/2101.11939.pdf)
+
+## Description
+
+This is an implementation of the **pixel contrast cross entropy loss**.
+
+[Official Repo](https://github.com/tfzhou/ContrastiveSeg)
+
+## Abstract
+
+Current semantic segmentation methods focus only on mining “local” context, i.e., dependencies between pixels within individual images, by context-aggregation modules (e.g., dilated convolution, neural attention) or structure-aware optimization criteria (e.g., IoU-like loss). However, they ignore “global” context of the training data, i.e., rich semantic relations between pixels across different images. Inspired by the recent advance in unsupervised contrastive representation learning, we propose a pixel-wise contrastive framework for semantic segmentation in the fully supervised setting. The core idea is to enforce pixel embeddings belonging to a same semantic class to be more similar than embeddings from different classes. It raises a pixel-wise metric learning paradigm for semantic segmentation, by explicitly exploring the structures of labeled pixels, which are long ignored in the field. Our method can be effortlessly incorporated into existing segmentation frameworks without extra overhead during testing.
+
+We experimentally show that, with famous segmentation models (i.e., DeepLabV3, HRNet, OCR) and backbones (i.e., ResNet, HRNet), our method brings consistent performance improvements across diverse datasets (i.e., Cityscapes, PASCAL-Context, COCO-Stuff).
+
+## Usage
+
+Configs for HRNet-W18 and HRNet-W48 with pixel_contrast_cross_entropy_loss on the Cityscapes dataset are provided.
+
+After placing the Cityscapes dataset in the "mmsegmentation/data/" directory, train the network with:
+
+```shell
+python tools/train.py projects/pixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast48_4xb2-40k_cityscapes-512x1024.py
+```
+
+## Citation
+
+```bibtex
+@inproceedings{Wang_2021_ICCV,
+    author    = {Wang, Wenguan and Zhou, Tianfei and Yu, Fisher and Dai, Jifeng and Konukoglu, Ender and Van Gool, Luc},
+    title     = {Exploring Cross-Image Pixel Contrast for Semantic Segmentation},
+    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+    year      = {2021},
+    pages     = {7303-7313}
+}
+```
diff --git a/projects/pixel_contrast_cross_entropy_loss/__init__.py b/projects/pixel_contrast_cross_entropy_loss/__init__.py
@@ -0,0 +1,4 @@
+from .hrnetconstrast_head import ContrastHead
+from .pixel_contrast_cross_entropy_loss import PixelContrastCrossEntropyLoss
+
+__all__ = ['ContrastHead', 'PixelContrastCrossEntropyLoss']
diff --git a/projects/pixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast18.py b/projects/pixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast18.py
@@ -0,0 +1,86 @@
+# model settings: custom_imports loads the project's head and loss classes
+
+custom_imports = dict(imports=['projects.pixel_contrast_cross_entropy_loss'])
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+
+data_preprocessor = dict(
+    type='SegDataPreProcessor',
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_val=0,
+    seg_pad_val=255)
+model = dict(
+    type='EncoderDecoder',
+    data_preprocessor=data_preprocessor,
+    pretrained=None,  # NOTE(review): no checkpoint for W18 here — confirm
+    backbone=dict(
+        type='HRNet',
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(18, 36)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(18, 36, 72)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(18, 36, 72, 144)))),
+    decode_head=dict(
+        type='ContrastHead',
+        in_channels=[18, 36, 72, 144],  # same widths as backbone stage4
+        channels=sum([18, 36, 72, 144]),  # = 270, total of in_channels
+        num_classes=19,
+        in_index=(0, 1, 2, 3),
+        input_transform='resize_concat',
+        proj_n=256,
+        proj_mode='convmlp',
+        drop_p=0.1,
+        dropout_ratio=-1,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        seg_head=dict(  # nested FCNHead config passed to ContrastHead
+            type='FCNHead',
+            in_channels=[18, 36, 72, 144],
+            in_index=(0, 1, 2, 3),
+            channels=sum([18, 36, 72, 144]),
+            input_transform='resize_concat',
+            kernel_size=1,
+            num_convs=1,
+            concat_input=False,
+            dropout_ratio=-1,
+            num_classes=19,
+            norm_cfg=norm_cfg,
+            align_corners=False),
+        loss_decode=[  # contrast loss (weight 0.1) + cross entropy (1.0)
+            dict(
+                type='PixelContrastCrossEntropyLoss',
+                base_temperature=0.07,
+                temperature=0.1,
+                ignore_index=255,  # same value as seg_pad_val above
+                max_samples=1024,
+                max_views=100,
+                loss_weight=0.1),
+            dict(type='CrossEntropyLoss', loss_weight=1.0)
+        ]),
+
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))
diff --git a/...ixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast18_4xb2-40k_cityscapes-512x1024.py b/...ixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast18_4xb2-40k_cityscapes-512x1024.py
@@ -0,0 +1,16 @@
+_base_ = [
+    './fcn_hrcontrast18.py', '../../../configs/_base_/datasets/cityscapes.py',
+    '../../../configs/_base_/default_runtime.py',
+    '../../../configs/_base_/schedules/schedule_40k.py'
+]
+data_root = 'data/cityscapes/'  # used by all three dataloaders below
+# optimizer: SGD settings wrapped in an OptimWrapper, clip_grad left as None
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0002)
+optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None)
+
+train_dataloader = dict(dataset=dict(data_root=data_root))
+val_dataloader = dict(dataset=dict(data_root=data_root))
+test_dataloader = dict(dataset=dict(data_root=data_root))
+crop_size = (512, 1024)  # forwarded as the data preprocessor `size`
+data_preprocessor = dict(size=crop_size)
+model = dict(data_preprocessor=data_preprocessor)
diff --git a/...ixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast48_4xb2-40k_cityscapes-512x1024.py b/...ixel_contrast_cross_entropy_loss/configs/fcn_hrcontrast48_4xb2-40k_cityscapes-512x1024.py
@@ -0,0 +1,28 @@
+_base_ = './fcn_hrcontrast18_4xb2-40k_cityscapes-512x1024.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # used by seg_head below
+model = dict(
+    pretrained='open-mmlab://msra/hrnetv2_w48',
+    backbone=dict(
+        extra=dict(  # stage2-4 widths given here; stage1 is not listed
+            stage2=dict(num_channels=(48, 96)),
+            stage3=dict(num_channels=(48, 96, 192)),
+            stage4=dict(num_channels=(48, 96, 192, 384)))),
+    decode_head=dict(
+        type='ContrastHead',
+        in_channels=[48, 96, 192, 384],  # same widths as backbone stage4
+        channels=sum([48, 96, 192, 384]),  # = 720
+        proj_n=720,  # equals sum(in_channels) for this variant
+        seg_head=dict(
+            type='FCNHead',
+            in_channels=[48, 96, 192, 384],
+            in_index=(0, 1, 2, 3),
+            channels=sum([48, 96, 192, 384]),
+            input_transform='resize_concat',
+            kernel_size=1,
+            num_convs=1,
+            concat_input=False,
+            dropout_ratio=-1,
+            num_classes=19,
+            norm_cfg=norm_cfg,
+            align_corners=False),
+    ))