Tim77777767 committed
Commit 85ebba9 · Parent(s): e98bd8c

Re-added .bin with Git LFS, fixed tracking
Files changed:
- .gitattributes +3 -0
- .gitignore +2 -0
- build/lib/segformer_plusplus/__init__.py +0 -4
- build/lib/segformer_plusplus/build_model.py +0 -108
- build/lib/segformer_plusplus/configs/__init__.py +0 -1
- build/lib/segformer_plusplus/configs/segformer_mit_b0.py +0 -28
- build/lib/segformer_plusplus/configs/segformer_mit_b1.py +0 -8
- build/lib/segformer_plusplus/configs/segformer_mit_b2.py +0 -6
- build/lib/segformer_plusplus/configs/segformer_mit_b3.py +0 -6
- build/lib/segformer_plusplus/configs/segformer_mit_b4.py +0 -6
- build/lib/segformer_plusplus/configs/segformer_mit_b5.py +0 -6
- build/lib/segformer_plusplus/model/__init__.py +0 -1
- build/lib/segformer_plusplus/model/backbone/__init__.py +0 -3
- build/lib/segformer_plusplus/model/backbone/mit.py +0 -479
- build/lib/segformer_plusplus/model/head/__init__.py +0 -3
- build/lib/segformer_plusplus/model/head/segformer_head.py +0 -95
- build/lib/segformer_plusplus/random_benchmark.py +0 -61
- build/lib/segformer_plusplus/utils/__init__.py +0 -12
- build/lib/segformer_plusplus/utils/benchmark.py +0 -76
- build/lib/segformer_plusplus/utils/embed.py +0 -330
- build/lib/segformer_plusplus/utils/imagenet_weights.py +0 -8
- build/lib/segformer_plusplus/utils/registry.py +0 -6
- build/lib/segformer_plusplus/utils/shape_convert.py +0 -107
- build/lib/segformer_plusplus/utils/tome_presets.py +0 -20
- build/lib/segformer_plusplus/utils/wrappers.py +0 -51
- segformer_plusplus.egg-info/SOURCES.txt +10 -0
- segformer_plusplus.egg-info/requires.txt +3 -0
- segformer_plusplus/cityscape/berlin_000543_000019_leftImg8bit.png +3 -0
- segformer_plusplus/cityscape_benchmark.py +3 -11
- cityscapes_prediction_output_reference.txt → segformer_plusplus/cityscapes_prediction_output.txt +0 -0
- segformer_plusplus/cityscapes_prediction_output_reference_b05_nocheckpoint.txt +0 -0
- segformer_plusplus/config.json +10 -0
- segformer_plusplus/configs/config/utils.py +129 -2
- segformer_plusplus/modeling_segformer_plusplus.py +69 -0
- segformer_plusplus/pytorch_model.bin +3 -0
- segformer_plusplus/start_cityscape_benchmark.py +6 -4
- setup.py +12 -1
.gitattributes ADDED
@@ -0,0 +1,3 @@
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
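For context on the "fixed tracking" part of the commit message: a file that Git LFS tracks is stored in the repository as a small text pointer rather than the raw binary. Below is a minimal sketch (not part of this commit) for checking that a given file really is a pointer; it assumes it runs on a checkout where LFS smudging is disabled (e.g. cloned with GIT_LFS_SKIP_SMUDGE=1), since a normal LFS-enabled checkout replaces pointers with the real binaries.

```python
from pathlib import Path

# Git LFS pointer files are plain text and begin with this spec line;
# a raw .bin committed before tracking was fixed would not have it.
LFS_POINTER_HEADER = b"version https://git-lfs.github.com/spec/v1"


def is_lfs_pointer(path: Path) -> bool:
    """Return True if `path` is stored as a Git LFS pointer file."""
    with path.open("rb") as f:
        return f.read(len(LFS_POINTER_HEADER)) == LFS_POINTER_HEADER


print(is_lfs_pointer(Path("segformer_plusplus/pytorch_model.bin")))
```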
.gitignore CHANGED
@@ -10,3 +10,5 @@ __pycache__/
 .vscode/
 .idea/
 .DS_Store
+build/
+venv/
build/lib/segformer_plusplus/__init__.py DELETED
@@ -1,4 +0,0 @@
-from .build_model import create_model, create_custom_model
-from .random_benchmark import random_benchmark
-
-__all__ = ['create_model', 'create_custom_model', 'random_benchmark']
build/lib/segformer_plusplus/build_model.py DELETED
@@ -1,108 +0,0 @@
-import os
-
-from mmengine import registry
-from mmengine.config import Config
-from mmengine.model import BaseModule
-
-from .utils import MODELS, imagenet_weights
-from .utils import tome_presets
-
-
-class SegFormer(BaseModule):
-    """
-    This class represents a SegFormer model that allows for the application of token merging.
-
-    Attributes:
-        backbone (BaseModule): MixVisionTransformer backbone
-        decode_head (BaseModule): SegFormer head
-
-    """
-    def __init__(self, cfg):
-        """
-        Initialize the SegFormer model.
-
-        Args:
-            cfg (Config): an mmengine Config object, which defines the backbone, head and token merging strategy used.
-
-        """
-        super().__init__()
-        self.backbone = registry.build_model_from_cfg(cfg.backbone, registry=MODELS)
-        self.decode_head = registry.build_model_from_cfg(cfg.decode_head, registry=MODELS)
-
-    def forward(self, x):
-        """
-        Forward pass of the model.
-
-        Args:
-            x (torch.Tensor): input tensor of shape [B, C, H, W]
-
-        Returns:
-            torch.Tensor: output tensor
-
-        """
-        x = self.backbone(x)
-        x = self.decode_head(x)
-        return x
-
-
-def create_model(
-        backbone: str = 'b0',
-        tome_strategy: str = None,
-        out_channels: int = 19,
-        pretrained: bool = False,
-):
-    """
-    Create a SegFormer model using the predefined SegFormer backbones from the MiT series (b0-b5).
-
-    Args:
-        backbone (str): backbone name (e.g. 'b0')
-        tome_strategy (str | list(dict)): select strategy from presets ('bsm_hq', 'bsm_fast', 'n2d_2x2') or define a
-            custom strategy using a list, that contains of dictionaries, in which the strategies for the stage are
-            defined
-        out_channels (int): number of output channels (e.g. 19 for the cityscapes semantic segmentation task)
-        pretrained: use pretrained (imagenet) weights
-
-    Returns:
-        BaseModule: SegFormer model
-
-    """
-    backbone = backbone.lower()
-    assert backbone in [f'b{i}' for i in range(6)]
-
-    wd = os.path.dirname(os.path.abspath(__file__))
-
-    cfg = Config.fromfile(os.path.join(wd, 'configs', f'segformer_mit_{backbone}.py'))
-
-    cfg.decode_head.out_channels = out_channels
-
-    if tome_strategy is not None:
-        if tome_strategy not in list(tome_presets.keys()):
-            print("Using custom merging strategy.")
-        cfg.backbone.tome_cfg = tome_presets[tome_strategy]
-
-    # load imagenet weights
-    if pretrained:
-        cfg.backbone.init_cfg = dict(type='Pretrained', checkpoint=imagenet_weights[backbone])
-
-    return SegFormer(cfg)
-
-
-def create_custom_model(
-        model_cfg: Config,
-        tome_strategy: list[dict] = None,
-):
-    """
-    Create a SegFormer model with customizable backbone and head.
-
-    Args:
-        model_cfg (Config): backbone name (e.g. 'b0')
-        tome_strategy (list(dict)): custom token merging strategy
-
-    Returns:
-        BaseModule: SegFormer model
-
-    """
-    if tome_strategy is not None:
-        model_cfg.backbone.tome_cfg = tome_strategy
-
-    return SegFormer(model_cfg)
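The build/ tree deleted above was a stale copy of the installed package; the canonical sources remain under segformer_plusplus/. As a hedged sketch of how the deleted API was invoked (names taken from the signatures above, assuming the package is installed and its __init__.py still exports create_model as this copy did): note that this create_model indexes tome_presets with the given strategy, so custom per-stage lists are meant to go through create_custom_model instead.

```python
import torch
from segformer_plusplus import create_model

# MiT-b0 backbone, 'bsm_hq' token-merging preset, 19 Cityscapes classes,
# initialised from the ImageNet checkpoints listed in imagenet_weights.py.
model = create_model(backbone='b0', tome_strategy='bsm_hq',
                     out_channels=19, pretrained=True)

x = torch.rand(1, 3, 1024, 1024)  # [B, C, H, W]
out = model(x)                    # per-pixel class logits from the decode head
```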
build/lib/segformer_plusplus/configs/__init__.py DELETED
@@ -1 +0,0 @@
-__all__ = []
build/lib/segformer_plusplus/configs/segformer_mit_b0.py DELETED
@@ -1,28 +0,0 @@
-norm_cfg = dict(type='SyncBN', requires_grad=True)
-backbone = dict(
-    type='MixVisionTransformer',
-    in_channels=3,
-    embed_dims=32,
-    num_stages=4,
-    num_layers=[2, 2, 2, 2],
-    num_heads=[1, 2, 5, 8],
-    patch_sizes=[7, 3, 3, 3],
-    sr_ratios=[8, 4, 2, 1],
-    out_indices=(0, 1, 2, 3),
-    mlp_ratio=4,
-    qkv_bias=True,
-    drop_rate=0.0,
-    attn_drop_rate=0.0,
-    drop_path_rate=0.1
-)
-decode_head = dict(
-    type='SegformerHead',
-    in_channels=[32, 64, 160, 256],
-    in_index=[0, 1, 2, 3],
-    channels=256,
-    dropout_ratio=0.1,
-    out_channels=19,
-    norm_cfg=norm_cfg,
-    align_corners=False,
-    interpolate_mode='bilinear'
-)
build/lib/segformer_plusplus/configs/segformer_mit_b1.py DELETED
@@ -1,8 +0,0 @@
-_base_ = ['./segformer_mit_b0.py']
-
-backbone = dict(
-    embed_dims=64,
-)
-decode_head = dict(
-    in_channels=[64, 128, 320, 512]
-)
build/lib/segformer_plusplus/configs/segformer_mit_b2.py DELETED
@@ -1,6 +0,0 @@
-_base_ = ['./segformer_mit_b1.py']
-
-backbone = dict(
-    embed_dims=64,
-    num_layers=[3, 4, 6, 3]
-)
build/lib/segformer_plusplus/configs/segformer_mit_b3.py DELETED
@@ -1,6 +0,0 @@
-_base_ = ['./segformer_mit_b1.py']
-
-backbone = dict(
-    embed_dims=64,
-    num_layers=[3, 4, 18, 3]
-)
build/lib/segformer_plusplus/configs/segformer_mit_b4.py DELETED
@@ -1,6 +0,0 @@
-_base_ = ['./segformer_mit_b1.py']
-
-backbone = dict(
-    embed_dims=64,
-    num_layers=[3, 8, 27, 3]
-)
build/lib/segformer_plusplus/configs/segformer_mit_b5.py DELETED
@@ -1,6 +0,0 @@
-_base_ = ['./segformer_mit_b1.py']
-
-backbone = dict(
-    embed_dims=64,
-    num_layers=[3, 6, 40, 3]
-)
build/lib/segformer_plusplus/model/__init__.py DELETED
@@ -1 +0,0 @@
-__all__ = []
build/lib/segformer_plusplus/model/backbone/__init__.py DELETED
@@ -1,3 +0,0 @@
-from .mit import MixVisionTransformer
-
-__all__ = ['MixVisionTransformer']
build/lib/segformer_plusplus/model/backbone/mit.py DELETED
@@ -1,479 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint as cp
-from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
-from mmcv.cnn.bricks.drop import build_dropout
-from mmcv.cnn.bricks.transformer import MultiheadAttention
-from mmengine.model import BaseModule, ModuleList, Sequential
-from mmengine.model.weight_init import (constant_init, normal_init,
-                                        trunc_normal_init)
-from tomesd.merge import bipartite_soft_matching_random2d
-
-from ...utils import PatchEmbed
-from ...utils import nchw_to_nlc, nlc_to_nchw
-from ...utils import MODELS
-
-class MixFFN(BaseModule):
-    """An implementation of MixFFN of Segformer.
-
-    The differences between MixFFN & FFN:
-        1. Use 1X1 Conv to replace Linear layer.
-        2. Introduce 3X3 Conv to encode positional information.
-    Args:
-        embed_dims (int): The feature dimension. Same as
-            `MultiheadAttention`. Defaults: 256.
-        feedforward_channels (int): The hidden dimension of FFNs.
-            Defaults: 1024.
-        act_cfg (dict, optional): The activation config for FFNs.
-            Default: dict(type='ReLU')
-        ffn_drop (float, optional): Probability of an element to be
-            zeroed in FFN. Default 0.0.
-        dropout_layer (obj:`ConfigDict`): The dropout_layer used
-            when adding the shortcut.
-        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
-            Default: None.
-    """
-
-    def __init__(self,
-                 embed_dims,
-                 feedforward_channels,
-                 act_cfg=dict(type='GELU'),
-                 ffn_drop=0.,
-                 dropout_layer=None,
-                 init_cfg=None):
-        super().__init__(init_cfg)
-
-        self.embed_dims = embed_dims
-        self.feedforward_channels = feedforward_channels
-        self.act_cfg = act_cfg
-        self.activate = build_activation_layer(act_cfg)
-
-        in_channels = embed_dims
-        fc1 = Conv2d(
-            in_channels=in_channels,
-            out_channels=feedforward_channels,
-            kernel_size=1,
-            stride=1,
-            bias=True)
-        # 3x3 depth wise conv to provide positional encode information
-        pe_conv = Conv2d(
-            in_channels=feedforward_channels,
-            out_channels=feedforward_channels,
-            kernel_size=3,
-            stride=1,
-            padding=(3 - 1) // 2,
-            bias=True,
-            groups=feedforward_channels)
-        fc2 = Conv2d(
-            in_channels=feedforward_channels,
-            out_channels=in_channels,
-            kernel_size=1,
-            stride=1,
-            bias=True)
-        drop = nn.Dropout(ffn_drop)
-        layers = [fc1, pe_conv, self.activate, drop, fc2, drop]
-        self.layers = Sequential(*layers)
-        self.dropout_layer = build_dropout(
-            dropout_layer) if dropout_layer else torch.nn.Identity()
-
-    def forward(self, x, hw_shape, identity=None):
-        out = nlc_to_nchw(x, hw_shape)
-        out = self.layers(out)
-        out = nchw_to_nlc(out)
-        if identity is None:
-            identity = x
-        return identity + self.dropout_layer(out)
-
-
-class EfficientMultiheadAttention(MultiheadAttention):
-    """An implementation of Efficient Multi-head Attention of Segformer.
-
-    This module is modified from MultiheadAttention which is a module from
-    mmcv.cnn.bricks.transformer.
-    Args:
-        embed_dims (int): The embedding dimension.
-        num_heads (int): Parallel attention heads.
-        attn_drop (float): A Dropout layer on attn_output_weights.
-            Default: 0.0.
-        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
-            Default: 0.0.
-        dropout_layer (obj:`ConfigDict`): The dropout_layer used
-            when adding the shortcut. Default: None.
-        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
-            Default: None.
-        batch_first (bool): Key, Query and Value are shape of
-            (batch, n, embed_dim)
-            or (n, batch, embed_dim). Default: False.
-        qkv_bias (bool): enable bias for qkv if True. Default True.
-        norm_cfg (dict): Config dict for normalization layer.
-            Default: dict(type='LN').
-        sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
-            Attention of Segformer. Default: 1.
-    """
-
-    def __init__(self,
-                 embed_dims,
-                 num_heads,
-                 attn_drop=0.,
-                 proj_drop=0.,
-                 dropout_layer=None,
-                 init_cfg=None,
-                 batch_first=True,
-                 qkv_bias=False,
-                 tome_cfg=dict(),
-                 norm_cfg=dict(type='LN'),
-                 sr_ratio=1):
-        super().__init__(
-            embed_dims,
-            num_heads,
-            attn_drop,
-            proj_drop,
-            dropout_layer=dropout_layer,
-            init_cfg=init_cfg,
-            batch_first=batch_first,
-            bias=qkv_bias)
-
-        self.q_mode = tome_cfg.get('q_mode')
-        self.kv_mode = tome_cfg.get('kv_mode')
-        self.tome_cfg = tome_cfg
-
-        self.sr_ratio = sr_ratio
-        if sr_ratio > 1:
-            self.sr = Conv2d(
-                in_channels=embed_dims,
-                out_channels=embed_dims,
-                kernel_size=sr_ratio,
-                stride=sr_ratio)
-            # The ret[0] of build_norm_layer is norm name.
-            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
-
-    def forward(self, x, hw_shape, identity=None):
-        x_q = x
-
-        if self.sr_ratio > 1:
-            x_kv = nlc_to_nchw(x, hw_shape)
-            x_kv = self.sr(x_kv)
-            x_kv = nchw_to_nlc(x_kv)
-            x_kv = self.norm(x_kv)
-        else:
-            x_kv = x
-
-        # 2D Neighbour Merging KV
-        if self.kv_mode == 'n2d':
-            kv_hw_shape = (int(hw_shape[0] / self.sr_ratio), int(hw_shape[1] / self.sr_ratio))
-            x_kv = nlc_to_nchw(x_kv, kv_hw_shape)
-            x_kv = torch.nn.functional.avg_pool2d(x_kv, kernel_size=self.tome_cfg['kv_s'],
-                                                  stride=self.tome_cfg['kv_s'],
-                                                  ceil_mode=True)
-            x_kv = nchw_to_nlc(x_kv)
-
-        # Bipartite Soft Matching (tomesd) KV
-        if self.kv_mode == 'bsm':
-            w_kv = int(hw_shape[1] / self.sr_ratio)
-            h_kv = int(hw_shape[0] / self.sr_ratio)
-            merge, unmerge = bipartite_soft_matching_random2d(metric=x_kv, w=w_kv, h=h_kv,
-                                                              r=int(x_kv.size()[1] * self.tome_cfg['kv_r']),
-                                                              sx=self.tome_cfg['kv_sx'], sy=self.tome_cfg['kv_sy'],
-                                                              no_rand=True)
-            x_kv = merge(x_kv)
-
-        if identity is None:
-            identity = x_q
-
-        # 1D Neighbor Merging Q
-        if self.q_mode == 'n1d':
-            x_q = x_q.transpose(-2, -1)
-            x_q = torch.nn.functional.avg_pool1d(x_q, kernel_size=self.tome_cfg['q_s'],
-                                                 stride=self.tome_cfg['q_s'],
-                                                 ceil_mode=True)
-            x_q = x_q.transpose(-2, -1)
-
-        # 2D Neighbor Merging Q
-        if self.q_mode == 'n2d':
-            reduced_hw = (int(torch.ceil(torch.tensor(hw_shape[0] / self.tome_cfg['q_s'][0]))),
-                          int(torch.ceil(torch.tensor(hw_shape[1] / self.tome_cfg['q_s'][1]))))
-            x_q = nlc_to_nchw(x_q, hw_shape)
-            x_q = torch.nn.functional.avg_pool2d(x_q, kernel_size=self.tome_cfg['q_s'],
-                                                 stride=self.tome_cfg['q_s'],
-                                                 ceil_mode=True)
-            x_q = nchw_to_nlc(x_q)
-
-        # Bipartite Soft Matching (tomesd) Q
-        if self.q_mode == 'bsm':
-            merge, unmerge = bipartite_soft_matching_random2d(metric=x_q, w=hw_shape[1], h=hw_shape[0],
-                                                              r=int(x_q.size()[1] * self.tome_cfg['q_r']),
-                                                              sx=self.tome_cfg['q_sx'], sy=self.tome_cfg['q_sy'],
-                                                              no_rand=True)
-            x_q = merge(x_q)
-
-        # Because the dataflow('key', 'query', 'value') of
-        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
-        # embed_dims), We should adjust the shape of dataflow from
-        # batch_first (batch, num_query, embed_dims) to num_query_first
-        # (num_query ,batch, embed_dims), and recover ``attn_output``
-        # from num_query_first to batch_first.
-
-        if self.batch_first:
-            x_q = x_q.transpose(0, 1)
-            x_kv = x_kv.transpose(0, 1)
-        out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
-        if self.batch_first:
-            out = out.transpose(0, 1)
-
-        # Unmerging BSM (tome+tomesd)
-        if self.q_mode == 'bsm':
-            out = unmerge(out)
-
-        # Unmerging 1D Neighbour Merging
-        if self.q_mode == 'n1d':
-            out = out.transpose(-2, -1)
-            out = torch.nn.functional.interpolate(out, size=identity.size()[-2])
-            out = out.transpose(-2, -1)
-
-        # Unmerging 2D Neighbor Merging
-        if self.q_mode == 'n2d':
-            out = nlc_to_nchw(out, reduced_hw)
-            out = torch.nn.functional.interpolate(out, size=hw_shape)
-            out = nchw_to_nlc(out)
-
-        return identity + self.dropout_layer(self.proj_drop(out))
-
-
-class TransformerEncoderLayer(BaseModule):
-    """Implements one encoder layer in Segformer.
-
-    Args:
-        embed_dims (int): The feature dimension.
-        num_heads (int): Parallel attention heads.
-        feedforward_channels (int): The hidden dimension for FFNs.
-        drop_rate (float): Probability of an element to be zeroed.
-            after the feed forward layer. Default 0.0.
-        attn_drop_rate (float): The drop out rate for attention layer.
-            Default 0.0.
-        drop_path_rate (float): stochastic depth rate. Default 0.0.
-        qkv_bias (bool): enable bias for qkv if True.
-            Default: True.
-        act_cfg (dict): The activation config for FFNs.
-            Default: dict(type='GELU').
-        norm_cfg (dict): Config dict for normalization layer.
-            Default: dict(type='LN').
-        batch_first (bool): Key, Query and Value are shape of
-            (batch, n, embed_dim)
-            or (n, batch, embed_dim). Default: False.
-        init_cfg (dict, optional): Initialization config dict.
-            Default:None.
-        sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
-            Attention of Segformer. Default: 1.
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save
-            some memory while slowing down the training speed. Default: False.
-    """
-
-    def __init__(self,
-                 embed_dims,
-                 num_heads,
-                 feedforward_channels,
-                 drop_rate=0.,
-                 attn_drop_rate=0.,
-                 drop_path_rate=0.,
-                 qkv_bias=True,
-                 tome_cfg=dict(),
-                 act_cfg=dict(type='GELU'),
-                 norm_cfg=dict(type='LN'),
-                 batch_first=True,
-                 sr_ratio=1,
-                 with_cp=False):
-        super().__init__()
-
-        # The ret[0] of build_norm_layer is norm name.
-        self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
-
-        self.attn = EfficientMultiheadAttention(
-            embed_dims=embed_dims,
-            num_heads=num_heads,
-            attn_drop=attn_drop_rate,
-            proj_drop=drop_rate,
-            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
-            batch_first=batch_first,
-            qkv_bias=qkv_bias,
-            tome_cfg=tome_cfg,
-            norm_cfg=norm_cfg,
-            sr_ratio=sr_ratio)
-
-        # The ret[0] of build_norm_layer is norm name.
-        self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
-
-        self.ffn = MixFFN(
-            embed_dims=embed_dims,
-            feedforward_channels=feedforward_channels,
-            ffn_drop=drop_rate,
-            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
-            act_cfg=act_cfg)
-
-        self.with_cp = with_cp
-
-    def forward(self, x, hw_shape):
-
-        def _inner_forward(x):
-            x = self.attn(self.norm1(x), hw_shape, identity=x)
-            x = self.ffn(self.norm2(x), hw_shape, identity=x)
-            return x
-
-        if self.with_cp and x.requires_grad:
-            x = cp.checkpoint(_inner_forward, x)
-        else:
-            x = _inner_forward(x)
-        return x
-
-
-@MODELS.register_module()
-class MixVisionTransformer(BaseModule):
-    """The backbone of Segformer.
-
-    This backbone is the implementation of `SegFormer: Simple and
-    Efficient Design for Semantic Segmentation with
-    Transformers <https://arxiv.org/abs/2105.15203>`_.
-    Args:
-        in_channels (int): Number of input channels. Default: 3.
-        embed_dims (int): Embedding dimension. Default: 768.
-        num_stags (int): The num of stages. Default: 4.
-        num_layers (Sequence[int]): The layer number of each transformer encode
-            layer. Default: [3, 4, 6, 3].
-        num_heads (Sequence[int]): The attention heads of each transformer
-            encode layer. Default: [1, 2, 4, 8].
-        patch_sizes (Sequence[int]): The patch_size of each overlapped patch
-            embedding. Default: [7, 3, 3, 3].
-        strides (Sequence[int]): The stride of each overlapped patch embedding.
-            Default: [4, 2, 2, 2].
-        sr_ratios (Sequence[int]): The spatial reduction rate of each
-            transformer encode layer. Default: [8, 4, 2, 1].
-        out_indices (Sequence[int] | int): Output from which stages.
-            Default: (0, 1, 2, 3).
-        mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
-            Default: 4.
-        qkv_bias (bool): Enable bias for qkv if True. Default: True.
-        drop_rate (float): Probability of an element to be zeroed.
-            Default 0.0
-        attn_drop_rate (float): The drop out rate for attention layer.
-            Default 0.0
-        drop_path_rate (float): stochastic depth rate. Default 0.0
-        norm_cfg (dict): Config dict for normalization layer.
-            Default: dict(type='LN')
-        act_cfg (dict): The activation config for FFNs.
-            Default: dict(type='GELU').
-        pretrained (str, optional): model pretrained path. Default: None.
-        init_cfg (dict or list[dict], optional): Initialization config dict.
-            Default: None.
-        with_cp (bool): Use checkpoint or not. Using checkpoint will save
-            some memory while slowing down the training speed. Default: False.
-    """
-
-    def __init__(self,
-                 in_channels=3,
-                 embed_dims=64,
-                 num_stages=4,
-                 num_layers=[3, 4, 6, 3],
-                 num_heads=[1, 2, 4, 8],
-                 patch_sizes=[7, 3, 3, 3],
-                 strides=[4, 2, 2, 2],
-                 sr_ratios=[8, 4, 2, 1],
-                 out_indices=(0, 1, 2, 3),
-                 mlp_ratio=4,
-                 qkv_bias=True,
-                 drop_rate=0.,
-                 attn_drop_rate=0.,
-                 drop_path_rate=0.,
-                 tome_cfg=[dict(), dict(), dict(), dict()],
-                 act_cfg=dict(type='GELU'),
-                 norm_cfg=dict(type='LN', eps=1e-6),
-                 init_cfg=None,
-                 with_cp=False,
-                 down_sample=False):
-        super().__init__(init_cfg=init_cfg)
-
-        self.embed_dims = embed_dims
-        self.num_stages = num_stages
-        self.num_layers = num_layers
-        self.num_heads = num_heads
-        self.patch_sizes = patch_sizes
-        self.strides = strides
-        self.sr_ratios = sr_ratios
-        self.with_cp = with_cp
-        self.down_sample = down_sample
-        assert num_stages == len(num_layers) == len(num_heads) \
-            == len(patch_sizes) == len(strides) == len(sr_ratios)
-
-        self.out_indices = out_indices
-        assert max(out_indices) < self.num_stages
-
-        # transformer encoder
-        dpr = [
-            x.item()
-            for x in torch.linspace(0, drop_path_rate, sum(num_layers))
-        ]  # stochastic num_layer decay rule
-
-        cur = 0
-        self.layers = ModuleList()
-        for i, num_layer in enumerate(num_layers):
-            embed_dims_i = embed_dims * num_heads[i]
-            patch_embed = PatchEmbed(
-                in_channels=in_channels,
-                embed_dims=embed_dims_i,
-                kernel_size=patch_sizes[i],
-                stride=strides[i],
-                padding=patch_sizes[i] // 2,
-                norm_cfg=norm_cfg)
-            layer = ModuleList([
-                TransformerEncoderLayer(
-                    embed_dims=embed_dims_i,
-                    num_heads=num_heads[i],
-                    feedforward_channels=mlp_ratio * embed_dims_i,
-                    drop_rate=drop_rate,
-                    attn_drop_rate=attn_drop_rate,
-                    drop_path_rate=dpr[cur + idx],
-                    qkv_bias=qkv_bias,
-                    tome_cfg=tome_cfg[i],
-                    act_cfg=act_cfg,
-                    norm_cfg=norm_cfg,
-                    with_cp=with_cp,
-                    sr_ratio=sr_ratios[i]) for idx in range(num_layer)
-            ])
-            in_channels = embed_dims_i
-            # The ret[0] of build_norm_layer is norm name.
-            norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
-            self.layers.append(ModuleList([patch_embed, layer, norm]))
-            cur += num_layer
-
-    def init_weights(self):
-        if self.init_cfg is None:
-            for m in self.modules():
-                if isinstance(m, nn.Linear):
-                    trunc_normal_init(m, std=.02, bias=0.)
-                elif isinstance(m, nn.LayerNorm):
-                    constant_init(m, val=1.0, bias=0.)
-                elif isinstance(m, nn.Conv2d):
-                    fan_out = m.kernel_size[0] * m.kernel_size[
-                        1] * m.out_channels
-                    fan_out //= m.groups
-                    normal_init(
-                        m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
-        else:
-            super().init_weights()
-
-    def forward(self, x):
-        if self.down_sample:
-            x = torch.nn.functional.interpolate(x, scale_factor=(0.5, 0.5))
-        outs = []
-
-        for i, layer in enumerate(self.layers):
-            x, hw_shape = layer[0](x)
-            for block in layer[1]:
-                x = block(x, hw_shape)
-            x = layer[2](x)
-            x = nlc_to_nchw(x, hw_shape)
-            if i in self.out_indices:
-                outs.append(x)
-
-        return outs
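To make the deleted backbone's contract concrete: forward() returns one NCHW feature map per stage listed in out_indices, at strides 4/8/16/32. A minimal sketch using the b0 hyperparameters from the configs in this diff; the import path assumes the canonical (non-build) copy of the module is importable.

```python
import torch
from segformer_plusplus.model.backbone.mit import MixVisionTransformer

# b0-sized backbone (hyperparameters from segformer_mit_b0.py above).
backbone = MixVisionTransformer(
    in_channels=3, embed_dims=32, num_stages=4,
    num_layers=[2, 2, 2, 2], num_heads=[1, 2, 5, 8],
    patch_sizes=[7, 3, 3, 3], sr_ratios=[8, 4, 2, 1],
    out_indices=(0, 1, 2, 3))

feats = backbone(torch.rand(1, 3, 512, 512))
# Four maps at 1/4, 1/8, 1/16, 1/32 resolution with 32/64/160/256 channels
# (per-stage width is embed_dims * num_heads[i]).
for f in feats:
    print(tuple(f.shape))
```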
build/lib/segformer_plusplus/model/head/__init__.py DELETED
@@ -1,3 +0,0 @@
-from .segformer_head import SegformerHead
-
-__all__ = ['SegformerHead']
build/lib/segformer_plusplus/model/head/segformer_head.py DELETED
@@ -1,95 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-import torch.nn as nn
-from mmcv.cnn import ConvModule
-from mmengine.model import BaseModule
-
-from ...utils import MODELS
-from ...utils import resize
-
-
-@MODELS.register_module()
-class SegformerHead(BaseModule):
-    """The all mlp Head of segformer.
-
-    This head is the implementation of
-    `Segformer <https://arxiv.org/abs/2105.15203>` _.
-
-    Args:
-        interpolate_mode: The interpolate mode of MLP head upsample operation.
-            Default: 'bilinear'.
-    """
-
-    def __init__(self,
-                 in_channels=[32, 64, 160, 256],
-                 in_index=[0, 1, 2, 3],
-                 channels=256,
-                 dropout_ratio=0.1,
-                 out_channels=19,
-                 norm_cfg=None,
-                 align_corners=False,
-                 interpolate_mode='bilinear'):
-        super().__init__()
-
-        self.in_channels = in_channels
-        self.in_index = in_index
-        self.channels = channels
-        self.dropout_ratio = dropout_ratio
-        self.out_channels = out_channels
-        self.norm_cfg = norm_cfg
-        self.align_corners = align_corners
-        self.interpolate_mode = interpolate_mode
-
-        self.act_cfg = dict(type='ReLU')
-        self.conv_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1)
-        if dropout_ratio > 0:
-            self.dropout = nn.Dropout2d(dropout_ratio)
-        else:
-            self.dropout = None
-
-        num_inputs = len(self.in_channels)
-
-        assert num_inputs == len(self.in_index)
-
-        self.convs = nn.ModuleList()
-        for i in range(num_inputs):
-            self.convs.append(
-                ConvModule(
-                    in_channels=self.in_channels[i],
-                    out_channels=self.channels,
-                    kernel_size=1,
-                    stride=1,
-                    norm_cfg=self.norm_cfg,
-                    act_cfg=self.act_cfg))
-
-        self.fusion_conv = ConvModule(
-            in_channels=self.channels * num_inputs,
-            out_channels=self.channels,
-            kernel_size=1,
-            norm_cfg=self.norm_cfg)
-
-    def cls_seg(self, feat):
-        """Classify each pixel."""
-        if self.dropout is not None:
-            feat = self.dropout(feat)
-        output = self.conv_seg(feat)
-        return output
-
-    def forward(self, inputs):
-        # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
-        outs = []
-        for idx in range(len(inputs)):
-            x = inputs[idx]
-            conv = self.convs[idx]
-            outs.append(
-                resize(
-                    input=conv(x),
-                    size=inputs[0].shape[2:],
-                    mode=self.interpolate_mode,
-                    align_corners=self.align_corners))
-
-        out = self.fusion_conv(torch.cat(outs, dim=1))
-
-        out = self.cls_seg(out)
-
-        return out
build/lib/segformer_plusplus/random_benchmark.py DELETED
@@ -1,61 +0,0 @@
-from typing import Union, List, Tuple
-
-import numpy as np
-import torch
-
-from .utils import benchmark
-
-device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
-
-def random_benchmark(
-        model: torch.nn.Module,
-        batch_size: Union[int, List[int]] = 1,
-        image_size: Union[Tuple[int], List[Tuple[int]]] = (3, 1024, 1024),
-):
-    """
-    Calculate the FPS of a given model using randomly generated tensors.
-
-    Args:
-        model: instance of a model (e.g. SegFormer)
-        batch_size: the batch size(s) at which to calculate the FPS (e.g. 1 or [1, 2, 4])
-        image_size: the size of the images to use (e.g. (3, 1024, 1024))
-
-    Returns: the FPS values calculated for all image sizes and batch sizes in the form of a dictionary
-
-    """
-    if isinstance(batch_size, int):
-        batch_size = [batch_size]
-    if isinstance(image_size, tuple):
-        image_size = [image_size]
-
-    values = {}
-    throughput_values = []
-
-    for i in image_size:
-        # fill with fps for each batch size
-        fps = []
-        for b in batch_size:
-            for _ in range(4):
-                # Baseline benchmark
-                if i[1] >= 1024:
-                    r = 16
-                else:
-                    r = 32
-                baseline_throughput = benchmark(
-                    model.to(device),
-                    device=device,
-                    verbose=True,
-                    runs=r,
-                    batch_size=b,
-                    input_size=i
-                )
-                throughput_values.append(baseline_throughput)
-            throughput_values = np.asarray(throughput_values)
-            throughput = np.around(np.mean(throughput_values), decimals=2)
-            print('Im_size:', i, 'Batch_size:', b, 'Mean:', throughput, 'Std:',
-                  np.around(np.std(throughput_values), decimals=2))
-            throughput_values = []
-            fps.append({b: throughput})
-        values[i] = fps
-    return values
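A short usage sketch for the deleted random_benchmark, based on the signature above (assumes a model instance such as one from create_model). It averages four benchmark runs per (image size, batch size) pair and returns a dict keyed by image-size tuple.

```python
from segformer_plusplus import create_model, random_benchmark

model = create_model('b0')
# Returns {image_size: [{batch_size: mean_fps}, ...]} and prints mean/std per pair.
fps = random_benchmark(model, batch_size=[1, 2], image_size=(3, 1024, 1024))
print(fps)
```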
build/lib/segformer_plusplus/utils/__init__.py DELETED
@@ -1,12 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .embed import PatchEmbed
-from .shape_convert import nchw_to_nlc, nlc_to_nchw
-from .wrappers import resize
-from .tome_presets import tome_presets
-from .registry import MODELS
-from .imagenet_weights import imagenet_weights
-from .benchmark import benchmark
-
-__all__ = [
-    'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'resize', 'tome_presets', 'MODELS', 'imagenet_weights', 'benchmark'
-]
build/lib/segformer_plusplus/utils/benchmark.py DELETED
@@ -1,76 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# Source: https://github.com/facebookresearch/ToMe/blob/main/tome/utils.py
-# --------------------------------------------------------
-
-import time
-from typing import Tuple
-
-import torch
-from tqdm import tqdm
-
-
-def benchmark(
-        model: torch.nn.Module,
-        device: torch.device = 0,
-        input_size: Tuple[int] = (3, 224, 224),
-        batch_size: int = 64,
-        runs: int = 40,
-        throw_out: float = 0.25,
-        use_fp16: bool = False,
-        verbose: bool = False,
-) -> float:
-    """
-    Benchmark the given model with random inputs at the given batch size.
-
-    Args:
-     - model: the module to benchmark
-     - device: the device to use for benchmarking
-     - input_size: the input size to pass to the model (channels, h, w)
-     - batch_size: the batch size to use for evaluation
-     - runs: the number of total runs to do
-     - throw_out: the percentage of runs to throw out at the start of testing
-     - use_fp16: whether or not to benchmark with float16 and autocast
-     - verbose: whether or not to use tqdm to print progress / print throughput at end
-
-    Returns:
-     - the throughput measured in images / second
-    """
-    if not isinstance(device, torch.device):
-        device = torch.device(device)
-    is_cuda = torch.device(device).type == "cuda"
-
-    model = model.eval().to(device)
-    input = torch.rand(batch_size, *input_size, device=device)
-    if use_fp16:
-        input = input.half()
-
-    warm_up = int(runs * throw_out)
-    total = 0
-    start = time.time()
-
-    with torch.autocast(device.type, enabled=use_fp16):
-        with torch.no_grad():
-            for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
-                if i == warm_up:
-                    if is_cuda:
-                        torch.cuda.synchronize()
-                    total = 0
-                    start = time.time()
-
-                model(input)
-                total += batch_size
-
-            if is_cuda:
-                torch.cuda.synchronize()
-
-    end = time.time()
-    elapsed = end - start
-
-    throughput = total / elapsed
-
-    if verbose:
-        print(f"Throughput: {throughput:.2f} im/s")
-
-    return throughput
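The ToMe-derived benchmark above measures raw throughput for a single configuration, discarding the first throw_out fraction of runs as warm-up. A hedged usage sketch (import path assumes the canonical package copy; the Conv2d is only a stand-in model so the snippet is self-contained):

```python
import torch
from segformer_plusplus.utils import benchmark

model = torch.nn.Conv2d(3, 19, kernel_size=3, padding=1)  # stand-in for a real model
imgs_per_s = benchmark(
    model,
    device='cuda:0' if torch.cuda.is_available() else 'cpu',  # str is coerced to torch.device
    input_size=(3, 512, 512), batch_size=2, runs=20, verbose=True)
print(f"{imgs_per_s:.1f} images/s")
```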
build/lib/segformer_plusplus/utils/embed.py DELETED
@@ -1,330 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import math
-from typing import Sequence
-
-import torch.nn as nn
-import torch.nn.functional as F
-from mmcv.cnn import build_conv_layer, build_norm_layer
-from mmengine.model import BaseModule
-from mmengine.utils import to_2tuple
-
-
-class AdaptivePadding(nn.Module):
-    """Applies padding to input (if needed) so that input can get fully covered
-    by filter you specified. It supports two modes "same" and "corner". The
-    "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
-    input. The "corner" mode would pad zero to bottom right.
-
-    Args:
-        kernel_size (int | tuple): Size of the kernel:
-        stride (int | tuple): Stride of the filter. Default: 1:
-        dilation (int | tuple): Spacing between kernel elements.
-            Default: 1.
-        padding (str): Support "same" and "corner", "corner" mode
-            would pad zero to bottom right, and "same" mode would
-            pad zero around input. Default: "corner".
-    Example:
-        >>> kernel_size = 16
-        >>> stride = 16
-        >>> dilation = 1
-        >>> input = torch.rand(1, 1, 15, 17)
-        >>> adap_pad = AdaptivePadding(
-        >>>     kernel_size=kernel_size,
-        >>>     stride=stride,
-        >>>     dilation=dilation,
-        >>>     padding="corner")
-        >>> out = adap_pad(input)
-        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
-        >>> input = torch.rand(1, 1, 16, 17)
-        >>> out = adap_pad(input)
-        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
-    """
-
-    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
-
-        super().__init__()
-
-        assert padding in ('same', 'corner')
-
-        kernel_size = to_2tuple(kernel_size)
-        stride = to_2tuple(stride)
-        dilation = to_2tuple(dilation)
-
-        self.padding = padding
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.dilation = dilation
-
-    def get_pad_shape(self, input_shape):
-        input_h, input_w = input_shape
-        kernel_h, kernel_w = self.kernel_size
-        stride_h, stride_w = self.stride
-        output_h = math.ceil(input_h / stride_h)
-        output_w = math.ceil(input_w / stride_w)
-        pad_h = max((output_h - 1) * stride_h +
-                    (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
-        pad_w = max((output_w - 1) * stride_w +
-                    (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
-        return pad_h, pad_w
-
-    def forward(self, x):
-        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
-        if pad_h > 0 or pad_w > 0:
-            if self.padding == 'corner':
-                x = F.pad(x, [0, pad_w, 0, pad_h])
-            elif self.padding == 'same':
-                x = F.pad(x, [
-                    pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
-                    pad_h - pad_h // 2
-                ])
-        return x
-
-
-class PatchEmbed(BaseModule):
-    """Image to Patch Embedding.
-
-    We use a conv layer to implement PatchEmbed.
-
-    Args:
-        in_channels (int): The num of input channels. Default: 3
-        embed_dims (int): The dimensions of embedding. Default: 768
-        conv_type (str): The config dict for embedding
-            conv layer type selection. Default: "Conv2d".
-        kernel_size (int): The kernel_size of embedding conv. Default: 16.
-        stride (int, optional): The slide stride of embedding conv.
-            Default: None (Would be set as `kernel_size`).
-        padding (int | tuple | string ): The padding length of
-            embedding conv. When it is a string, it means the mode
-            of adaptive padding, support "same" and "corner" now.
-            Default: "corner".
-        dilation (int): The dilation rate of embedding conv. Default: 1.
-        bias (bool): Bias of embed conv. Default: True.
-        norm_cfg (dict, optional): Config dict for normalization layer.
-            Default: None.
-        input_size (int | tuple | None): The size of input, which will be
-            used to calculate the out size. Only work when `dynamic_size`
-            is False. Default: None.
-        init_cfg (`mmengine.ConfigDict`, optional): The Config for
-            initialization. Default: None.
-    """
-
-    def __init__(self,
-                 in_channels=3,
-                 embed_dims=768,
-                 conv_type='Conv2d',
-                 kernel_size=16,
-                 stride=None,
-                 padding='corner',
-                 dilation=1,
-                 bias=True,
-                 norm_cfg=None,
-                 input_size=None,
-                 init_cfg=None):
-        super().__init__(init_cfg=init_cfg)
-
-        self.embed_dims = embed_dims
-        if stride is None:
-            stride = kernel_size
-
-        kernel_size = to_2tuple(kernel_size)
-        stride = to_2tuple(stride)
-        dilation = to_2tuple(dilation)
-
-        if isinstance(padding, str):
-            self.adap_padding = AdaptivePadding(
-                kernel_size=kernel_size,
-                stride=stride,
-                dilation=dilation,
-                padding=padding)
-            # disable the padding of conv
-            padding = 0
-        else:
-            self.adap_padding = None
-        padding = to_2tuple(padding)
-
-        self.projection = build_conv_layer(
-            dict(type=conv_type),
-            in_channels=in_channels,
-            out_channels=embed_dims,
-            kernel_size=kernel_size,
-            stride=stride,
-            padding=padding,
-            dilation=dilation,
-            bias=bias)
-
-        if norm_cfg is not None:
-            self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
-        else:
-            self.norm = None
-
-        if input_size:
-            input_size = to_2tuple(input_size)
-            # `init_out_size` would be used outside to
-            # calculate the num_patches
-            # when `use_abs_pos_embed` outside
-            self.init_input_size = input_size
-            if self.adap_padding:
-                pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
-                input_h, input_w = input_size
-                input_h = input_h + pad_h
-                input_w = input_w + pad_w
-                input_size = (input_h, input_w)
-
-            # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
-            h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
-                     (kernel_size[0] - 1) - 1) // stride[0] + 1
-            w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
-                     (kernel_size[1] - 1) - 1) // stride[1] + 1
-            self.init_out_size = (h_out, w_out)
-        else:
-            self.init_input_size = None
-            self.init_out_size = None
-
-    def forward(self, x):
-        """
-        Args:
-            x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
-
-        Returns:
-            tuple: Contains merged results and its spatial shape.
-
-            - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
-            - out_size (tuple[int]): Spatial shape of x, arrange as
-              (out_h, out_w).
-        """
-
-        if self.adap_padding:
-            x = self.adap_padding(x)
-
-        x = self.projection(x)
-        out_size = (x.shape[2], x.shape[3])
-        x = x.flatten(2).transpose(1, 2)
-        if self.norm is not None:
-            x = self.norm(x)
-        return x, out_size
-
-
-class PatchMerging(BaseModule):
-    """Merge patch feature map.
-
-    This layer groups feature map by kernel_size, and applies norm and linear
-    layers to the grouped feature map. Our implementation uses `nn.Unfold` to
-    merge patch, which is about 25% faster than original implementation.
-    Instead, we need to modify pretrained models for compatibility.
-
-    Args:
-        in_channels (int): The num of input channels.
-        out_channels (int): The num of output channels.
-        kernel_size (int | tuple, optional): the kernel size in the unfold
-            layer. Defaults to 2.
-        stride (int | tuple, optional): the stride of the sliding blocks in the
-            unfold layer. Default: None. (Would be set as `kernel_size`)
-        padding (int | tuple | string ): The padding length of
-            embedding conv. When it is a string, it means the mode
-            of adaptive padding, support "same" and "corner" now.
-            Default: "corner".
-        dilation (int | tuple, optional): dilation parameter in the unfold
-            layer. Default: 1.
-        bias (bool, optional): Whether to add bias in linear layer or not.
-            Defaults: False.
-        norm_cfg (dict, optional): Config dict for normalization layer.
-            Default: dict(type='LN').
-        init_cfg (dict, optional): The extra config for initialization.
-            Default: None.
-    """
-
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=2,
-                 stride=None,
-                 padding='corner',
-                 dilation=1,
-                 bias=False,
-                 norm_cfg=dict(type='LN'),
-                 init_cfg=None):
-        super().__init__(init_cfg=init_cfg)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        if stride:
-            stride = stride
-        else:
-            stride = kernel_size
-
-        kernel_size = to_2tuple(kernel_size)
-        stride = to_2tuple(stride)
-        dilation = to_2tuple(dilation)
-
-        if isinstance(padding, str):
-            self.adap_padding = AdaptivePadding(
-                kernel_size=kernel_size,
-                stride=stride,
-                dilation=dilation,
-                padding=padding)
-            # disable the padding of unfold
-            padding = 0
-        else:
-            self.adap_padding = None
-
-        padding = to_2tuple(padding)
-        self.sampler = nn.Unfold(
-            kernel_size=kernel_size,
-            dilation=dilation,
-            padding=padding,
-            stride=stride)
-
-        sample_dim = kernel_size[0] * kernel_size[1] * in_channels
-
-        if norm_cfg is not None:
-            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
-        else:
-            self.norm = None
-
-        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
-
-    def forward(self, x, input_size):
-        """
-        Args:
-            x (Tensor): Has shape (B, H*W, C_in).
-            input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
-                Default: None.
-
-        Returns:
-            tuple: Contains merged results and its spatial shape.
-
-            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
-            - out_size (tuple[int]): Spatial shape of x, arrange as
-              (Merged_H, Merged_W).
-        """
-        B, L, C = x.shape
-        assert isinstance(input_size, Sequence), f'Expect ' \
-            f'input_size is ' \
-            f'`Sequence` ' \
-            f'but get {input_size}'
-
-        H, W = input_size
-        assert L == H * W, 'input feature has wrong size'
-
-        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
-        # Use nn.Unfold to merge patch. About 25% faster than original method,
-        # but need to modify pretrained model for compatibility
-
-        if self.adap_padding:
-            x = self.adap_padding(x)
-            H, W = x.shape[-2:]
-
-        x = self.sampler(x)
-        # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
-
-        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
-                 (self.sampler.kernel_size[0] - 1) -
-                 1) // self.sampler.stride[0] + 1
-        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
-                 (self.sampler.kernel_size[1] - 1) -
-                 1) // self.sampler.stride[1] + 1
-
-        output_size = (out_h, out_w)
-        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
-        x = self.norm(x) if self.norm else x
-        x = self.reduction(x)
-        return x, output_size
build/lib/segformer_plusplus/utils/imagenet_weights.py DELETED
@@ -1,8 +0,0 @@
-imagenet_weights = {
-    'b0': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
-    'b1': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b1_20220624-02e5a6a1.pth',
-    'b2': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b2_20220624-66e8bf70.pth',
-    'b3': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b3_20220624-13b1141c.pth',
-    'b4': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b4_20220624-d588d980.pth',
-    'b5': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth'
-}
build/lib/segformer_plusplus/utils/registry.py DELETED
@@ -1,6 +0,0 @@
-from mmengine import Registry
-
-MODELS = Registry(
-    'models',
-    locations=['segformer_plusplus.model.backbone', 'segformer_plusplus.model.head']
-)
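This MODELS registry is what lets the plain dicts in the config files above be turned into module instances. A minimal sketch of that round trip, mirroring how the deleted build_model.py calls it (hedged; assumes the canonical package copy is importable):

```python
from mmengine import registry
from segformer_plusplus.utils import MODELS

# 'type' selects the registered class (see @MODELS.register_module() in mit.py);
# the remaining keys become constructor arguments.
cfg = dict(type='MixVisionTransformer', embed_dims=32, num_layers=[2, 2, 2, 2],
           num_heads=[1, 2, 5, 8], patch_sizes=[7, 3, 3, 3], sr_ratios=[8, 4, 2, 1])
backbone = registry.build_model_from_cfg(cfg, registry=MODELS)
```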
build/lib/segformer_plusplus/utils/shape_convert.py DELETED
@@ -1,107 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-def nlc_to_nchw(x, hw_shape):
-    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
-
-    Args:
-        x (Tensor): The input tensor of shape [N, L, C] before conversion.
-        hw_shape (Sequence[int]): The height and width of output feature map.
-
-    Returns:
-        Tensor: The output tensor of shape [N, C, H, W] after conversion.
-    """
-    H, W = hw_shape
-    assert len(x.shape) == 3
-    B, L, C = x.shape
-    assert L == H * W, 'The seq_len doesn\'t match H, W'
-    return x.transpose(1, 2).reshape(B, C, H, W)
-
-
-def nchw_to_nlc(x):
-    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
-
-    Args:
-        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
-
-    Returns:
-        Tensor: The output tensor of shape [N, L, C] after conversion.
-    """
-    assert len(x.shape) == 4
-    return x.flatten(2).transpose(1, 2).contiguous()
-
-
-def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
-    """Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor. Use the
-    reshaped tensor as the input of `module`, and the convert the output of
-    `module`, whose shape is.
-
-    [N, L, C], to [N, C, H, W].
-
-    Args:
-        module (Callable): A callable object the takes a tensor
-            with shape [N, L, C] as input.
-        x (Tensor): The input tensor of shape [N, C, H, W].
-        contiguous:
-        contiguous (Bool): Whether to make the tensor contiguous
-            after each shape transform.
-
-    Returns:
-        Tensor: The output tensor of shape [N, C, H, W].
-
-    Example:
-        >>> import torch
-        >>> import torch.nn as nn
-        >>> norm = nn.LayerNorm(4)
-        >>> feature_map = torch.rand(4, 4, 5, 5)
-        >>> output = nchw2nlc2nchw(norm, feature_map)
-    """
-    B, C, H, W = x.shape
-    if not contiguous:
-        x = x.flatten(2).transpose(1, 2)
-        x = module(x, **kwargs)
-        x = x.transpose(1, 2).reshape(B, C, H, W)
-    else:
-        x = x.flatten(2).transpose(1, 2).contiguous()
-        x = module(x, **kwargs)
-        x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
-    return x
-
-
-def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
-    """Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor. Use the
-    reshaped tensor as the input of `module`, and convert the output of
-    `module`, whose shape is.
-
-    [N, C, H, W], to [N, L, C].
-
-    Args:
-        module (Callable): A callable object the takes a tensor
-            with shape [N, C, H, W] as input.
-        x (Tensor): The input tensor of shape [N, L, C].
-        hw_shape: (Sequence[int]): The height and width of the
-            feature map with shape [N, C, H, W].
-        contiguous (Bool): Whether to make the tensor contiguous
-            after each shape transform.
-
-    Returns:
-        Tensor: The output tensor of shape [N, L, C].
-
-    Example:
-        >>> import torch
-        >>> import torch.nn as nn
-        >>> conv = nn.Conv2d(16, 16, 3, 1, 1)
-        >>> feature_map = torch.rand(4, 25, 16)
-        >>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
-    """
-    H, W = hw_shape
-    assert len(x.shape) == 3
-    B, L, C = x.shape
-    assert L == H * W, 'The seq_len doesn\'t match H, W'
-    if not contiguous:
-        x = x.transpose(1, 2).reshape(B, C, H, W)
-        x = module(x, **kwargs)
-        x = x.flatten(2).transpose(1, 2)
-    else:
-        x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
-        x = module(x, **kwargs)
-        x = x.flatten(2).transpose(1, 2).contiguous()
-    return x
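The deleted helpers boil down to two reshapes; a self-contained round-trip sketch (assuming only torch):

import torch

x = torch.rand(2, 16, 8, 8)                             # [N, C, H, W]
tokens = x.flatten(2).transpose(1, 2)                   # nchw_to_nlc: [N, L, C], L = H * W
restored = tokens.transpose(1, 2).reshape(2, 16, 8, 8)  # nlc_to_nchw
assert torch.equal(x, restored)  # lossless round trip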
build/lib/segformer_plusplus/utils/tome_presets.py DELETED
@@ -1,20 +0,0 @@
-tome_presets = {
-    'bsm_hq': [
-        dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
-        dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
-        dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4),
-        dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4)
-    ],
-    'bsm_fast': [
-        dict(q_mode=None, kv_mode='bsm_r2D', kv_r=0.9, kv_sx=4, kv_sy=4),
-        dict(q_mode=None, kv_mode='bsm_r2D', kv_r=0.9, kv_sx=4, kv_sy=4),
-        dict(q_mode='bsm_r2D', kv_mode=None, q_r=0.9, q_sx=4, q_sy=4),
-        dict(q_mode='bsm_r2D', kv_mode=None, q_r=0.9, q_sx=4, q_sy=4)
-    ],
-    'n2d_2x2': [
-        dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-        dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-        dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-        dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2))
-    ]
-}
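Each preset is a list of four per-stage token-merging settings (one per MiT stage), merging either queries or keys/values. An illustrative walk over the 'bsm_hq' entries copied from the deleted file:

bsm_hq = [
    dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
    dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
    dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4),
    dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4),
]

for stage, cfg in enumerate(bsm_hq, start=1):
    side = 'keys/values' if cfg.get('kv_mode') else 'queries'
    print(f'stage {stage}: merges {side} with {cfg}')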
build/lib/segformer_plusplus/utils/wrappers.py DELETED
@@ -1,51 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import warnings
-
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-def resize(input,
-           size=None,
-           scale_factor=None,
-           mode='nearest',
-           align_corners=None,
-           warning=True):
-    if warning:
-        if size is not None and align_corners:
-            input_h, input_w = tuple(int(x) for x in input.shape[2:])
-            output_h, output_w = tuple(int(x) for x in size)
-            if output_h > input_h or output_w > output_h:
-                if ((output_h > 1 and output_w > 1 and input_h > 1
-                     and input_w > 1) and (output_h - 1) % (input_h - 1)
-                        and (output_w - 1) % (input_w - 1)):
-                    warnings.warn(
-                        f'When align_corners={align_corners}, '
-                        'the output would more aligned if '
-                        f'input size {(input_h, input_w)} is `x+1` and '
-                        f'out size {(output_h, output_w)} is `nx+1`')
-    return F.interpolate(input, size, scale_factor, mode, align_corners)
-
-
-class Upsample(nn.Module):
-
-    def __init__(self,
-                 size=None,
-                 scale_factor=None,
-                 mode='nearest',
-                 align_corners=None):
-        super().__init__()
-        self.size = size
-        if isinstance(scale_factor, tuple):
-            self.scale_factor = tuple(float(factor) for factor in scale_factor)
-        else:
-            self.scale_factor = float(scale_factor) if scale_factor else None
-        self.mode = mode
-        self.align_corners = align_corners
-
-    def forward(self, x):
-        if not self.size:
-            size = [int(t * self.scale_factor) for t in x.shape[-2:]]
-        else:
-            size = self.size
-        return resize(x, size, None, self.mode, self.align_corners)
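The deleted resize is a thin wrapper over F.interpolate with an align_corners warning. A behaviourally equivalent sketch for the common bilinear case (shapes are illustrative):

import torch
import torch.nn.functional as F

logits = torch.rand(1, 19, 128, 256)  # low-resolution class scores
up = F.interpolate(logits, size=(1024, 2048),
                   mode='bilinear', align_corners=False)
print(up.shape)  # torch.Size([1, 19, 1024, 2048])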
segformer_plusplus.egg-info/SOURCES.txt CHANGED
@@ -1,7 +1,10 @@
 setup.py
 segformer_plusplus/__init__.py
 segformer_plusplus/build_model.py
+segformer_plusplus/cityscape_benchmark.py
 segformer_plusplus/random_benchmark.py
+segformer_plusplus/start_cityscape_benchmark.py
+segformer_plusplus/start_random_benchmark.py
 segformer_plusplus.egg-info/PKG-INFO
 segformer_plusplus.egg-info/SOURCES.txt
 segformer_plusplus.egg-info/dependency_links.txt
@@ -15,15 +18,22 @@ segformer_plusplus/configs/segformer_mit_b3.py
 segformer_plusplus/configs/segformer_mit_b4.py
 segformer_plusplus/configs/segformer_mit_b5.py
 segformer_plusplus/model/__init__.py
+segformer_plusplus/model/base_module.py
+segformer_plusplus/model/utils.py
+segformer_plusplus/model/weight_init.py
 segformer_plusplus/model/backbone/__init__.py
 segformer_plusplus/model/backbone/mit.py
 segformer_plusplus/model/head/__init__.py
 segformer_plusplus/model/head/segformer_head.py
 segformer_plusplus/utils/__init__.py
+segformer_plusplus/utils/activation.py
 segformer_plusplus/utils/benchmark.py
+segformer_plusplus/utils/build_functions.py
 segformer_plusplus/utils/embed.py
 segformer_plusplus/utils/imagenet_weights.py
+segformer_plusplus/utils/manager.py
 segformer_plusplus/utils/registry.py
 segformer_plusplus/utils/shape_convert.py
 segformer_plusplus/utils/tome_presets.py
+segformer_plusplus/utils/version_utils.py
 segformer_plusplus/utils/wrappers.py
CHANGED
@@ -1,2 +1,5 @@
|
|
|
|
|
|
|
|
1 |
tomesd
|
2 |
torch>=2.0.1
|
|
|
1 |
+
numpy
|
2 |
+
omegaconf
|
3 |
+
pyyaml
|
4 |
tomesd
|
5 |
torch>=2.0.1
|
segformer_plusplus/cityscape/berlin_000543_000019_leftImg8bit.png ADDED
(binary image stored via Git LFS; preview omitted)
segformer_plusplus/cityscape_benchmark.py CHANGED
@@ -14,6 +14,8 @@ print(f"Using device: {device}")
 if device.type == 'cuda':
     print(f"CUDA Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 
+torch.manual_seed(42)
+torch.cuda.manual_seed_all(42)
 
 def cityscape_benchmark(
     model: torch.nn.Module,
@@ -96,17 +98,7 @@ def cityscape_benchmark(
 
     if save_output:
         with torch.no_grad():
-
-            f.write("=== Model Input Info ===\n")
-            f.write(f"Input tensor:\n{img_tensor}\n")
-            f.write(f"Input shape: {img_tensor.shape}\n")
-            f.write(f"Input stats: mean = {img_tensor.mean().item()}, std = {img_tensor.std().item()}\n\n")
-
-            output = model(img_tensor)
-
-            f.write("=== Raw Model Output ===\n")
-            f.write(f"{output}\n\n")
-
+            output = model(img_tensor)
             pred = torch.argmax(output, dim=1).squeeze(0).cpu().numpy()
 
             # Save prediction as text
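The added seeding makes the no-checkpoint run deterministic, which is what allows its predictions to be compared against the stored reference file. A quick sketch of the effect (illustrative, not from the commit):

import torch

torch.manual_seed(42)
a = torch.rand(3)
torch.manual_seed(42)
b = torch.rand(3)
assert torch.equal(a, b)  # re-seeding reproduces the same random draws/weights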
cityscapes_prediction_output_reference.txt → segformer_plusplus/cityscapes_prediction_output.txt RENAMED
File without changes
segformer_plusplus/cityscapes_prediction_output_reference_b05_nocheckpoint.txt ADDED
The diff for this file is too large to render; see the raw file.
segformer_plusplus/config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "model_type": "segformerplusplus",
+  "architectures": ["SegFormerPlusPlus"],
+  "backbone": "b5",
+  "supported_backbones": ["b0", "b1", "b2", "b3", "b4", "b5"],
+  "head": "bsm_hq",
+  "supported_heads": ["bsm_hq", "bsm_fast", "n2d_2x2"],
+  "out_channels": 19,
+  "num_labels": 19
+}
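A hypothetical consumer of this file (not part of the commit), validating the chosen backbone and head against the supported lists:

import json

with open('segformer_plusplus/config.json') as f:
    cfg = json.load(f)

assert cfg['backbone'] in cfg['supported_backbones']
assert cfg['head'] in cfg['supported_heads']
print(cfg['backbone'], cfg['head'], cfg['num_labels'])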
segformer_plusplus/configs/config/utils.py CHANGED
@@ -10,11 +10,17 @@ from importlib import import_module as real_import_module
 import json
 import pickle
 from pathlib import Path
-
+import itertools
 
 import yaml
 from omegaconf import OmegaConf
 
+from pkg_resources.extern import packaging
+__import__('pkg_resources.extern.packaging.version')
+__import__('pkg_resources.extern.packaging.specifiers')
+__import__('pkg_resources.extern.packaging.requirements')
+__import__('pkg_resources.extern.packaging.markers')
+
 
 PYTHON_ROOT_DIR = osp.dirname(osp.dirname(sys.executable))
 SYSTEM_PYTHON_PREFIX = '/usr/lib/python'
@@ -644,4 +650,125 @@ def dump(obj, file=None, file_format=None, **kwargs):
 
 def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
     if not osp.isfile(filename):
-        raise FileNotFoundError(msg_tmpl.format(filename))
+        raise FileNotFoundError(msg_tmpl.format(filename))
+
+
+def package2module(package: str):
+    """Infer module name from package.
+
+    Args:
+        package (str): Package to infer module name.
+    """
+    pkg = get_distribution(package)
+    if pkg.has_metadata('top_level.txt'):
+        module_name = pkg.get_metadata('top_level.txt').split('\n')[0]
+        return module_name
+    else:
+        raise ValueError(
+            highlighted_error(f'can not infer the module name of {package}'))
+
+
+def get_distribution(dist):
+    """Return a current distribution object for a Requirement or string"""
+    if isinstance(dist, str):
+        dist = Requirement.parse(dist)
+    return dist
+
+
+def highlighted_error(msg: Union[str, Exception]) -> str:
+    return click.style(msg, fg='red', bold=True)  # type: ignore
+
+
+class Requirement(packaging.requirements.Requirement):
+    def __init__(self, requirement_string):
+        """DO NOT CALL THIS UNDOCUMENTED METHOD; use Requirement.parse()!"""
+        super(Requirement, self).__init__(requirement_string)
+        self.unsafe_name = self.name
+        project_name = safe_name(self.name)
+        self.project_name, self.key = project_name, project_name.lower()
+        self.specs = [
+            (spec.operator, spec.version) for spec in self.specifier]
+        self.extras = tuple(map(safe_extra, self.extras))
+        self.hashCmp = (
+            self.key,
+            self.url,
+            self.specifier,
+            frozenset(self.extras),
+            str(self.marker) if self.marker else None,
+        )
+        self.__hash = hash(self.hashCmp)
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, Requirement) and
+            self.hashCmp == other.hashCmp
+        )
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __contains__(self, item):
+        if item.key != self.key:
+            return False
+
+        item = item.version
+
+        # Allow prereleases always in order to match the previous behavior of
+        # this method. In the future this should be smarter and follow PEP 440
+        # more accurately.
+        return self.specifier.contains(item, prereleases=True)
+
+    def __hash__(self):
+        return self.__hash
+
+    def __repr__(self):
+        return "Requirement.parse(%r)" % str(self)
+
+    @staticmethod
+    def parse(s):
+        req, = parse_requirements(s)
+        return req
+
+
+def parse_requirements(strs):
+    """Yield ``Requirement`` objects for each specification in `strs`
+
+    `strs` must be a string, or a (possibly-nested) iterable thereof.
+    """
+    # create a steppable iterator, so we can handle \-continuations
+    lines = iter(yield_lines(strs))
+
+    for line in lines:
+        # Drop comments -- a hash without a space may be in a URL.
+        if ' #' in line:
+            line = line[:line.find(' #')]
+        # If there is a line continuation, drop it, and append the next line.
+        if line.endswith('\\'):
+            line = line[:-2].strip()
+            try:
+                line += next(lines)
+            except StopIteration:
+                return
+        yield Requirement(line)
+
+
+def yield_lines(iterable):
+    """Yield valid lines of a string or iterable"""
+    return itertools.chain.from_iterable(map(yield_lines, iterable))
+
+
+def safe_extra(extra):
+    """Convert an arbitrary string to a standard 'extra' name
+
+    Any runs of non-alphanumeric characters are replaced with a single '_',
+    and the result is always lowercased.
+    """
+    return re.sub('[^A-Za-z0-9.-]+', '_', extra).lower()
+
+
+def safe_name(name):
+    """Convert an arbitrary string to a standard distribution name
+
+    Any runs of non-alphanumeric/. characters are replaced with a single '-'.
+    """
+    return re.sub('[^A-Za-z0-9.]+', '-', name)
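The vendored Requirement class above ultimately defers version checks to packaging's SpecifierSet; an illustrative check using the standalone packaging distribution (not this module):

from packaging.specifiers import SpecifierSet

spec = SpecifierSet('>=2.0.1')   # same constraint as the torch pin in setup.py
print(spec.contains('2.1.0'))    # True
print(spec.contains('1.13.1'))   # False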
segformer_plusplus/modeling_segformer_plusplus.py ADDED
@@ -0,0 +1,69 @@
+# modeling_segformer_plusplus.py
+
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SemanticSegmenterOutput
+
+# If you want to import SegFormer directly, make sure
+# this class is available in the same repo.
+from segformer_plusplus.model import create_model
+
+
+class SegformerPlusPlusConfig(PretrainedConfig):
+    model_type = "segformer_plusplus"
+
+    def __init__(
+        self,
+        backbone: str = "b5",
+        tome_strategy: Optional[str] = "bsm_hq",
+        num_labels: int = 19,
+        id2label: Optional[dict] = None,
+        label2id: Optional[dict] = None,
+        **kwargs,
+    ):
+        self.backbone = backbone
+        self.tome_strategy = tome_strategy
+        self.num_labels = num_labels
+
+        if id2label is None:
+            id2label = {i: f"class_{i}" for i in range(num_labels)}
+        if label2id is None:
+            label2id = {v: k for k, v in id2label.items()}
+
+        self.id2label = id2label
+        self.label2id = label2id
+
+        super().__init__(**kwargs)
+
+
+class SegformerPlusPlusForSemanticSegmentation(PreTrainedModel):
+    config_class = SegformerPlusPlusConfig
+
+    def __init__(self, config: SegformerPlusPlusConfig):
+        super().__init__(config)
+        self.segformer = create_model(
+            backbone=config.backbone,
+            tome_strategy=config.tome_strategy,
+            out_channels=config.num_labels,
+            pretrained=False,  # no pretrained weights here; loaded via .from_pretrained
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> SemanticSegmenterOutput:
+
+        logits = self.segformer(pixel_values)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss(ignore_index=255)
+            loss = loss_fct(logits, labels.long())
+
+        return SemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+        )
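A hedged usage sketch for the new wrapper (not part of the commit; assumes transformers is installed and that create_model returns a module emitting [N, num_labels, H', W'] logits):

import torch
from segformer_plusplus.modeling_segformer_plusplus import (
    SegformerPlusPlusConfig, SegformerPlusPlusForSemanticSegmentation)

config = SegformerPlusPlusConfig(backbone='b0', tome_strategy='bsm_fast', num_labels=19)
model = SegformerPlusPlusForSemanticSegmentation(config).eval()

with torch.no_grad():
    out = model(pixel_values=torch.rand(1, 3, 512, 1024))
print(out.logits.shape)  # expected [1, 19, H', W'] for a SegFormer-style head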
segformer_plusplus/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e064fa2fb7d618208c2542e76c543b7cb552a3d8997a0c6c4cc0a14da86ba58
+size 328287002
segformer_plusplus/start_cityscape_benchmark.py CHANGED
@@ -7,7 +7,7 @@ from .build_model import create_model
 from .cityscape_benchmark import cityscape_benchmark
 
 parser = argparse.ArgumentParser(description="Segformer Benchmarking Script")
-parser.add_argument('--backbone', type=str, default='
+parser.add_argument('--backbone', type=str, default='b5', choices=['b0', 'b1', 'b2', 'b3', 'b4', 'b5'], help='Model backbone version')
 parser.add_argument('--head', type=str, default='bsm_hq', choices=['bsm_hq', 'bsm_fast', 'n2d_2x2'], help='Model head type')
 parser.add_argument('--checkpoint', type=str, default=None, help='Path to .pth checkpoint file (optional)')
 args = parser.parse_args()
@@ -22,13 +22,15 @@ if args.checkpoint:
 else:
     print("No checkpoint provided – using model as initialized.")
 
-
+cwd = os.getcwd()
+
+image_path = os.path.join(cwd, 'cityscape', 'berlin_000543_000019_leftImg8bit.png')
 result = cityscape_benchmark(model, image_path)
 
 print("Cityscapes Benchmark Results:", result)
 
-reference_txt_path = os.path.
-generated_txt_path = os.path.
+reference_txt_path = os.path.join(cwd, 'cityscapes_prediction_output_reference_b05_nocheckpoint.txt')
+generated_txt_path = os.path.join(cwd, 'cityscapes_prediction_output.txt')
 
 if os.path.exists(reference_txt_path) and os.path.exists(generated_txt_path):
     ref_arr = np.loadtxt(reference_txt_path, dtype=int)
setup.py CHANGED
@@ -5,7 +5,18 @@ setup(
     version="0.2",
     author="Marco Kantonis",
     description="Segformer++: Efficient Token-Merging Strategies for High-Resolution Semantic Segmentation",
-    install_requires=[
+    install_requires=[
+        'torch>=2.0.1',
+        'tomesd',
+        'omegaconf',
+        'pyyaml',
+        'numpy',
+        'rich',
+        'yapf',
+        'addict',
+        'tqdm',
+        'packaging'
+    ],
     packages=find_packages(),
     license='MIT',
     long_description="https://arxiv.org/abs/2405.14467"