Tim77777767 committed
Commit 85ebba9 · 1 Parent(s): e98bd8c

Re-added .bin with Git LFS, fixed tracking

Files changed (37)
  1. .gitattributes +3 -0
  2. .gitignore +2 -0
  3. build/lib/segformer_plusplus/__init__.py +0 -4
  4. build/lib/segformer_plusplus/build_model.py +0 -108
  5. build/lib/segformer_plusplus/configs/__init__.py +0 -1
  6. build/lib/segformer_plusplus/configs/segformer_mit_b0.py +0 -28
  7. build/lib/segformer_plusplus/configs/segformer_mit_b1.py +0 -8
  8. build/lib/segformer_plusplus/configs/segformer_mit_b2.py +0 -6
  9. build/lib/segformer_plusplus/configs/segformer_mit_b3.py +0 -6
  10. build/lib/segformer_plusplus/configs/segformer_mit_b4.py +0 -6
  11. build/lib/segformer_plusplus/configs/segformer_mit_b5.py +0 -6
  12. build/lib/segformer_plusplus/model/__init__.py +0 -1
  13. build/lib/segformer_plusplus/model/backbone/__init__.py +0 -3
  14. build/lib/segformer_plusplus/model/backbone/mit.py +0 -479
  15. build/lib/segformer_plusplus/model/head/__init__.py +0 -3
  16. build/lib/segformer_plusplus/model/head/segformer_head.py +0 -95
  17. build/lib/segformer_plusplus/random_benchmark.py +0 -61
  18. build/lib/segformer_plusplus/utils/__init__.py +0 -12
  19. build/lib/segformer_plusplus/utils/benchmark.py +0 -76
  20. build/lib/segformer_plusplus/utils/embed.py +0 -330
  21. build/lib/segformer_plusplus/utils/imagenet_weights.py +0 -8
  22. build/lib/segformer_plusplus/utils/registry.py +0 -6
  23. build/lib/segformer_plusplus/utils/shape_convert.py +0 -107
  24. build/lib/segformer_plusplus/utils/tome_presets.py +0 -20
  25. build/lib/segformer_plusplus/utils/wrappers.py +0 -51
  26. segformer_plusplus.egg-info/SOURCES.txt +10 -0
  27. segformer_plusplus.egg-info/requires.txt +3 -0
  28. segformer_plusplus/cityscape/berlin_000543_000019_leftImg8bit.png +3 -0
  29. segformer_plusplus/cityscape_benchmark.py +3 -11
  30. cityscapes_prediction_output_reference.txt → segformer_plusplus/cityscapes_prediction_output.txt +0 -0
  31. segformer_plusplus/cityscapes_prediction_output_reference_b05_nocheckpoint.txt +0 -0
  32. segformer_plusplus/config.json +10 -0
  33. segformer_plusplus/configs/config/utils.py +129 -2
  34. segformer_plusplus/modeling_segformer_plusplus.py +69 -0
  35. segformer_plusplus/pytorch_model.bin +3 -0
  36. segformer_plusplus/start_cityscape_benchmark.py +6 -4
  37. setup.py +12 -1
.gitattributes ADDED
@@ -0,0 +1,3 @@
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -10,3 +10,5 @@ __pycache__/
 .vscode/
 .idea/
 .DS_Store
+ build/
+ venv/

build/lib/segformer_plusplus/__init__.py DELETED
@@ -1,4 +0,0 @@
- from .build_model import create_model, create_custom_model
- from .random_benchmark import random_benchmark
-
- __all__ = ['create_model', 'create_custom_model', 'random_benchmark']

build/lib/segformer_plusplus/build_model.py DELETED
@@ -1,108 +0,0 @@
- import os
-
- from mmengine import registry
- from mmengine.config import Config
- from mmengine.model import BaseModule
-
- from .utils import MODELS, imagenet_weights
- from .utils import tome_presets
-
-
- class SegFormer(BaseModule):
-     """
-     This class represents a SegFormer model that allows for the application of token merging.
-
-     Attributes:
-         backbone (BaseModule): MixVisionTransformer backbone
-         decode_head (BaseModule): SegFormer head
-
-     """
-     def __init__(self, cfg):
-         """
-         Initialize the SegFormer model.
-
-         Args:
-             cfg (Config): an mmengine Config object, which defines the backbone, head and token merging strategy used.
-
-         """
-         super().__init__()
-         self.backbone = registry.build_model_from_cfg(cfg.backbone, registry=MODELS)
-         self.decode_head = registry.build_model_from_cfg(cfg.decode_head, registry=MODELS)
-
-     def forward(self, x):
-         """
-         Forward pass of the model.
-
-         Args:
-             x (torch.Tensor): input tensor of shape [B, C, H, W]
-
-         Returns:
-             torch.Tensor: output tensor
-
-         """
-         x = self.backbone(x)
-         x = self.decode_head(x)
-         return x
-
-
- def create_model(
-         backbone: str = 'b0',
-         tome_strategy: str = None,
-         out_channels: int = 19,
-         pretrained: bool = False,
- ):
-     """
-     Create a SegFormer model using the predefined SegFormer backbones from the MiT series (b0-b5).
-
-     Args:
-         backbone (str): backbone name (e.g. 'b0')
-         tome_strategy (str | list(dict)): select a preset strategy ('bsm_hq', 'bsm_fast', 'n2d_2x2') or define a
-             custom strategy as a list of dictionaries, one per stage, each specifying that stage's merging strategy
-         out_channels (int): number of output channels (e.g. 19 for the cityscapes semantic segmentation task)
-         pretrained (bool): use pretrained (imagenet) weights
-
-     Returns:
-         BaseModule: SegFormer model
-
-     """
-     backbone = backbone.lower()
-     assert backbone in [f'b{i}' for i in range(6)]
-
-     wd = os.path.dirname(os.path.abspath(__file__))
-
-     cfg = Config.fromfile(os.path.join(wd, 'configs', f'segformer_mit_{backbone}.py'))
-
-     cfg.decode_head.out_channels = out_channels
-
-     if tome_strategy is not None:
-         if tome_strategy not in list(tome_presets.keys()):
-             print("Using custom merging strategy.")
-             cfg.backbone.tome_cfg = tome_strategy
-         else:
-             cfg.backbone.tome_cfg = tome_presets[tome_strategy]
-
-     # load imagenet weights
-     if pretrained:
-         cfg.backbone.init_cfg = dict(type='Pretrained', checkpoint=imagenet_weights[backbone])
-
-     return SegFormer(cfg)
-
-
- def create_custom_model(
-         model_cfg: Config,
-         tome_strategy: list[dict] = None,
- ):
-     """
-     Create a SegFormer model with customizable backbone and head.
-
-     Args:
-         model_cfg (Config): an mmengine Config object defining the custom backbone and head
-         tome_strategy (list(dict)): custom token merging strategy
-
-     Returns:
-         BaseModule: SegFormer model
-
-     """
-     if tome_strategy is not None:
-         model_cfg.backbone.tome_cfg = tome_strategy
-
-     return SegFormer(model_cfg)

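Only the stale `build/` copy is deleted here; the same builders remain in `segformer_plusplus/build_model.py` (see the SOURCES.txt diff below). A minimal usage sketch, assuming mmengine/mmcv are installed; the input resolution is an arbitrary choice:

import torch
from segformer_plusplus import create_model

# MiT-b0 backbone, 'bsm_hq' token-merging preset, 19 Cityscapes classes
model = create_model(backbone='b0', tome_strategy='bsm_hq', out_channels=19)
logits = model(torch.rand(1, 3, 1024, 1024))  # one channel per class, at 1/4 input resolution
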
build/lib/segformer_plusplus/configs/__init__.py DELETED
@@ -1 +0,0 @@
- __all__ = []

build/lib/segformer_plusplus/configs/segformer_mit_b0.py DELETED
@@ -1,28 +0,0 @@
- norm_cfg = dict(type='SyncBN', requires_grad=True)
- backbone = dict(
-     type='MixVisionTransformer',
-     in_channels=3,
-     embed_dims=32,
-     num_stages=4,
-     num_layers=[2, 2, 2, 2],
-     num_heads=[1, 2, 5, 8],
-     patch_sizes=[7, 3, 3, 3],
-     sr_ratios=[8, 4, 2, 1],
-     out_indices=(0, 1, 2, 3),
-     mlp_ratio=4,
-     qkv_bias=True,
-     drop_rate=0.0,
-     attn_drop_rate=0.0,
-     drop_path_rate=0.1
- )
- decode_head = dict(
-     type='SegformerHead',
-     in_channels=[32, 64, 160, 256],
-     in_index=[0, 1, 2, 3],
-     channels=256,
-     dropout_ratio=0.1,
-     out_channels=19,
-     norm_cfg=norm_cfg,
-     align_corners=False,
-     interpolate_mode='bilinear'
- )

build/lib/segformer_plusplus/configs/segformer_mit_b1.py DELETED
@@ -1,8 +0,0 @@
- _base_ = ['./segformer_mit_b0.py']
-
- backbone = dict(
-     embed_dims=64,
- )
- decode_head = dict(
-     in_channels=[64, 128, 320, 512]
- )

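The `_base_` entry is mmengine's config inheritance: loading b1 first loads b0 and then overrides matching keys, and b2-b5 in turn inherit from b1. A quick sketch of how resolution behaves (assuming mmengine is installed and the path is relative to the package):

from mmengine.config import Config

cfg = Config.fromfile('segformer_plusplus/configs/segformer_mit_b1.py')
print(cfg.backbone.embed_dims)  # 64, overridden here
print(cfg.backbone.num_layers)  # [2, 2, 2, 2], inherited from segformer_mit_b0.py
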
build/lib/segformer_plusplus/configs/segformer_mit_b2.py DELETED
@@ -1,6 +0,0 @@
- _base_ = ['./segformer_mit_b1.py']
-
- backbone = dict(
-     embed_dims=64,
-     num_layers=[3, 4, 6, 3]
- )

build/lib/segformer_plusplus/configs/segformer_mit_b3.py DELETED
@@ -1,6 +0,0 @@
- _base_ = ['./segformer_mit_b1.py']
-
- backbone = dict(
-     embed_dims=64,
-     num_layers=[3, 4, 18, 3]
- )

build/lib/segformer_plusplus/configs/segformer_mit_b4.py DELETED
@@ -1,6 +0,0 @@
- _base_ = ['./segformer_mit_b1.py']
-
- backbone = dict(
-     embed_dims=64,
-     num_layers=[3, 8, 27, 3]
- )

build/lib/segformer_plusplus/configs/segformer_mit_b5.py DELETED
@@ -1,6 +0,0 @@
- _base_ = ['./segformer_mit_b1.py']
-
- backbone = dict(
-     embed_dims=64,
-     num_layers=[3, 6, 40, 3]
- )

build/lib/segformer_plusplus/model/__init__.py DELETED
@@ -1 +0,0 @@
- __all__ = []

build/lib/segformer_plusplus/model/backbone/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .mit import MixVisionTransformer
-
- __all__ = ['MixVisionTransformer']

build/lib/segformer_plusplus/model/backbone/mit.py DELETED
@@ -1,479 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- import math
-
- import torch
- import torch.nn as nn
- import torch.utils.checkpoint as cp
- from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer
- from mmcv.cnn.bricks.drop import build_dropout
- from mmcv.cnn.bricks.transformer import MultiheadAttention
- from mmengine.model import BaseModule, ModuleList, Sequential
- from mmengine.model.weight_init import (constant_init, normal_init,
-                                         trunc_normal_init)
- from tomesd.merge import bipartite_soft_matching_random2d
-
- from ...utils import PatchEmbed
- from ...utils import nchw_to_nlc, nlc_to_nchw
- from ...utils import MODELS
-
-
- class MixFFN(BaseModule):
-     """An implementation of MixFFN of Segformer.
-
-     The differences between MixFFN & FFN:
-         1. Use 1X1 Conv to replace Linear layer.
-         2. Introduce 3X3 Conv to encode positional information.
-     Args:
-         embed_dims (int): The feature dimension. Same as
-             `MultiheadAttention`. Defaults: 256.
-         feedforward_channels (int): The hidden dimension of FFNs.
-             Defaults: 1024.
-         act_cfg (dict, optional): The activation config for FFNs.
-             Default: dict(type='GELU')
-         ffn_drop (float, optional): Probability of an element to be
-             zeroed in FFN. Default 0.0.
-         dropout_layer (obj:`ConfigDict`): The dropout_layer used
-             when adding the shortcut.
-         init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
-             Default: None.
-     """
-
-     def __init__(self,
-                  embed_dims,
-                  feedforward_channels,
-                  act_cfg=dict(type='GELU'),
-                  ffn_drop=0.,
-                  dropout_layer=None,
-                  init_cfg=None):
-         super().__init__(init_cfg)
-
-         self.embed_dims = embed_dims
-         self.feedforward_channels = feedforward_channels
-         self.act_cfg = act_cfg
-         self.activate = build_activation_layer(act_cfg)
-
-         in_channels = embed_dims
-         fc1 = Conv2d(
-             in_channels=in_channels,
-             out_channels=feedforward_channels,
-             kernel_size=1,
-             stride=1,
-             bias=True)
-         # 3x3 depth wise conv to provide positional encode information
-         pe_conv = Conv2d(
-             in_channels=feedforward_channels,
-             out_channels=feedforward_channels,
-             kernel_size=3,
-             stride=1,
-             padding=(3 - 1) // 2,
-             bias=True,
-             groups=feedforward_channels)
-         fc2 = Conv2d(
-             in_channels=feedforward_channels,
-             out_channels=in_channels,
-             kernel_size=1,
-             stride=1,
-             bias=True)
-         drop = nn.Dropout(ffn_drop)
-         layers = [fc1, pe_conv, self.activate, drop, fc2, drop]
-         self.layers = Sequential(*layers)
-         self.dropout_layer = build_dropout(
-             dropout_layer) if dropout_layer else torch.nn.Identity()
-
-     def forward(self, x, hw_shape, identity=None):
-         out = nlc_to_nchw(x, hw_shape)
-         out = self.layers(out)
-         out = nchw_to_nlc(out)
-         if identity is None:
-             identity = x
-         return identity + self.dropout_layer(out)
-
-
- class EfficientMultiheadAttention(MultiheadAttention):
-     """An implementation of Efficient Multi-head Attention of Segformer.
-
-     This module is modified from MultiheadAttention which is a module from
-     mmcv.cnn.bricks.transformer.
-     Args:
-         embed_dims (int): The embedding dimension.
-         num_heads (int): Parallel attention heads.
-         attn_drop (float): A Dropout layer on attn_output_weights.
-             Default: 0.0.
-         proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
-             Default: 0.0.
-         dropout_layer (obj:`ConfigDict`): The dropout_layer used
-             when adding the shortcut. Default: None.
-         init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
-             Default: None.
-         batch_first (bool): Key, Query and Value are shape of
-             (batch, n, embed_dim) or (n, batch, embed_dim). Default: True.
-         qkv_bias (bool): enable bias for qkv if True. Default: False.
-         norm_cfg (dict): Config dict for normalization layer.
-             Default: dict(type='LN').
-         sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
-             Attention of Segformer. Default: 1.
-     """
-
-     def __init__(self,
-                  embed_dims,
-                  num_heads,
-                  attn_drop=0.,
-                  proj_drop=0.,
-                  dropout_layer=None,
-                  init_cfg=None,
-                  batch_first=True,
-                  qkv_bias=False,
-                  tome_cfg=dict(),
-                  norm_cfg=dict(type='LN'),
-                  sr_ratio=1):
-         super().__init__(
-             embed_dims,
-             num_heads,
-             attn_drop,
-             proj_drop,
-             dropout_layer=dropout_layer,
-             init_cfg=init_cfg,
-             batch_first=batch_first,
-             bias=qkv_bias)
-
-         self.q_mode = tome_cfg.get('q_mode')
-         self.kv_mode = tome_cfg.get('kv_mode')
-         self.tome_cfg = tome_cfg
-
-         self.sr_ratio = sr_ratio
-         if sr_ratio > 1:
-             self.sr = Conv2d(
-                 in_channels=embed_dims,
-                 out_channels=embed_dims,
-                 kernel_size=sr_ratio,
-                 stride=sr_ratio)
-             # The ret[0] of build_norm_layer is norm name.
-             self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
-
-     def forward(self, x, hw_shape, identity=None):
-         x_q = x
-
-         if self.sr_ratio > 1:
-             x_kv = nlc_to_nchw(x, hw_shape)
-             x_kv = self.sr(x_kv)
-             x_kv = nchw_to_nlc(x_kv)
-             x_kv = self.norm(x_kv)
-         else:
-             x_kv = x
-
-         # 2D Neighbour Merging KV
-         if self.kv_mode == 'n2d':
-             kv_hw_shape = (int(hw_shape[0] / self.sr_ratio), int(hw_shape[1] / self.sr_ratio))
-             x_kv = nlc_to_nchw(x_kv, kv_hw_shape)
-             x_kv = torch.nn.functional.avg_pool2d(x_kv, kernel_size=self.tome_cfg['kv_s'],
-                                                   stride=self.tome_cfg['kv_s'],
-                                                   ceil_mode=True)
-             x_kv = nchw_to_nlc(x_kv)
-
-         # Bipartite Soft Matching (tomesd) KV
-         if self.kv_mode == 'bsm':
-             w_kv = int(hw_shape[1] / self.sr_ratio)
-             h_kv = int(hw_shape[0] / self.sr_ratio)
-             merge, unmerge = bipartite_soft_matching_random2d(metric=x_kv, w=w_kv, h=h_kv,
-                                                               r=int(x_kv.size()[1] * self.tome_cfg['kv_r']),
-                                                               sx=self.tome_cfg['kv_sx'], sy=self.tome_cfg['kv_sy'],
-                                                               no_rand=True)
-             x_kv = merge(x_kv)
-
-         if identity is None:
-             identity = x_q
-
-         # 1D Neighbor Merging Q
-         if self.q_mode == 'n1d':
-             x_q = x_q.transpose(-2, -1)
-             x_q = torch.nn.functional.avg_pool1d(x_q, kernel_size=self.tome_cfg['q_s'],
-                                                  stride=self.tome_cfg['q_s'],
-                                                  ceil_mode=True)
-             x_q = x_q.transpose(-2, -1)
-
-         # 2D Neighbor Merging Q
-         if self.q_mode == 'n2d':
-             reduced_hw = (int(torch.ceil(torch.tensor(hw_shape[0] / self.tome_cfg['q_s'][0]))),
-                           int(torch.ceil(torch.tensor(hw_shape[1] / self.tome_cfg['q_s'][1]))))
-             x_q = nlc_to_nchw(x_q, hw_shape)
-             x_q = torch.nn.functional.avg_pool2d(x_q, kernel_size=self.tome_cfg['q_s'],
-                                                  stride=self.tome_cfg['q_s'],
-                                                  ceil_mode=True)
-             x_q = nchw_to_nlc(x_q)
-
-         # Bipartite Soft Matching (tomesd) Q
-         if self.q_mode == 'bsm':
-             merge, unmerge = bipartite_soft_matching_random2d(metric=x_q, w=hw_shape[1], h=hw_shape[0],
-                                                               r=int(x_q.size()[1] * self.tome_cfg['q_r']),
-                                                               sx=self.tome_cfg['q_sx'], sy=self.tome_cfg['q_sy'],
-                                                               no_rand=True)
-             x_q = merge(x_q)
-
-         # Because the dataflow('key', 'query', 'value') of
-         # ``torch.nn.MultiheadAttention`` is (num_query, batch,
-         # embed_dims), we should adjust the shape of dataflow from
-         # batch_first (batch, num_query, embed_dims) to num_query_first
-         # (num_query, batch, embed_dims), and recover ``attn_output``
-         # from num_query_first to batch_first.
-
-         if self.batch_first:
-             x_q = x_q.transpose(0, 1)
-             x_kv = x_kv.transpose(0, 1)
-         out = self.attn(query=x_q, key=x_kv, value=x_kv)[0]
-         if self.batch_first:
-             out = out.transpose(0, 1)
-
-         # Unmerging BSM (tome+tomesd)
-         if self.q_mode == 'bsm':
-             out = unmerge(out)
-
-         # Unmerging 1D Neighbour Merging
-         if self.q_mode == 'n1d':
-             out = out.transpose(-2, -1)
-             out = torch.nn.functional.interpolate(out, size=identity.size()[-2])
-             out = out.transpose(-2, -1)
-
-         # Unmerging 2D Neighbor Merging
-         if self.q_mode == 'n2d':
-             out = nlc_to_nchw(out, reduced_hw)
-             out = torch.nn.functional.interpolate(out, size=hw_shape)
-             out = nchw_to_nlc(out)
-
-         return identity + self.dropout_layer(self.proj_drop(out))
-
-
- class TransformerEncoderLayer(BaseModule):
-     """Implements one encoder layer in Segformer.
-
-     Args:
-         embed_dims (int): The feature dimension.
-         num_heads (int): Parallel attention heads.
-         feedforward_channels (int): The hidden dimension for FFNs.
-         drop_rate (float): Probability of an element to be zeroed
-             after the feed forward layer. Default 0.0.
-         attn_drop_rate (float): The drop out rate for attention layer.
-             Default 0.0.
-         drop_path_rate (float): stochastic depth rate. Default 0.0.
-         qkv_bias (bool): enable bias for qkv if True.
-             Default: True.
-         act_cfg (dict): The activation config for FFNs.
-             Default: dict(type='GELU').
-         norm_cfg (dict): Config dict for normalization layer.
-             Default: dict(type='LN').
-         batch_first (bool): Key, Query and Value are shape of
-             (batch, n, embed_dim) or (n, batch, embed_dim). Default: True.
-         init_cfg (dict, optional): Initialization config dict.
-             Default: None.
-         sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head
-             Attention of Segformer. Default: 1.
-         with_cp (bool): Use checkpoint or not. Using checkpoint will save
-             some memory while slowing down the training speed. Default: False.
-     """
-
-     def __init__(self,
-                  embed_dims,
-                  num_heads,
-                  feedforward_channels,
-                  drop_rate=0.,
-                  attn_drop_rate=0.,
-                  drop_path_rate=0.,
-                  qkv_bias=True,
-                  tome_cfg=dict(),
-                  act_cfg=dict(type='GELU'),
-                  norm_cfg=dict(type='LN'),
-                  batch_first=True,
-                  sr_ratio=1,
-                  with_cp=False):
-         super().__init__()
-
-         # The ret[0] of build_norm_layer is norm name.
-         self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
-
-         self.attn = EfficientMultiheadAttention(
-             embed_dims=embed_dims,
-             num_heads=num_heads,
-             attn_drop=attn_drop_rate,
-             proj_drop=drop_rate,
-             dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
-             batch_first=batch_first,
-             qkv_bias=qkv_bias,
-             tome_cfg=tome_cfg,
-             norm_cfg=norm_cfg,
-             sr_ratio=sr_ratio)
-
-         # The ret[0] of build_norm_layer is norm name.
-         self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
-
-         self.ffn = MixFFN(
-             embed_dims=embed_dims,
-             feedforward_channels=feedforward_channels,
-             ffn_drop=drop_rate,
-             dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
-             act_cfg=act_cfg)
-
-         self.with_cp = with_cp
-
-     def forward(self, x, hw_shape):
-
-         def _inner_forward(x):
-             x = self.attn(self.norm1(x), hw_shape, identity=x)
-             x = self.ffn(self.norm2(x), hw_shape, identity=x)
-             return x
-
-         if self.with_cp and x.requires_grad:
-             x = cp.checkpoint(_inner_forward, x)
-         else:
-             x = _inner_forward(x)
-         return x
-
-
- @MODELS.register_module()
- class MixVisionTransformer(BaseModule):
-     """The backbone of Segformer.
-
-     This backbone is the implementation of `SegFormer: Simple and
-     Efficient Design for Semantic Segmentation with
-     Transformers <https://arxiv.org/abs/2105.15203>`_.
-     Args:
-         in_channels (int): Number of input channels. Default: 3.
-         embed_dims (int): Embedding dimension. Default: 768.
-         num_stages (int): The num of stages. Default: 4.
-         num_layers (Sequence[int]): The layer number of each transformer encode
-             layer. Default: [3, 4, 6, 3].
-         num_heads (Sequence[int]): The attention heads of each transformer
-             encode layer. Default: [1, 2, 4, 8].
-         patch_sizes (Sequence[int]): The patch_size of each overlapped patch
-             embedding. Default: [7, 3, 3, 3].
-         strides (Sequence[int]): The stride of each overlapped patch embedding.
-             Default: [4, 2, 2, 2].
-         sr_ratios (Sequence[int]): The spatial reduction rate of each
-             transformer encode layer. Default: [8, 4, 2, 1].
-         out_indices (Sequence[int] | int): Output from which stages.
-             Default: (0, 1, 2, 3).
-         mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
-             Default: 4.
-         qkv_bias (bool): Enable bias for qkv if True. Default: True.
-         drop_rate (float): Probability of an element to be zeroed.
-             Default 0.0
-         attn_drop_rate (float): The drop out rate for attention layer.
-             Default 0.0
-         drop_path_rate (float): stochastic depth rate. Default 0.0
-         norm_cfg (dict): Config dict for normalization layer.
-             Default: dict(type='LN')
-         act_cfg (dict): The activation config for FFNs.
-             Default: dict(type='GELU').
-         pretrained (str, optional): model pretrained path. Default: None.
-         init_cfg (dict or list[dict], optional): Initialization config dict.
-             Default: None.
-         with_cp (bool): Use checkpoint or not. Using checkpoint will save
-             some memory while slowing down the training speed. Default: False.
-     """
-
-     def __init__(self,
-                  in_channels=3,
-                  embed_dims=64,
-                  num_stages=4,
-                  num_layers=[3, 4, 6, 3],
-                  num_heads=[1, 2, 4, 8],
-                  patch_sizes=[7, 3, 3, 3],
-                  strides=[4, 2, 2, 2],
-                  sr_ratios=[8, 4, 2, 1],
-                  out_indices=(0, 1, 2, 3),
-                  mlp_ratio=4,
-                  qkv_bias=True,
-                  drop_rate=0.,
-                  attn_drop_rate=0.,
-                  drop_path_rate=0.,
-                  tome_cfg=[dict(), dict(), dict(), dict()],
-                  act_cfg=dict(type='GELU'),
-                  norm_cfg=dict(type='LN', eps=1e-6),
-                  init_cfg=None,
-                  with_cp=False,
-                  down_sample=False):
-         super().__init__(init_cfg=init_cfg)
-
-         self.embed_dims = embed_dims
-         self.num_stages = num_stages
-         self.num_layers = num_layers
-         self.num_heads = num_heads
-         self.patch_sizes = patch_sizes
-         self.strides = strides
-         self.sr_ratios = sr_ratios
-         self.with_cp = with_cp
-         self.down_sample = down_sample
-         assert num_stages == len(num_layers) == len(num_heads) \
-                == len(patch_sizes) == len(strides) == len(sr_ratios)
-
-         self.out_indices = out_indices
-         assert max(out_indices) < self.num_stages
-
-         # transformer encoder
-         dpr = [
-             x.item()
-             for x in torch.linspace(0, drop_path_rate, sum(num_layers))
-         ]  # stochastic num_layer decay rule
-
-         cur = 0
-         self.layers = ModuleList()
-         for i, num_layer in enumerate(num_layers):
-             embed_dims_i = embed_dims * num_heads[i]
-             patch_embed = PatchEmbed(
-                 in_channels=in_channels,
-                 embed_dims=embed_dims_i,
-                 kernel_size=patch_sizes[i],
-                 stride=strides[i],
-                 padding=patch_sizes[i] // 2,
-                 norm_cfg=norm_cfg)
-             layer = ModuleList([
-                 TransformerEncoderLayer(
-                     embed_dims=embed_dims_i,
-                     num_heads=num_heads[i],
-                     feedforward_channels=mlp_ratio * embed_dims_i,
-                     drop_rate=drop_rate,
-                     attn_drop_rate=attn_drop_rate,
-                     drop_path_rate=dpr[cur + idx],
-                     qkv_bias=qkv_bias,
-                     tome_cfg=tome_cfg[i],
-                     act_cfg=act_cfg,
-                     norm_cfg=norm_cfg,
-                     with_cp=with_cp,
-                     sr_ratio=sr_ratios[i]) for idx in range(num_layer)
-             ])
-             in_channels = embed_dims_i
-             # The ret[0] of build_norm_layer is norm name.
-             norm = build_norm_layer(norm_cfg, embed_dims_i)[1]
-             self.layers.append(ModuleList([patch_embed, layer, norm]))
-             cur += num_layer
-
-     def init_weights(self):
-         if self.init_cfg is None:
-             for m in self.modules():
-                 if isinstance(m, nn.Linear):
-                     trunc_normal_init(m, std=.02, bias=0.)
-                 elif isinstance(m, nn.LayerNorm):
-                     constant_init(m, val=1.0, bias=0.)
-                 elif isinstance(m, nn.Conv2d):
-                     fan_out = m.kernel_size[0] * m.kernel_size[
-                         1] * m.out_channels
-                     fan_out //= m.groups
-                     normal_init(
-                         m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0)
-         else:
-             super().init_weights()
-
-     def forward(self, x):
-         if self.down_sample:
-             x = torch.nn.functional.interpolate(x, scale_factor=(0.5, 0.5))
-         outs = []
-
-         for i, layer in enumerate(self.layers):
-             x, hw_shape = layer[0](x)
-             for block in layer[1]:
-                 x = block(x, hw_shape)
-             x = layer[2](x)
-             x = nlc_to_nchw(x, hw_shape)
-             if i in self.out_indices:
-                 outs.append(x)
-
-         return outs

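The per-stage `tome_cfg` dict read by `EfficientMultiheadAttention` above selects `q_mode`/`kv_mode` ('n1d', 'n2d', 'bsm', or None) plus mode-specific keys: `q_s`/`kv_s` for neighbor merging, and `q_r`, `q_sx`, `q_sy` (or the `kv_` equivalents) for bipartite soft matching. A hand-written four-stage strategy could look like this sketch; the values are illustrative, not tuned:

custom_strategy = [
    dict(q_mode=None, kv_mode='n2d', kv_s=(2, 2)),              # stage 1: 2x2 average-pool the KV tokens
    dict(q_mode=None, kv_mode='n2d', kv_s=(2, 2)),              # stage 2: same
    dict(q_mode='n1d', kv_mode=None, q_s=2),                    # stage 3: merge neighboring Q tokens in 1D
    dict(q_mode='bsm', kv_mode=None, q_r=0.5, q_sx=2, q_sy=2),  # stage 4: soft-match away 50% of Q tokens
]
# passed as tome_strategy to create_model() / create_custom_model()
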
build/lib/segformer_plusplus/model/head/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .segformer_head import SegformerHead
-
- __all__ = ['SegformerHead']

build/lib/segformer_plusplus/model/head/segformer_head.py DELETED
@@ -1,95 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- import torch
- import torch.nn as nn
- from mmcv.cnn import ConvModule
- from mmengine.model import BaseModule
-
- from ...utils import MODELS
- from ...utils import resize
-
-
- @MODELS.register_module()
- class SegformerHead(BaseModule):
-     """The all mlp Head of segformer.
-
-     This head is the implementation of
-     `Segformer <https://arxiv.org/abs/2105.15203>`_.
-
-     Args:
-         interpolate_mode: The interpolate mode of MLP head upsample operation.
-             Default: 'bilinear'.
-     """
-
-     def __init__(self,
-                  in_channels=[32, 64, 160, 256],
-                  in_index=[0, 1, 2, 3],
-                  channels=256,
-                  dropout_ratio=0.1,
-                  out_channels=19,
-                  norm_cfg=None,
-                  align_corners=False,
-                  interpolate_mode='bilinear'):
-         super().__init__()
-
-         self.in_channels = in_channels
-         self.in_index = in_index
-         self.channels = channels
-         self.dropout_ratio = dropout_ratio
-         self.out_channels = out_channels
-         self.norm_cfg = norm_cfg
-         self.align_corners = align_corners
-         self.interpolate_mode = interpolate_mode
-
-         self.act_cfg = dict(type='ReLU')
-         self.conv_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1)
-         if dropout_ratio > 0:
-             self.dropout = nn.Dropout2d(dropout_ratio)
-         else:
-             self.dropout = None
-
-         num_inputs = len(self.in_channels)
-
-         assert num_inputs == len(self.in_index)
-
-         self.convs = nn.ModuleList()
-         for i in range(num_inputs):
-             self.convs.append(
-                 ConvModule(
-                     in_channels=self.in_channels[i],
-                     out_channels=self.channels,
-                     kernel_size=1,
-                     stride=1,
-                     norm_cfg=self.norm_cfg,
-                     act_cfg=self.act_cfg))
-
-         self.fusion_conv = ConvModule(
-             in_channels=self.channels * num_inputs,
-             out_channels=self.channels,
-             kernel_size=1,
-             norm_cfg=self.norm_cfg)
-
-     def cls_seg(self, feat):
-         """Classify each pixel."""
-         if self.dropout is not None:
-             feat = self.dropout(feat)
-         output = self.conv_seg(feat)
-         return output
-
-     def forward(self, inputs):
-         # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32
-         outs = []
-         for idx in range(len(inputs)):
-             x = inputs[idx]
-             conv = self.convs[idx]
-             outs.append(
-                 resize(
-                     input=conv(x),
-                     size=inputs[0].shape[2:],
-                     mode=self.interpolate_mode,
-                     align_corners=self.align_corners))
-
-         out = self.fusion_conv(torch.cat(outs, dim=1))
-
-         out = self.cls_seg(out)
-
-         return out

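Shape-wise, the head projects each backbone map to `channels`, resizes everything to the stride-4 map, concatenates, fuses, and classifies. A self-contained check (assuming mmcv is installed; the import path follows the package layout above, and the feature sizes correspond to a 1024x1024 input):

import torch
from segformer_plusplus.model.head import SegformerHead

head = SegformerHead(in_channels=[32, 64, 160, 256], channels=256, out_channels=19)
feats = [torch.rand(1, c, 256 // 2 ** i, 256 // 2 ** i)
         for i, c in enumerate([32, 64, 160, 256])]   # strides 4, 8, 16, 32
print(head(feats).shape)  # torch.Size([1, 19, 256, 256])
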
build/lib/segformer_plusplus/random_benchmark.py DELETED
@@ -1,61 +0,0 @@
- from typing import Union, List, Tuple
-
- import numpy as np
- import torch
-
- from .utils import benchmark
-
- device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-
-
- def random_benchmark(
-         model: torch.nn.Module,
-         batch_size: Union[int, List[int]] = 1,
-         image_size: Union[Tuple[int], List[Tuple[int]]] = (3, 1024, 1024),
- ):
-     """
-     Calculate the FPS of a given model using randomly generated tensors.
-
-     Args:
-         model: instance of a model (e.g. SegFormer)
-         batch_size: the batch size(s) at which to calculate the FPS (e.g. 1 or [1, 2, 4])
-         image_size: the size of the images to use (e.g. (3, 1024, 1024))
-
-     Returns: the FPS values calculated for all image sizes and batch sizes in the form of a dictionary
-
-     """
-     if isinstance(batch_size, int):
-         batch_size = [batch_size]
-     if isinstance(image_size, tuple):
-         image_size = [image_size]
-
-     values = {}
-     throughput_values = []
-
-     for i in image_size:
-         # fill with fps for each batch size
-         fps = []
-         for b in batch_size:
-             for _ in range(4):
-                 # Baseline benchmark
-                 if i[1] >= 1024:
-                     r = 16
-                 else:
-                     r = 32
-                 baseline_throughput = benchmark(
-                     model.to(device),
-                     device=device,
-                     verbose=True,
-                     runs=r,
-                     batch_size=b,
-                     input_size=i
-                 )
-                 throughput_values.append(baseline_throughput)
-             throughput_values = np.asarray(throughput_values)
-             throughput = np.around(np.mean(throughput_values), decimals=2)
-             print('Im_size:', i, 'Batch_size:', b, 'Mean:', throughput, 'Std:',
-                   np.around(np.std(throughput_values), decimals=2))
-             throughput_values = []
-             fps.append({b: throughput})
-         values[i] = fps
-     return values

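A short usage sketch for this API (the same function remains in the package; the reported numbers below are illustrative only):

from segformer_plusplus import create_model, random_benchmark

model = create_model('b0')
values = random_benchmark(model, batch_size=[1, 2], image_size=(3, 1024, 1024))
# e.g. {(3, 1024, 1024): [{1: 41.3}, {2: 43.9}]} -- FPS keyed by image size, then batch size
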
build/lib/segformer_plusplus/utils/__init__.py DELETED
@@ -1,12 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- from .embed import PatchEmbed
- from .shape_convert import nchw_to_nlc, nlc_to_nchw
- from .wrappers import resize
- from .tome_presets import tome_presets
- from .registry import MODELS
- from .imagenet_weights import imagenet_weights
- from .benchmark import benchmark
-
- __all__ = [
-     'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'resize', 'tome_presets', 'MODELS', 'imagenet_weights', 'benchmark'
- ]

build/lib/segformer_plusplus/utils/benchmark.py DELETED
@@ -1,76 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
-
- # Source: https://github.com/facebookresearch/ToMe/blob/main/tome/utils.py
- # --------------------------------------------------------
-
- import time
- from typing import Tuple
-
- import torch
- from tqdm import tqdm
-
-
- def benchmark(
-         model: torch.nn.Module,
-         device: torch.device = 0,
-         input_size: Tuple[int] = (3, 224, 224),
-         batch_size: int = 64,
-         runs: int = 40,
-         throw_out: float = 0.25,
-         use_fp16: bool = False,
-         verbose: bool = False,
- ) -> float:
-     """
-     Benchmark the given model with random inputs at the given batch size.
-
-     Args:
-         - model: the module to benchmark
-         - device: the device to use for benchmarking
-         - input_size: the input size to pass to the model (channels, h, w)
-         - batch_size: the batch size to use for evaluation
-         - runs: the number of total runs to do
-         - throw_out: the percentage of runs to throw out at the start of testing
-         - use_fp16: whether or not to benchmark with float16 and autocast
-         - verbose: whether or not to use tqdm to print progress / print throughput at end
-
-     Returns:
-         - the throughput measured in images / second
-     """
-     if not isinstance(device, torch.device):
-         device = torch.device(device)
-     is_cuda = torch.device(device).type == "cuda"
-
-     model = model.eval().to(device)
-     input = torch.rand(batch_size, *input_size, device=device)
-     if use_fp16:
-         input = input.half()
-
-     warm_up = int(runs * throw_out)
-     total = 0
-     start = time.time()
-
-     with torch.autocast(device.type, enabled=use_fp16):
-         with torch.no_grad():
-             for i in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
-                 if i == warm_up:
-                     if is_cuda:
-                         torch.cuda.synchronize()
-                     total = 0
-                     start = time.time()
-
-                 model(input)
-                 total += batch_size
-
-     if is_cuda:
-         torch.cuda.synchronize()
-
-     end = time.time()
-     elapsed = end - start
-
-     throughput = total / elapsed
-
-     if verbose:
-         print(f"Throughput: {throughput:.2f} im/s")
-
-     return throughput

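`benchmark` itself is model-agnostic; a minimal sketch (any `nn.Module` works, the conv here is just a stand-in):

import torch
from segformer_plusplus.utils import benchmark

model = torch.nn.Conv2d(3, 19, kernel_size=3, padding=1)
ips = benchmark(model, device='cuda' if torch.cuda.is_available() else 'cpu',
                input_size=(3, 512, 512), batch_size=8, runs=20, verbose=True)
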
build/lib/segformer_plusplus/utils/embed.py DELETED
@@ -1,330 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- import math
- from typing import Sequence
-
- import torch.nn as nn
- import torch.nn.functional as F
- from mmcv.cnn import build_conv_layer, build_norm_layer
- from mmengine.model import BaseModule
- from mmengine.utils import to_2tuple
-
-
- class AdaptivePadding(nn.Module):
-     """Applies padding to input (if needed) so that input can get fully covered
-     by filter you specified. It supports two modes "same" and "corner". The
-     "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
-     input. The "corner" mode would pad zero to bottom right.
-
-     Args:
-         kernel_size (int | tuple): Size of the kernel.
-         stride (int | tuple): Stride of the filter. Default: 1.
-         dilation (int | tuple): Spacing between kernel elements.
-             Default: 1.
-         padding (str): Support "same" and "corner", "corner" mode
-             would pad zero to bottom right, and "same" mode would
-             pad zero around input. Default: "corner".
-     Example:
-         >>> kernel_size = 16
-         >>> stride = 16
-         >>> dilation = 1
-         >>> input = torch.rand(1, 1, 15, 17)
-         >>> adap_pad = AdaptivePadding(
-         >>>     kernel_size=kernel_size,
-         >>>     stride=stride,
-         >>>     dilation=dilation,
-         >>>     padding="corner")
-         >>> out = adap_pad(input)
-         >>> assert (out.shape[2], out.shape[3]) == (16, 32)
-         >>> input = torch.rand(1, 1, 16, 17)
-         >>> out = adap_pad(input)
-         >>> assert (out.shape[2], out.shape[3]) == (16, 32)
-     """
-
-     def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
-
-         super().__init__()
-
-         assert padding in ('same', 'corner')
-
-         kernel_size = to_2tuple(kernel_size)
-         stride = to_2tuple(stride)
-         dilation = to_2tuple(dilation)
-
-         self.padding = padding
-         self.kernel_size = kernel_size
-         self.stride = stride
-         self.dilation = dilation
-
-     def get_pad_shape(self, input_shape):
-         input_h, input_w = input_shape
-         kernel_h, kernel_w = self.kernel_size
-         stride_h, stride_w = self.stride
-         output_h = math.ceil(input_h / stride_h)
-         output_w = math.ceil(input_w / stride_w)
-         pad_h = max((output_h - 1) * stride_h +
-                     (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
-         pad_w = max((output_w - 1) * stride_w +
-                     (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
-         return pad_h, pad_w
-
-     def forward(self, x):
-         pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
-         if pad_h > 0 or pad_w > 0:
-             if self.padding == 'corner':
-                 x = F.pad(x, [0, pad_w, 0, pad_h])
-             elif self.padding == 'same':
-                 x = F.pad(x, [
-                     pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
-                     pad_h - pad_h // 2
-                 ])
-         return x
-
-
- class PatchEmbed(BaseModule):
-     """Image to Patch Embedding.
-
-     We use a conv layer to implement PatchEmbed.
-
-     Args:
-         in_channels (int): The num of input channels. Default: 3
-         embed_dims (int): The dimensions of embedding. Default: 768
-         conv_type (str): The config dict for embedding
-             conv layer type selection. Default: "Conv2d".
-         kernel_size (int): The kernel_size of embedding conv. Default: 16.
-         stride (int, optional): The slide stride of embedding conv.
-             Default: None (Would be set as `kernel_size`).
-         padding (int | tuple | string): The padding length of
-             embedding conv. When it is a string, it means the mode
-             of adaptive padding, support "same" and "corner" now.
-             Default: "corner".
-         dilation (int): The dilation rate of embedding conv. Default: 1.
-         bias (bool): Bias of embed conv. Default: True.
-         norm_cfg (dict, optional): Config dict for normalization layer.
-             Default: None.
-         input_size (int | tuple | None): The size of input, which will be
-             used to calculate the out size. Only work when `dynamic_size`
-             is False. Default: None.
-         init_cfg (`mmengine.ConfigDict`, optional): The Config for
-             initialization. Default: None.
-     """
-
-     def __init__(self,
-                  in_channels=3,
-                  embed_dims=768,
-                  conv_type='Conv2d',
-                  kernel_size=16,
-                  stride=None,
-                  padding='corner',
-                  dilation=1,
-                  bias=True,
-                  norm_cfg=None,
-                  input_size=None,
-                  init_cfg=None):
-         super().__init__(init_cfg=init_cfg)
-
-         self.embed_dims = embed_dims
-         if stride is None:
-             stride = kernel_size
-
-         kernel_size = to_2tuple(kernel_size)
-         stride = to_2tuple(stride)
-         dilation = to_2tuple(dilation)
-
-         if isinstance(padding, str):
-             self.adap_padding = AdaptivePadding(
-                 kernel_size=kernel_size,
-                 stride=stride,
-                 dilation=dilation,
-                 padding=padding)
-             # disable the padding of conv
-             padding = 0
-         else:
-             self.adap_padding = None
-         padding = to_2tuple(padding)
-
-         self.projection = build_conv_layer(
-             dict(type=conv_type),
-             in_channels=in_channels,
-             out_channels=embed_dims,
-             kernel_size=kernel_size,
-             stride=stride,
-             padding=padding,
-             dilation=dilation,
-             bias=bias)
-
-         if norm_cfg is not None:
-             self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
-         else:
-             self.norm = None
-
-         if input_size:
-             input_size = to_2tuple(input_size)
-             # `init_out_size` would be used outside to
-             # calculate the num_patches
-             # when `use_abs_pos_embed` outside
-             self.init_input_size = input_size
-             if self.adap_padding:
-                 pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
-                 input_h, input_w = input_size
-                 input_h = input_h + pad_h
-                 input_w = input_w + pad_w
-                 input_size = (input_h, input_w)
-
-             # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
-             h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
-                      (kernel_size[0] - 1) - 1) // stride[0] + 1
-             w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
-                      (kernel_size[1] - 1) - 1) // stride[1] + 1
-             self.init_out_size = (h_out, w_out)
-         else:
-             self.init_input_size = None
-             self.init_out_size = None
-
-     def forward(self, x):
-         """
-         Args:
-             x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
-
-         Returns:
-             tuple: Contains merged results and its spatial shape.
-
-             - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
-             - out_size (tuple[int]): Spatial shape of x, arrange as
-               (out_h, out_w).
-         """
-
-         if self.adap_padding:
-             x = self.adap_padding(x)
-
-         x = self.projection(x)
-         out_size = (x.shape[2], x.shape[3])
-         x = x.flatten(2).transpose(1, 2)
-         if self.norm is not None:
-             x = self.norm(x)
-         return x, out_size
-
-
- class PatchMerging(BaseModule):
-     """Merge patch feature map.
-
-     This layer groups feature map by kernel_size, and applies norm and linear
-     layers to the grouped feature map. Our implementation uses `nn.Unfold` to
-     merge patch, which is about 25% faster than original implementation.
-     Instead, we need to modify pretrained models for compatibility.
-
-     Args:
-         in_channels (int): The num of input channels.
-         out_channels (int): The num of output channels.
-         kernel_size (int | tuple, optional): the kernel size in the unfold
-             layer. Defaults to 2.
-         stride (int | tuple, optional): the stride of the sliding blocks in the
-             unfold layer. Default: None. (Would be set as `kernel_size`)
-         padding (int | tuple | string): The padding length of
-             embedding conv. When it is a string, it means the mode
-             of adaptive padding, support "same" and "corner" now.
-             Default: "corner".
-         dilation (int | tuple, optional): dilation parameter in the unfold
-             layer. Default: 1.
-         bias (bool, optional): Whether to add bias in linear layer or not.
-             Defaults: False.
-         norm_cfg (dict, optional): Config dict for normalization layer.
-             Default: dict(type='LN').
-         init_cfg (dict, optional): The extra config for initialization.
-             Default: None.
-     """
-
-     def __init__(self,
-                  in_channels,
-                  out_channels,
-                  kernel_size=2,
-                  stride=None,
-                  padding='corner',
-                  dilation=1,
-                  bias=False,
-                  norm_cfg=dict(type='LN'),
-                  init_cfg=None):
-         super().__init__(init_cfg=init_cfg)
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         if stride:
-             stride = stride
-         else:
-             stride = kernel_size
-
-         kernel_size = to_2tuple(kernel_size)
-         stride = to_2tuple(stride)
-         dilation = to_2tuple(dilation)
-
-         if isinstance(padding, str):
-             self.adap_padding = AdaptivePadding(
-                 kernel_size=kernel_size,
-                 stride=stride,
-                 dilation=dilation,
-                 padding=padding)
-             # disable the padding of unfold
-             padding = 0
-         else:
-             self.adap_padding = None
-
-         padding = to_2tuple(padding)
-         self.sampler = nn.Unfold(
-             kernel_size=kernel_size,
-             dilation=dilation,
-             padding=padding,
-             stride=stride)
-
-         sample_dim = kernel_size[0] * kernel_size[1] * in_channels
-
-         if norm_cfg is not None:
-             self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
-         else:
-             self.norm = None
-
-         self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
-
-     def forward(self, x, input_size):
-         """
-         Args:
-             x (Tensor): Has shape (B, H*W, C_in).
-             input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
-                 Default: None.
-
-         Returns:
-             tuple: Contains merged results and its spatial shape.
-
-             - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
-             - out_size (tuple[int]): Spatial shape of x, arrange as
-               (Merged_H, Merged_W).
-         """
-         B, L, C = x.shape
-         assert isinstance(input_size, Sequence), f'Expect ' \
-                                                  f'input_size is ' \
-                                                  f'`Sequence` ' \
-                                                  f'but get {input_size}'
-
-         H, W = input_size
-         assert L == H * W, 'input feature has wrong size'
-
-         x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
-         # Use nn.Unfold to merge patch. About 25% faster than original method,
-         # but need to modify pretrained model for compatibility
-
-         if self.adap_padding:
-             x = self.adap_padding(x)
-             H, W = x.shape[-2:]
-
-         x = self.sampler(x)
-         # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
-
-         out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
-                  (self.sampler.kernel_size[0] - 1) -
-                  1) // self.sampler.stride[0] + 1
-         out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
-                  (self.sampler.kernel_size[1] - 1) -
-                  1) // self.sampler.stride[1] + 1
-
-         output_size = (out_h, out_w)
-         x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
-         x = self.norm(x) if self.norm else x
-         x = self.reduction(x)
-         return x, output_size

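For the overlapped patch embedding, a quick shape check (assuming mmcv/mmengine are installed; these are the stage-1 settings of the b0 config above):

import torch
from segformer_plusplus.utils import PatchEmbed

pe = PatchEmbed(in_channels=3, embed_dims=32, kernel_size=7, stride=4,
                padding='corner', norm_cfg=dict(type='LN'))
x, hw = pe(torch.rand(1, 3, 224, 224))
print(x.shape, hw)  # torch.Size([1, 3136, 32]) (56, 56), since 224 / 4 = 56
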
build/lib/segformer_plusplus/utils/imagenet_weights.py DELETED
@@ -1,8 +0,0 @@
- imagenet_weights = {
-     'b0': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth',
-     'b1': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b1_20220624-02e5a6a1.pth',
-     'b2': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b2_20220624-66e8bf70.pth',
-     'b3': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b3_20220624-13b1141c.pth',
-     'b4': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b4_20220624-d588d980.pth',
-     'b5': 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth'
- }

build/lib/segformer_plusplus/utils/registry.py DELETED
@@ -1,6 +0,0 @@
- from mmengine import Registry
-
- MODELS = Registry(
-     'models',
-     locations=['segformer_plusplus.model.backbone', 'segformer_plusplus.model.head']
- )

build/lib/segformer_plusplus/utils/shape_convert.py DELETED
@@ -1,107 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- def nlc_to_nchw(x, hw_shape):
-     """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
-
-     Args:
-         x (Tensor): The input tensor of shape [N, L, C] before conversion.
-         hw_shape (Sequence[int]): The height and width of output feature map.
-
-     Returns:
-         Tensor: The output tensor of shape [N, C, H, W] after conversion.
-     """
-     H, W = hw_shape
-     assert len(x.shape) == 3
-     B, L, C = x.shape
-     assert L == H * W, 'The seq_len doesn\'t match H, W'
-     return x.transpose(1, 2).reshape(B, C, H, W)
-
-
- def nchw_to_nlc(x):
-     """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
-
-     Args:
-         x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
-
-     Returns:
-         Tensor: The output tensor of shape [N, L, C] after conversion.
-     """
-     assert len(x.shape) == 4
-     return x.flatten(2).transpose(1, 2).contiguous()
-
-
- def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
-     """Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor. Use the
-     reshaped tensor as the input of `module`, and convert the output of
-     `module`, whose shape is [N, L, C], back to [N, C, H, W].
-
-     Args:
-         module (Callable): A callable object that takes a tensor
-             with shape [N, L, C] as input.
-         x (Tensor): The input tensor of shape [N, C, H, W].
-         contiguous (Bool): Whether to make the tensor contiguous
-             after each shape transform.
-
-     Returns:
-         Tensor: The output tensor of shape [N, C, H, W].
-
-     Example:
-         >>> import torch
-         >>> import torch.nn as nn
-         >>> norm = nn.LayerNorm(4)
-         >>> feature_map = torch.rand(4, 4, 5, 5)
-         >>> output = nchw2nlc2nchw(norm, feature_map)
-     """
-     B, C, H, W = x.shape
-     if not contiguous:
-         x = x.flatten(2).transpose(1, 2)
-         x = module(x, **kwargs)
-         x = x.transpose(1, 2).reshape(B, C, H, W)
-     else:
-         x = x.flatten(2).transpose(1, 2).contiguous()
-         x = module(x, **kwargs)
-         x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
-     return x
-
-
- def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
-     """Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor. Use the
-     reshaped tensor as the input of `module`, and convert the output of
-     `module`, whose shape is [N, C, H, W], back to [N, L, C].
-
-     Args:
-         module (Callable): A callable object that takes a tensor
-             with shape [N, C, H, W] as input.
-         x (Tensor): The input tensor of shape [N, L, C].
-         hw_shape (Sequence[int]): The height and width of the
-             feature map with shape [N, C, H, W].
-         contiguous (Bool): Whether to make the tensor contiguous
-             after each shape transform.
-
-     Returns:
-         Tensor: The output tensor of shape [N, L, C].
-
-     Example:
-         >>> import torch
-         >>> import torch.nn as nn
-         >>> conv = nn.Conv2d(16, 16, 3, 1, 1)
-         >>> feature_map = torch.rand(4, 25, 16)
-         >>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
-     """
-     H, W = hw_shape
-     assert len(x.shape) == 3
-     B, L, C = x.shape
-     assert L == H * W, 'The seq_len doesn\'t match H, W'
-     if not contiguous:
-         x = x.transpose(1, 2).reshape(B, C, H, W)
-         x = module(x, **kwargs)
-         x = x.flatten(2).transpose(1, 2)
-     else:
-         x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
-         x = module(x, **kwargs)
-         x = x.flatten(2).transpose(1, 2).contiguous()
-     return x

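The two conversions are exact inverses whenever `L == H * W`; a quick round-trip check:

import torch
from segformer_plusplus.utils import nchw_to_nlc, nlc_to_nchw

x = torch.rand(2, 64, 32, 48)             # [N, C, H, W]
seq = nchw_to_nlc(x)                      # [2, 1536, 64], with L = 32 * 48
assert torch.equal(nlc_to_nchw(seq, (32, 48)), x)
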
build/lib/segformer_plusplus/utils/tome_presets.py DELETED
@@ -1,20 +0,0 @@
- tome_presets = {
-     'bsm_hq': [
-         dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
-         dict(q_mode=None, kv_mode='bsm', kv_r=0.6, kv_sx=2, kv_sy=2),
-         dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4),
-         dict(q_mode='bsm', kv_mode=None, q_r=0.8, q_sx=4, q_sy=4)
-     ],
-     'bsm_fast': [
-         dict(q_mode=None, kv_mode='bsm_r2D', kv_r=0.9, kv_sx=4, kv_sy=4),
-         dict(q_mode=None, kv_mode='bsm_r2D', kv_r=0.9, kv_sx=4, kv_sy=4),
-         dict(q_mode='bsm_r2D', kv_mode=None, q_r=0.9, q_sx=4, q_sy=4),
-         dict(q_mode='bsm_r2D', kv_mode=None, q_r=0.9, q_sx=4, q_sy=4)
-     ],
-     'n2d_2x2': [
-         dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-         dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-         dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2)),
-         dict(q_mode='neighbor_2D', kv_mode=None, q_s=(2, 2))
-     ]
- }

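In `mit.py` above, the fractional `kv_r`/`q_r` values are converted into the absolute token count `r` that `bipartite_soft_matching_random2d` expects; the arithmetic for the 'bsm_hq' preset on a 64x64 KV map:

tokens = 64 * 64          # KV sequence length after spatial reduction
kv_r = 0.6                # from 'bsm_hq'
r = int(tokens * kv_r)    # 2457 of 4096 tokens get merged away

Note that the 'bsm_fast' and 'n2d_2x2' presets use mode names ('bsm_r2D', 'neighbor_2D') that this deleted build copy of mit.py never branches on (it checks 'bsm', 'n2d', 'n1d'); presumably the live package resolves those names elsewhere.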
build/lib/segformer_plusplus/utils/wrappers.py DELETED
@@ -1,51 +0,0 @@
- # Copyright (c) OpenMMLab. All rights reserved.
- import warnings
-
- import torch.nn as nn
- import torch.nn.functional as F
-
-
- def resize(input,
-            size=None,
-            scale_factor=None,
-            mode='nearest',
-            align_corners=None,
-            warning=True):
-     if warning:
-         if size is not None and align_corners:
-             input_h, input_w = tuple(int(x) for x in input.shape[2:])
-             output_h, output_w = tuple(int(x) for x in size)
-             if output_h > input_h or output_w > input_w:
-                 if ((output_h > 1 and output_w > 1 and input_h > 1
-                      and input_w > 1) and (output_h - 1) % (input_h - 1)
-                         and (output_w - 1) % (input_w - 1)):
-                     warnings.warn(
-                         f'When align_corners={align_corners}, '
-                         'the output would be more aligned if '
-                         f'input size {(input_h, input_w)} is `x+1` and '
-                         f'out size {(output_h, output_w)} is `nx+1`')
-     return F.interpolate(input, size, scale_factor, mode, align_corners)
-
-
- class Upsample(nn.Module):
-
-     def __init__(self,
-                  size=None,
-                  scale_factor=None,
-                  mode='nearest',
-                  align_corners=None):
-         super().__init__()
-         self.size = size
-         if isinstance(scale_factor, tuple):
-             self.scale_factor = tuple(float(factor) for factor in scale_factor)
-         else:
-             self.scale_factor = float(scale_factor) if scale_factor else None
-         self.mode = mode
-         self.align_corners = align_corners
-
-     def forward(self, x):
-         if not self.size:
-             size = [int(t * self.scale_factor) for t in x.shape[-2:]]
-         else:
-             size = self.size
-         return resize(x, size, None, self.mode, self.align_corners)

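`resize` is a thin wrapper over `F.interpolate` with an extra align_corners sanity warning; for instance:

import torch
from segformer_plusplus.utils import resize

feat = torch.rand(1, 256, 32, 32)
up = resize(feat, size=(128, 128), mode='bilinear', align_corners=False)  # (1, 256, 128, 128)
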
segformer_plusplus.egg-info/SOURCES.txt CHANGED
@@ -1,7 +1,10 @@
 setup.py
 segformer_plusplus/__init__.py
 segformer_plusplus/build_model.py
+ segformer_plusplus/cityscape_benchmark.py
 segformer_plusplus/random_benchmark.py
+ segformer_plusplus/start_cityscape_benchmark.py
+ segformer_plusplus/start_random_benchmark.py
 segformer_plusplus.egg-info/PKG-INFO
 segformer_plusplus.egg-info/SOURCES.txt
 segformer_plusplus.egg-info/dependency_links.txt
@@ -15,15 +18,22 @@ segformer_plusplus/configs/segformer_mit_b3.py
 segformer_plusplus/configs/segformer_mit_b4.py
 segformer_plusplus/configs/segformer_mit_b5.py
 segformer_plusplus/model/__init__.py
+ segformer_plusplus/model/base_module.py
+ segformer_plusplus/model/utils.py
+ segformer_plusplus/model/weight_init.py
 segformer_plusplus/model/backbone/__init__.py
 segformer_plusplus/model/backbone/mit.py
 segformer_plusplus/model/head/__init__.py
 segformer_plusplus/model/head/segformer_head.py
 segformer_plusplus/utils/__init__.py
+ segformer_plusplus/utils/activation.py
 segformer_plusplus/utils/benchmark.py
+ segformer_plusplus/utils/build_functions.py
 segformer_plusplus/utils/embed.py
 segformer_plusplus/utils/imagenet_weights.py
+ segformer_plusplus/utils/manager.py
 segformer_plusplus/utils/registry.py
 segformer_plusplus/utils/shape_convert.py
 segformer_plusplus/utils/tome_presets.py
+ segformer_plusplus/utils/version_utils.py
 segformer_plusplus/utils/wrappers.py
segformer_plusplus.egg-info/requires.txt CHANGED
@@ -1,2 +1,5 @@
+ numpy
+ omegaconf
+ pyyaml
 tomesd
 torch>=2.0.1
segformer_plusplus/cityscape/berlin_000543_000019_leftImg8bit.png ADDED

Git LFS Details

  • SHA256: 3d616adab2c462fdee7f47a2de927436aebfb73843c38c3e3fbc85a6220955d4
  • Pointer size: 132 Bytes
  • Size of remote file: 2.37 MB
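
With the .gitattributes rules added above, the repository itself now stores only a small LFS pointer for this image, while the 2.37 MB blob lives in LFS storage. Per the LFS v1 spec, the pointer contents are (the exact byte count is not shown in this view, so it is left as a placeholder):

version https://git-lfs.github.com/spec/v1
oid sha256:3d616adab2c462fdee7f47a2de927436aebfb73843c38c3e3fbc85a6220955d4
size <exact byte count, ~2.37 MB>
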
segformer_plusplus/cityscape_benchmark.py CHANGED
@@ -14,6 +14,8 @@ print(f"Using device: {device}")
 if device.type == 'cuda':
     print(f"CUDA Device Name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 
+torch.manual_seed(42)
+torch.cuda.manual_seed_all(42)
 
 def cityscape_benchmark(
     model: torch.nn.Module,
@@ -96,17 +98,7 @@ def cityscape_benchmark(
 
     if save_output:
         with torch.no_grad():
-            with open("model_output_log.txt", "w") as f:
-                f.write("=== Model Input Info ===\n")
-                f.write(f"Input tensor:\n{img_tensor}\n")
-                f.write(f"Input shape: {img_tensor.shape}\n")
-                f.write(f"Input stats: mean = {img_tensor.mean().item()}, std = {img_tensor.std().item()}\n\n")
-
-                output = model(img_tensor)
-
-                f.write("=== Raw Model Output ===\n")
-                f.write(f"{output}\n\n")
-
+            output = model(img_tensor)
         pred = torch.argmax(output, dim=1).squeeze(0).cpu().numpy()
 
         # Save the prediction as a text file
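The fixed seeds above make the no-checkpoint reference prediction repeatable, but seeding alone does not guarantee bit-exact CUDA inference on every setup. A minimal sketch of the stricter switches PyTorch offers (not part of this commit, and they can cost speed):

import torch

torch.manual_seed(42)                      # seed the CPU RNG
torch.cuda.manual_seed_all(42)             # seed every visible GPU RNG
torch.backends.cudnn.deterministic = True  # select deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False     # disable non-deterministic autotuning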
cityscapes_prediction_output_reference.txt → segformer_plusplus/cityscapes_prediction_output.txt RENAMED
File without changes
segformer_plusplus/cityscapes_prediction_output_reference_b05_nocheckpoint.txt ADDED
The diff for this file is too large to render. See raw diff
 
segformer_plusplus/config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "model_type": "segformerplusplus",
+  "architectures": ["SegFormerPlusPlus"],
+  "backbone": "b5",
+  "supported_backbones": ["b0", "b1", "b2", "b3", "b4", "b5"],
+  "head": "bsm_hq",
+  "supported_heads": ["bsm_hq", "bsm_fast", "n2d_2x2"],
+  "out_channels": 19,
+  "num_labels": 19
+}
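For orientation, a minimal sketch of consuming this config from Python; the relative path and the key lookups simply mirror the schema added above:

import json

with open('segformer_plusplus/config.json') as f:
    cfg = json.load(f)

assert cfg['backbone'] in cfg['supported_backbones']
assert cfg['head'] in cfg['supported_heads']
print(cfg['model_type'], cfg['num_labels'])  # segformerplusplus 19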
segformer_plusplus/configs/config/utils.py CHANGED
@@ -10,11 +10,17 @@ from importlib import import_module as real_import_module
 import json
 import pickle
 from pathlib import Path
-from mim.utils import package2module
+import itertools
 
 import yaml
 from omegaconf import OmegaConf
 
+from pkg_resources.extern import packaging
+__import__('pkg_resources.extern.packaging.version')
+__import__('pkg_resources.extern.packaging.specifiers')
+__import__('pkg_resources.extern.packaging.requirements')
+__import__('pkg_resources.extern.packaging.markers')
+
 
 PYTHON_ROOT_DIR = osp.dirname(osp.dirname(sys.executable))
 SYSTEM_PYTHON_PREFIX = '/usr/lib/python'
@@ -644,4 +650,132 @@ def dump(obj, file=None, file_format=None, **kwargs):
 
 def check_file_exist(filename, msg_tmpl='file "{}" does not exist'):
     if not osp.isfile(filename):
-        raise FileNotFoundError(msg_tmpl.format(filename))
+        raise FileNotFoundError(msg_tmpl.format(filename))
+
+
+def package2module(package: str):
+    """Infer module name from package.
+
+    Args:
+        package (str): Package to infer module name.
+    """
+    pkg = get_distribution(package)
+    if pkg.has_metadata('top_level.txt'):
+        module_name = pkg.get_metadata('top_level.txt').split('\n')[0]
+        return module_name
+    else:
+        raise ValueError(
+            highlighted_error(f'can not infer the module name of {package}'))
+
+
+def get_distribution(dist):
+    """Return the installed distribution for a Requirement or string."""
+    # Resolve via pkg_resources' working set so that package2module can read
+    # the distribution's top_level.txt metadata (a bare Requirement has none).
+    import pkg_resources
+    if isinstance(dist, str):
+        dist = Requirement.parse(dist)
+    return pkg_resources.get_distribution(dist.project_name)
+
+
+def highlighted_error(msg: Union[str, Exception]) -> str:
+    return click.style(msg, fg='red', bold=True)  # type: ignore
+
+
+class Requirement(packaging.requirements.Requirement):
+    def __init__(self, requirement_string):
+        """DO NOT CALL THIS UNDOCUMENTED METHOD; use Requirement.parse()!"""
+        super(Requirement, self).__init__(requirement_string)
+        self.unsafe_name = self.name
+        project_name = safe_name(self.name)
+        self.project_name, self.key = project_name, project_name.lower()
+        self.specs = [
+            (spec.operator, spec.version) for spec in self.specifier]
+        self.extras = tuple(map(safe_extra, self.extras))
+        self.hashCmp = (
+            self.key,
+            self.url,
+            self.specifier,
+            frozenset(self.extras),
+            str(self.marker) if self.marker else None,
+        )
+        self.__hash = hash(self.hashCmp)
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, Requirement) and
+            self.hashCmp == other.hashCmp
+        )
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __contains__(self, item):
+        if item.key != self.key:
+            return False
+
+        item = item.version
+
+        # Allow prereleases always in order to match the previous behavior of
+        # this method. In the future this should be smarter and follow PEP 440
+        # more accurately.
+        return self.specifier.contains(item, prereleases=True)
+
+    def __hash__(self):
+        return self.__hash
+
+    def __repr__(self):
+        return "Requirement.parse(%r)" % str(self)
+
+    @staticmethod
+    def parse(s):
+        req, = parse_requirements(s)
+        return req
+
+
+def parse_requirements(strs):
+    """Yield ``Requirement`` objects for each specification in `strs`
+
+    `strs` must be a string, or a (possibly-nested) iterable thereof.
+    """
+    # create a steppable iterator, so we can handle \-continuations
+    lines = iter(yield_lines(strs))
+
+    for line in lines:
+        # Drop comments -- a hash without a space may be in a URL.
+        if ' #' in line:
+            line = line[:line.find(' #')]
+        # If there is a line continuation, drop it, and append the next line.
+        if line.endswith('\\'):
+            line = line[:-2].strip()
+            try:
+                line += next(lines)
+            except StopIteration:
+                return
+        yield Requirement(line)
+
+
+def yield_lines(strs):
+    """Yield non-blank lines of a string, or of a (nested) iterable of strings."""
+    # Base case: a single string yields its stripped, non-empty lines; without
+    # it the generic branch below would recurse on each character forever.
+    if isinstance(strs, str):
+        return (line.strip() for line in strs.splitlines() if line.strip())
+    return itertools.chain.from_iterable(map(yield_lines, strs))
+
+
+def safe_extra(extra):
+    """Convert an arbitrary string to a standard 'extra' name
+
+    Any runs of non-alphanumeric characters are replaced with a single '_',
+    and the result is always lowercased.
+    """
+    return re.sub('[^A-Za-z0-9.-]+', '_', extra).lower()
+
+
+def safe_name(name):
+    """Convert an arbitrary string to a standard distribution name
+
+    Any runs of non-alphanumeric/. characters are replaced with a single '-'.
+    """
+    return re.sub('[^A-Za-z0-9.]+', '-', name)
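These helpers vendor just enough of mim.utils and pkg_resources to drop the mim dependency. A hedged usage sketch, assuming the queried package is installed and ships top_level.txt metadata:

# Parse a requirement string into the vendored Requirement class.
req = Requirement.parse('torch>=2.0.1')
print(req.project_name, req.specs)  # torch [('>=', '2.0.1')]

# Infer the importable module name of an installed distribution.
print(package2module('segformer_plusplus'))  # e.g. 'segformer_plusplus'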
segformer_plusplus/modeling_segformer_plusplus.py ADDED
@@ -0,0 +1,69 @@
+# modeling_segformer_plusplus.py
+
+from typing import Optional, Tuple
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SemanticSegmenterOutput
+
+# If you want to import SegFormer directly, you have to make sure
+# that this class is available in the same repo.
+from segformer_plusplus.build_model import create_model
+
+
+class SegformerPlusPlusConfig(PretrainedConfig):
+    model_type = "segformer_plusplus"
+
+    def __init__(
+        self,
+        backbone: str = "b5",
+        tome_strategy: Optional[str] = "bsm_hq",
+        num_labels: int = 19,
+        id2label: Optional[dict] = None,
+        label2id: Optional[dict] = None,
+        **kwargs,
+    ):
+        self.backbone = backbone
+        self.tome_strategy = tome_strategy
+        self.num_labels = num_labels
+
+        if id2label is None:
+            id2label = {i: f"class_{i}" for i in range(num_labels)}
+        if label2id is None:
+            label2id = {v: k for k, v in id2label.items()}
+
+        self.id2label = id2label
+        self.label2id = label2id
+
+        super().__init__(**kwargs)
+
+
+class SegformerPlusPlusForSemanticSegmentation(PreTrainedModel):
+    config_class = SegformerPlusPlusConfig
+
+    def __init__(self, config: SegformerPlusPlusConfig):
+        super().__init__(config)
+        self.segformer = create_model(
+            backbone=config.backbone,
+            tome_strategy=config.tome_strategy,
+            out_channels=config.num_labels,
+            pretrained=False,  # no pretrained weights here; they are loaded via .from_pretrained
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> SemanticSegmenterOutput:
+
+        logits = self.segformer(pixel_values)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss(ignore_index=255)
+            loss = loss_fct(logits, labels.long())
+
+        return SemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+        )
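A minimal usage sketch for this wrapper; the dummy input shape is arbitrary, and loading the committed weights would go through .from_pretrained with config.json and pytorch_model.bin sitting next to this file:

import torch
from segformer_plusplus.modeling_segformer_plusplus import (
    SegformerPlusPlusConfig,
    SegformerPlusPlusForSemanticSegmentation,
)

config = SegformerPlusPlusConfig(backbone='b5', tome_strategy='bsm_hq', num_labels=19)
model = SegformerPlusPlusForSemanticSegmentation(config).eval()

pixel_values = torch.randn(1, 3, 512, 512)  # dummy batch of one RGB image
with torch.no_grad():
    out = model(pixel_values)
print(out.logits.shape)  # (1, 19, h, w); spatial size depends on the head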
segformer_plusplus/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e064fa2fb7d618208c2542e76c543b7cb552a3d8997a0c6c4cc0a14da86ba58
+size 328287002
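The .bin is stored as a Git LFS pointer, so a plain clone needs a `git lfs pull` before the file is usable. A hedged sketch of loading it directly, assuming the state-dict keys match the wrapper from the previous sketch:

import torch

state_dict = torch.load('segformer_plusplus/pytorch_model.bin', map_location='cpu')
# 'model' is the SegformerPlusPlusForSemanticSegmentation instance from above.
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print(len(missing), len(unexpected))  # sanity-check the key overlap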
segformer_plusplus/start_cityscape_benchmark.py CHANGED
@@ -7,7 +7,7 @@ from .build_model import create_model
 from .cityscape_benchmark import cityscape_benchmark
 
 parser = argparse.ArgumentParser(description="Segformer Benchmarking Script")
-parser.add_argument('--backbone', type=str, default='b0', choices=['b0', 'b1', 'b2', 'b3', 'b4', 'b5'], help='Model backbone version')
+parser.add_argument('--backbone', type=str, default='b5', choices=['b0', 'b1', 'b2', 'b3', 'b4', 'b5'], help='Model backbone version')
 parser.add_argument('--head', type=str, default='bsm_hq', choices=['bsm_hq', 'bsm_fast', 'n2d_2x2'], help='Model head type')
 parser.add_argument('--checkpoint', type=str, default=None, help='Path to .pth checkpoint file (optional)')
 args = parser.parse_args()
@@ -22,13 +22,15 @@ if args.checkpoint:
 else:
     print("No checkpoint provided – using model as initialized.")
 
-image_path = os.path.expanduser('~/SegformerPlusPlus/mmsegmentation/data/cityscapes/leftImg8bit/test/berlin/berlin_000543_000019_leftImg8bit.png')
+cwd = os.getcwd()
+
+image_path = os.path.join(cwd, 'cityscape', 'berlin_000543_000019_leftImg8bit.png')
 result = cityscape_benchmark(model, image_path)
 
 print("Cityscapes Benchmark Results:", result)
 
-reference_txt_path = os.path.expanduser('~/SegformerPlusPlus/model/cityscapes_prediction_output_reference.txt')
-generated_txt_path = os.path.expanduser('~/SegformerPlusPlus/model/cityscapes_prediction_output.txt')
+reference_txt_path = os.path.join(cwd, 'cityscapes_prediction_output_reference_b05_nocheckpoint.txt')
+generated_txt_path = os.path.join(cwd, 'cityscapes_prediction_output.txt')
 
 if os.path.exists(reference_txt_path) and os.path.exists(generated_txt_path):
     ref_arr = np.loadtxt(reference_txt_path, dtype=int)
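Because the script now resolves everything against os.getcwd(), it has to be launched from the segformer_plusplus/ directory so that the test image and the reference file are found. The comparison it then performs boils down to this sketch (both files are whitespace-separated integer class-id grids):

import numpy as np

ref_arr = np.loadtxt('cityscapes_prediction_output_reference_b05_nocheckpoint.txt', dtype=int)
gen_arr = np.loadtxt('cityscapes_prediction_output.txt', dtype=int)
print('pixel agreement:', (ref_arr == gen_arr).mean())  # 1.0 means identical predictions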
setup.py CHANGED
@@ -5,7 +5,18 @@ setup(
     version="0.2",
     author="Marco Kantonis",
     description="Segformer++: Efficient Token-Merging Strategies for High-Resolution Semantic Segmentation",
-    install_requires=['torch>=2.0.1', 'tomesd','omegaconf', 'pyyaml'],
+    install_requires=[
+        'torch>=2.0.1',
+        'tomesd',
+        'omegaconf',
+        'pyyaml',
+        'numpy',
+        'rich',
+        'yapf',
+        'addict',
+        'tqdm',
+        'packaging'
+    ],
     packages=find_packages(),
     license='MIT',
     long_description="https://arxiv.org/abs/2405.14467"