Tim77777767 committed
Commit · 1a260cd · Parent(s): 02508fb
Adjustments for HF, checkpoint converted, config.json adapted
Browse files:
- config.json (+52 -11)
- hf_segformer_converted/config.json (+57 -0)
- mix_vision_transformer_config.py (+7 -1)
- mmengineToHFCheckpoint.py (+79 -0)
- model.safetensors (+3 -0)
- modeling_my_segformer.py (+21 -13)
- preTrainedTest.py (+13 -0)
- segformer_plusplus/model/backbone/mit.py (+1 -1)
config.json CHANGED
@@ -1,16 +1,57 @@
 {
+  "architectures": [
+    "MySegformerForSemanticSegmentation"
+  ],
+  "attn_drop_rate": 0.0,
+  "drop_path_rate": 0.0,
+  "drop_rate": 0.0,
+  "embed_dims": [
+    64,
+    128,
+    320,
+    512
+  ],
+  "mlp_ratio": 4,
   "model_type": "my_segformer",
-  "
+  "num_classes": 19,
+  "num_heads": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "num_layers": [
+    3,
+    4,
+    6,
+    3
+  ],
   "num_stages": 4,
-  "
-
-
-
-
-
+  "out_indices": [
+    0,
+    1,
+    2,
+    3
+  ],
+  "patch_sizes": [
+    7,
+    3,
+    3,
+    3
+  ],
   "qkv_bias": true,
-  "
-
-
-
+  "sr_ratios": [
+    8,
+    4,
+    2,
+    1
+  ],
+  "strides": [
+    4,
+    2,
+    2,
+    2
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.0"
 }
hf_segformer_converted/config.json ADDED
@@ -0,0 +1,57 @@
{
  "architectures": [
    "MySegformerForSemanticSegmentation"
  ],
  "attn_drop_rate": 0.0,
  "drop_path_rate": 0.0,
  "drop_rate": 0.0,
  "embed_dims": [
    64,
    128,
    320,
    512
  ],
  "mlp_ratio": 4,
  "model_type": "my_segformer",
  "num_classes": 19,
  "num_heads": [
    1,
    2,
    4,
    8
  ],
  "num_layers": [
    3,
    4,
    6,
    3
  ],
  "num_stages": 4,
  "out_indices": [
    0,
    1,
    2,
    3
  ],
  "patch_sizes": [
    7,
    3,
    3,
    3
  ],
  "qkv_bias": true,
  "sr_ratios": [
    8,
    4,
    2,
    1
  ],
  "strides": [
    4,
    2,
    2,
    2
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.55.0"
}
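Since the converted directory now carries the full per-stage lists, a quick consistency check can catch shape mismatches before any weights are loaded. A minimal sketch (a hypothetical helper, not part of the commit), assuming the hf_segformer_converted directory written by this commit:

from mix_vision_transformer_config import MySegformerConfig

config = MySegformerConfig.from_pretrained("hf_segformer_converted")

# Every per-stage list should have exactly num_stages entries
for name in ["embed_dims", "num_layers", "num_heads", "patch_sizes", "strides", "sr_ratios"]:
    values = getattr(config, name)
    assert len(values) == config.num_stages, f"{name} has {len(values)} entries, expected {config.num_stages}"
print("config lists are consistent with num_stages =", config.num_stages)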
mix_vision_transformer_config.py CHANGED
@@ -5,7 +5,7 @@ class MySegformerConfig(PretrainedConfig):
 
     def __init__(
         self,
-        embed_dims=64,
+        embed_dims=[64, 128, 320, 512],
         num_stages=4,
         num_layers=[3, 4, 6, 3],
         num_heads=[1, 2, 4, 8],
@@ -21,6 +21,11 @@ class MySegformerConfig(PretrainedConfig):
         **kwargs
     ):
         super().__init__(**kwargs)
+
+        # Safety net in case embed_dims is passed as an int
+        if isinstance(embed_dims, int):
+            embed_dims = [embed_dims]
+
         self.embed_dims = embed_dims
         self.num_stages = num_stages
         self.num_layers = num_layers
@@ -34,3 +39,4 @@ class MySegformerConfig(PretrainedConfig):
         self.attn_drop_rate = attn_drop_rate
         self.drop_path_rate = drop_path_rate
         self.out_indices = out_indices
+
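To illustrate the guard just added, a short sketch (assuming only the constructor shown above) of how an int now normalizes to a list:

from mix_vision_transformer_config import MySegformerConfig

cfg = MySegformerConfig(embed_dims=64)
print(cfg.embed_dims)  # [64] -- the isinstance guard wraps a bare int

cfg = MySegformerConfig(embed_dims=[64, 128, 320, 512])
print(cfg.embed_dims)  # [64, 128, 320, 512] -- lists pass through unchanged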
mmengineToHFCheckpoint.py ADDED
@@ -0,0 +1,79 @@
import torch
from mix_vision_transformer_config import MySegformerConfig
from modeling_my_segformer import MySegformerForSemanticSegmentation

def convert_mmengine_checkpoint_to_hf(mm_checkpoint_path, hf_save_dir):
    # 1. Load the mmengine checkpoint
    mm_ckpt = torch.load(mm_checkpoint_path, map_location="cpu")
    if 'state_dict' in mm_ckpt:
        mm_state_dict = mm_ckpt['state_dict']
    else:
        mm_state_dict = mm_ckpt

    # 2. Create config & model (make sure the config parameters match the checkpoint)
    config = MySegformerConfig(
        embed_dims=[64, 128, 320, 512],  # <--- correct list with 4 values
        num_stages=4,
        num_layers=[3, 4, 6, 3],
        num_heads=[1, 2, 4, 8],
        patch_sizes=[7, 3, 3, 3],
        strides=[4, 2, 2, 2],
        sr_ratios=[8, 4, 2, 1],
        mlp_ratio=4,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        out_indices=(0, 1, 2, 3),
        num_classes=19
    )

    model = MySegformerForSemanticSegmentation(config)

    # 3. Map mmengine keys to HF keys
    hf_state_dict = {}

    for k, v in mm_state_dict.items():
        new_k = k

        # Strip a "module." prefix if present (DataParallel)
        if new_k.startswith("module."):
            new_k = new_k[len("module."):]

        # Map decode_head.* -> segmentation_head.*
        if new_k.startswith("decode_head."):
            new_k = new_k.replace("decode_head.", "segmentation_head.")

        # Unify BatchNorm names
        new_k = new_k.replace(".bn.", ".")

        # Only keep keys that exist in the HF model
        if new_k not in model.state_dict():
            print(f"⚠️ Ignoring {new_k} (not in the HF model)")
            continue

        hf_state_dict[new_k] = v

    # 4. Load the weights into the model
    missing_keys, unexpected_keys = model.load_state_dict(hf_state_dict, strict=False)

    print("Missing keys:", missing_keys)
    print("Unexpected keys:", unexpected_keys)

    # 5. Save the HF-compatible model & config
    model.save_pretrained(hf_save_dir)
    config.save_pretrained(hf_save_dir)

    print(f"✅ Model and config successfully saved to {hf_save_dir}")

    # 5b. Also save as a classic .pth file
    pth_path = hf_save_dir.rstrip("/") + ".pth"
    torch.save(model.state_dict(), pth_path)
    print(f"✅ Additionally saved as .pth at {pth_path}")


if __name__ == "__main__":
    mm_checkpoint_path = "./segformer-b5-bsm_hq.pth"
    hf_save_dir = "hf_segformer_converted"

    convert_mmengine_checkpoint_to_hf(mm_checkpoint_path, hf_save_dir)
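A round-trip check of the conversion, as a sketch under the assumption that the script above has already written hf_segformer_converted/:

import torch
from modeling_my_segformer import MySegformerForSemanticSegmentation

# Reload the converted directory the HF way and count parameters
model = MySegformerForSemanticSegmentation.from_pretrained("hf_segformer_converted")
n_params = sum(p.numel() for p in model.parameters())
print(f"Reloaded {n_params:,} parameters")  # should roughly match the ~99 MB safetensors file below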
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:818e1b51093355a5915fd844e68edc7af583b2a397272c84e6e12a670896bc23
size 98934820
modeling_my_segformer.py CHANGED
@@ -2,17 +2,17 @@ from transformers import PreTrainedModel
 import torch
 import torch.nn as nn
 from segformer_plusplus.utils import resize
-from segformer_plusplus.model.backbone.mit import MixVisionTransformer  #
-from mix_vision_transformer_config import MySegformerConfig  #
+from segformer_plusplus.model.backbone.mit import MixVisionTransformer  # backbone import
+from mix_vision_transformer_config import MySegformerConfig  # config import
 
-# Head implementation (
+# Head implementation (simplified)
 class SegformerHead(nn.Module):
     def __init__(self,
                  in_channels=[64, 128, 256, 512],  # adjust depending on the backbone output!
                  in_index=[0, 1, 2, 3],
                  channels=256,
                  dropout_ratio=0.1,
-                 out_channels=19,  # number of classes
+                 out_channels=19,  # number of classes, adjust!
                  norm_cfg=None,
                  align_corners=False,
                  interpolate_mode='bilinear'):
@@ -26,6 +26,11 @@ class SegformerHead(nn.Module):
         self.align_corners = align_corners
         self.interpolate_mode = interpolate_mode
 
+        print(f"in_channels: {self.in_channels}, type: {type(self.in_channels)}")
+        print(f"in_index: {self.in_index}, type: {type(self.in_index)}")
+        print(f"len(in_channels): {len(self.in_channels) if hasattr(self.in_channels, '__len__') else 'no len'}")
+        print(f"len(in_index): {len(self.in_index) if hasattr(self.in_index, '__len__') else 'no len'}")
+
         self.act_cfg = dict(type='ReLU')
         self.conv_seg = nn.Conv2d(channels, out_channels, kernel_size=1)
         self.dropout = nn.Dropout2d(dropout_ratio) if dropout_ratio > 0 else None
@@ -43,6 +48,7 @@ class SegformerHead(nn.Module):
                     out_channels=channels,
                     kernel_size=1,
                     stride=1,
+                    bias=False,
                     norm_cfg=norm_cfg,
                     act_cfg=self.act_cfg))
 
@@ -50,6 +56,7 @@ class SegformerHead(nn.Module):
             in_channels=channels * num_inputs,
             out_channels=channels,
             kernel_size=1,
+            bias=False,
             norm_cfg=norm_cfg)
 
     def cls_seg(self, feat):
@@ -81,9 +88,9 @@ class MySegformerForSemanticSegmentation(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
 
-        #
+        # Important: pass the whole list, not just the first element
         self.backbone = MixVisionTransformer(
-            embed_dims=config.embed_dims,
+            embed_dims=config.embed_dims,  # the WHOLE list, e.g. [64, 128, 320, 512]
             num_stages=config.num_stages,
             num_layers=config.num_layers,
             num_heads=config.num_heads,
@@ -98,17 +105,18 @@ class MySegformerForSemanticSegmentation(PreTrainedModel):
             out_indices=config.out_indices
         )
 
-        #
+        # Make sure in_channels is a list
+        in_channels = config.embed_dims
+        if isinstance(in_channels, int):
+            in_channels = [in_channels]
+
+        print(f"config.embed_dims: {config.embed_dims}, type: {type(config.embed_dims)}")
         self.segmentation_head = SegformerHead(
-            in_channels=[64, 128,
+            in_channels=config.embed_dims,      # e.g. [64, 128, 320, 512]
+            in_index=list(config.out_indices),  # e.g. [0, 1, 2, 3]
             out_channels=config.num_classes if hasattr(config, 'num_classes') else 19,
             dropout_ratio=0.1,
             align_corners=False
         )
 
         self.post_init()
-
-    def forward(self, x):
-        features = self.backbone(x)
-        segmentation_output = self.segmentation_head(features)
-        return segmentation_output
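A minimal construction sketch (a hypothetical smoke test, not part of the commit) to confirm that the head now receives the full per-stage channel list rather than a truncated one:

from mix_vision_transformer_config import MySegformerConfig
from modeling_my_segformer import MySegformerForSemanticSegmentation

config = MySegformerConfig()  # defaults now carry [64, 128, 320, 512]
model = MySegformerForSemanticSegmentation(config)
print(model.segmentation_head.in_channels)  # expected: [64, 128, 320, 512]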
preTrainedTest.py ADDED
@@ -0,0 +1,13 @@
from modeling_my_segformer import MySegformerForSemanticSegmentation
from mix_vision_transformer_config import MySegformerConfig

# Path to your HF repo (a plain string also works)
model_name_or_path = "TimM77/SegformerPlusPlus"

# Load the config (automatically from config.json in the repo)
config = MySegformerConfig.from_pretrained(model_name_or_path)

# Load the model (weights from pytorch_model.bin + config)
model = MySegformerForSemanticSegmentation.from_pretrained(model_name_or_path, config=config)

print(model, config)
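One note on the comment above: since this commit adds model.safetensors (see the LFS pointer earlier), from_pretrained will load the safetensors weights directly; pytorch_model.bin is not required. A hedged continuation of the script above:

model.eval()  # inference mode for a quick inspection
print(next(model.parameters()).dtype)  # torch.float32, matching "torch_dtype" in config.json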
segformer_plusplus/model/backbone/mit.py CHANGED
@@ -415,7 +415,7 @@ class MixVisionTransformer(BaseModule):
         cur = 0
         self.layers = ModuleList()
         for i, num_layer in enumerate(num_layers):
-            embed_dims_i = embed_dims
+            embed_dims_i = embed_dims[i]
             patch_embed = PatchEmbed(
                 in_channels=in_channels,
                 embed_dims=embed_dims_i,
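This one-line fix is what makes the list-valued embed_dims work end to end: each stage now gets its own width instead of the whole list. A small illustration with the four-stage values used throughout this commit:

embed_dims = [64, 128, 320, 512]
num_layers = [3, 4, 6, 3]

for i, num_layer in enumerate(num_layers):
    embed_dims_i = embed_dims[i]  # per-stage width, as in the patched loop
    print(f"stage {i}: {num_layer} blocks at width {embed_dims_i}")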