hao9610 committed
Commit 6289af9 · verified · 1 Parent(s): a21008f

Upload folder using huggingface_hub

Files changed (34)
  1. .gitattributes +2 -0
  2. README.md +1 -1
  3. s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin +3 -0
  4. s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin +3 -0
  5. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/pytorch_model.bin +3 -0
  6. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/added_tokens.json +13 -0
  7. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/config.json +36 -0
  8. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/generation_config.json +11 -0
  9. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00001-of-00004.bin +3 -0
  10. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00002-of-00004.bin +3 -0
  11. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00003-of-00004.bin +3 -0
  12. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00004-of-00004.bin +3 -0
  13. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model.bin.index.json +202 -0
  14. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/special_tokens_map.json +30 -0
  15. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.json +0 -0
  16. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.model +3 -0
  17. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer_config.json +132 -0
  18. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/config.json +33 -0
  19. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/preprocessor_config.json +44 -0
  20. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/pytorch_model.bin +3 -0
  21. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/config.json +18 -0
  22. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/configuration_projector.py +25 -0
  23. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/modeling_projector.py +48 -0
  24. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/pytorch_model.bin +3 -0
  25. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/config.json +19 -0
  26. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/preprocessor_config.json +24 -0
  27. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/pytorch_model.bin +3 -0
  28. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/config.json +18 -0
  29. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/configuration_projector.py +25 -0
  30. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/modeling_projector.py +48 -0
  31. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/pytorch_model.bin +3 -0
  32. s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/xtuner_config.py +703 -0
  33. vgdseg_annotations/coco_vgdseg_train.json +3 -0
  34. vgdseg_annotations/coco_vgdseg_val.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vgdseg_annotations/coco_vgdseg_train.json filter=lfs diff=lfs merge=lfs -text
37
+ vgdseg_annotations/coco_vgdseg_val.json filter=lfs diff=lfs merge=lfs -text
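Note: with these two rules, the VGD-Seg annotation JSONs are tracked by Git LFS, so a plain clone only contains pointer stubs. A minimal sketch of fetching just these files through `huggingface_hub` (the repo id below is a placeholder, not stated in the commit):

```python
from huggingface_hub import hf_hub_download

REPO_ID = "hao9610/X-SAM"  # placeholder; substitute the actual repo this commit belongs to

train_ann = hf_hub_download(REPO_ID, "vgdseg_annotations/coco_vgdseg_train.json")
val_ann = hf_hub_download(REPO_ID, "vgdseg_annotations/coco_vgdseg_val.json")
print(train_ann, val_ann)  # local cache paths with the LFS content resolved
```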
README.md CHANGED
@@ -16,7 +16,7 @@ tags:
16
 
17
  <sup>1</sup> Sun Yat-sen University, <sup>2</sup> Peng Cheng Laboratory, <sup>3</sup> Meituan Inc.
18
 
19
- <sup>📧</sup> corresponding author.
19
+ <sup>📧</sup> Corresponding author
20
  </div>
21
 
22
  <div align="center" style="display: flex; justify-content: center; align-items: center;">
s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8f0ea6912951c1a31e409e331be35471953190f31f38efd519c10f02e9b11da
3
+ size 679089406
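Note: this and the other `pytorch_model.bin` entries in this commit are Git LFS pointers (version / oid / size), not the weights themselves; the binaries are materialized on download. A hedged sketch of inspecting the stage-1 checkpoint, assuming a local copy with LFS files pulled and that the file is a plain PyTorch state dict:

```python
import torch

ckpt_path = "s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin"
state_dict = torch.load(ckpt_path, map_location="cpu")  # ~679 MB per the LFS pointer
print(len(state_dict), "tensors")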
s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:202d131ebb09b6d068c56c668bcb95780059021bbf5b506a13c3785261c66389
3
+ size 70019002
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1036e611890bd8af8e73e368b97bddd3bcbfc8abdcbb257f05226ac4e8ff1ec
3
+ size 9248986190
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/added_tokens.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "<|assistant|>": 32001,
3
+ "<|endoftext|>": 32000,
4
+ "<|end|>": 32007,
5
+ "<|placeholder1|>": 32002,
6
+ "<|placeholder2|>": 32003,
7
+ "<|placeholder3|>": 32004,
8
+ "<|placeholder4|>": 32005,
9
+ "<|placeholder5|>": 32008,
10
+ "<|placeholder6|>": 32009,
11
+ "<|system|>": 32006,
12
+ "<|user|>": 32010
13
+ }
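Note: these are the standard Phi-3 chat-role tokens appended to the base 32k vocabulary. A quick sanity check, assuming the `xtuner_model/llm` folder from this commit is available locally:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xtuner_model/llm")  # local path from this commit
print(tok.convert_tokens_to_ids("<|assistant|>"))  # 32001, matching added_tokens.json
print(tok.convert_tokens_to_ids("<|user|>"))       # 32010
```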
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/config.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "_name_or_path": "Phi-3-mini-4k-instruct",
3
+ "architectures": [
4
+ "Phi3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_phi3.Phi3Config",
10
+ "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
+ },
12
+ "bos_token_id": 1,
13
+ "embd_pdrop": 0.0,
14
+ "eos_token_id": 32000,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 3072,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 8192,
19
+ "max_position_embeddings": 4096,
20
+ "model_type": "phi3",
21
+ "num_attention_heads": 32,
22
+ "num_hidden_layers": 32,
23
+ "num_key_value_heads": 32,
24
+ "original_max_position_embeddings": 4096,
25
+ "pad_token_id": 32000,
26
+ "resid_pdrop": 0.0,
27
+ "rms_norm_eps": 1e-05,
28
+ "rope_scaling": null,
29
+ "rope_theta": 10000.0,
30
+ "sliding_window": 2047,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "float16",
33
+ "transformers_version": "4.48.0",
34
+ "use_cache": true,
35
+ "vocab_size": 32014
36
+ }
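Note: the saved LLM keeps the stock Phi-3-mini-4k layout (32 layers, hidden size 3072) with `vocab_size` padded to 32014. A minimal config check, assuming a local copy of the folder (recent transformers versions ship native `phi3` support, so `trust_remote_code` should not be required, but that may vary by version):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("xtuner_model/llm")
print(cfg.model_type, cfg.hidden_size, cfg.num_hidden_layers, cfg.vocab_size)
# expected: phi3 3072 32 32014
```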
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/generation_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 32000,
6
+ 32001,
7
+ 32007
8
+ ],
9
+ "pad_token_id": 32000,
10
+ "transformers_version": "4.48.0"
11
+ }
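Note: generation stops on any of the three EOS ids declared above: `<|endoftext|>` (32000), `<|assistant|>` (32001), or `<|end|>` (32007). A small sketch that just inspects the saved config, assuming the local folder:

```python
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("xtuner_model/llm")
print(gen_cfg.eos_token_id)  # [32000, 32001, 32007]
print(gen_cfg.pad_token_id)  # 32000
```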
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00001-of-00004.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1398fdd95dd16597366a564b7fdef4f928faa12fe2ac4b80b7f4511502abbc0
3
+ size 1958403558
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00002-of-00004.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ff085f4f962fe5ca14769b7d8a8235cfc2a58c80a2532da2ce31f21838141ac
3
+ size 1937897316
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00003-of-00004.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b499b2a723e8ed2bdfb0213f75d5c2ece2ddb2830b26f8856b4dfe74968138
3
+ size 1981937508
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model-00004-of-00004.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac56ea2ec260c0f149361d5ffd1eced953727eee36b94094b47b376e606f249e
3
+ size 1763373552
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/pytorch_model.bin.index.json ADDED
@@ -0,0 +1,202 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 7641544704
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "pytorch_model-00004-of-00004.bin",
7
+ "model.embed_tokens.weight": "pytorch_model-00001-of-00004.bin",
8
+ "model.layers.0.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
9
+ "model.layers.0.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
10
+ "model.layers.0.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
11
+ "model.layers.0.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
12
+ "model.layers.0.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
13
+ "model.layers.0.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
14
+ "model.layers.1.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
15
+ "model.layers.1.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
16
+ "model.layers.1.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
17
+ "model.layers.1.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
18
+ "model.layers.1.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
19
+ "model.layers.1.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
20
+ "model.layers.10.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
21
+ "model.layers.10.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
22
+ "model.layers.10.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
23
+ "model.layers.10.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
24
+ "model.layers.10.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
25
+ "model.layers.10.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
26
+ "model.layers.11.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
27
+ "model.layers.11.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
28
+ "model.layers.11.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
29
+ "model.layers.11.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
30
+ "model.layers.11.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
31
+ "model.layers.11.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
32
+ "model.layers.12.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
33
+ "model.layers.12.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
34
+ "model.layers.12.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
35
+ "model.layers.12.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
36
+ "model.layers.12.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
37
+ "model.layers.12.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
38
+ "model.layers.13.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
39
+ "model.layers.13.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
40
+ "model.layers.13.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
41
+ "model.layers.13.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
42
+ "model.layers.13.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
43
+ "model.layers.13.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
44
+ "model.layers.14.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
45
+ "model.layers.14.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
46
+ "model.layers.14.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
47
+ "model.layers.14.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
48
+ "model.layers.14.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
49
+ "model.layers.14.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
50
+ "model.layers.15.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
51
+ "model.layers.15.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
52
+ "model.layers.15.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
53
+ "model.layers.15.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
54
+ "model.layers.15.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
55
+ "model.layers.15.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
56
+ "model.layers.16.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
57
+ "model.layers.16.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
58
+ "model.layers.16.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
59
+ "model.layers.16.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
60
+ "model.layers.16.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
61
+ "model.layers.16.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
62
+ "model.layers.17.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
63
+ "model.layers.17.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
64
+ "model.layers.17.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
65
+ "model.layers.17.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
66
+ "model.layers.17.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
67
+ "model.layers.17.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
68
+ "model.layers.18.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
69
+ "model.layers.18.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
70
+ "model.layers.18.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
71
+ "model.layers.18.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
72
+ "model.layers.18.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
73
+ "model.layers.18.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
74
+ "model.layers.19.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
75
+ "model.layers.19.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
76
+ "model.layers.19.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
77
+ "model.layers.19.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
78
+ "model.layers.19.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
79
+ "model.layers.19.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
80
+ "model.layers.2.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
81
+ "model.layers.2.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
82
+ "model.layers.2.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
83
+ "model.layers.2.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
84
+ "model.layers.2.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
85
+ "model.layers.2.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
86
+ "model.layers.20.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
87
+ "model.layers.20.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
88
+ "model.layers.20.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
89
+ "model.layers.20.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
90
+ "model.layers.20.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
91
+ "model.layers.20.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
92
+ "model.layers.21.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
93
+ "model.layers.21.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
94
+ "model.layers.21.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
95
+ "model.layers.21.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
96
+ "model.layers.21.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
97
+ "model.layers.21.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
98
+ "model.layers.22.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
99
+ "model.layers.22.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
100
+ "model.layers.22.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
101
+ "model.layers.22.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
102
+ "model.layers.22.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
103
+ "model.layers.22.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
104
+ "model.layers.23.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
105
+ "model.layers.23.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
106
+ "model.layers.23.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
107
+ "model.layers.23.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
108
+ "model.layers.23.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
109
+ "model.layers.23.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
110
+ "model.layers.24.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
111
+ "model.layers.24.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
112
+ "model.layers.24.mlp.gate_up_proj.weight": "pytorch_model-00003-of-00004.bin",
113
+ "model.layers.24.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
114
+ "model.layers.24.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
115
+ "model.layers.24.self_attn.qkv_proj.weight": "pytorch_model-00003-of-00004.bin",
116
+ "model.layers.25.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
117
+ "model.layers.25.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
118
+ "model.layers.25.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
119
+ "model.layers.25.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
120
+ "model.layers.25.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
121
+ "model.layers.25.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
122
+ "model.layers.26.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
123
+ "model.layers.26.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
124
+ "model.layers.26.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
125
+ "model.layers.26.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
126
+ "model.layers.26.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
127
+ "model.layers.26.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
128
+ "model.layers.27.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
129
+ "model.layers.27.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
130
+ "model.layers.27.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
131
+ "model.layers.27.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
132
+ "model.layers.27.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
133
+ "model.layers.27.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
134
+ "model.layers.28.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
135
+ "model.layers.28.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
136
+ "model.layers.28.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
137
+ "model.layers.28.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
138
+ "model.layers.28.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
139
+ "model.layers.28.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
140
+ "model.layers.29.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
141
+ "model.layers.29.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
142
+ "model.layers.29.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
143
+ "model.layers.29.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
144
+ "model.layers.29.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
145
+ "model.layers.29.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
146
+ "model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
147
+ "model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
148
+ "model.layers.3.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
149
+ "model.layers.3.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
150
+ "model.layers.3.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
151
+ "model.layers.3.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
152
+ "model.layers.30.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
153
+ "model.layers.30.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
154
+ "model.layers.30.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
155
+ "model.layers.30.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
156
+ "model.layers.30.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
157
+ "model.layers.30.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
158
+ "model.layers.31.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
159
+ "model.layers.31.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
160
+ "model.layers.31.mlp.gate_up_proj.weight": "pytorch_model-00004-of-00004.bin",
161
+ "model.layers.31.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
162
+ "model.layers.31.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
163
+ "model.layers.31.self_attn.qkv_proj.weight": "pytorch_model-00004-of-00004.bin",
164
+ "model.layers.4.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
165
+ "model.layers.4.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
166
+ "model.layers.4.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
167
+ "model.layers.4.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
168
+ "model.layers.4.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
169
+ "model.layers.4.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
170
+ "model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
171
+ "model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
172
+ "model.layers.5.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
173
+ "model.layers.5.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
174
+ "model.layers.5.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
175
+ "model.layers.5.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
176
+ "model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
177
+ "model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
178
+ "model.layers.6.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
179
+ "model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
180
+ "model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
181
+ "model.layers.6.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
182
+ "model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
183
+ "model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
184
+ "model.layers.7.mlp.gate_up_proj.weight": "pytorch_model-00001-of-00004.bin",
185
+ "model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
186
+ "model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
187
+ "model.layers.7.self_attn.qkv_proj.weight": "pytorch_model-00001-of-00004.bin",
188
+ "model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
189
+ "model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
190
+ "model.layers.8.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
191
+ "model.layers.8.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
192
+ "model.layers.8.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
193
+ "model.layers.8.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
194
+ "model.layers.9.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
195
+ "model.layers.9.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
196
+ "model.layers.9.mlp.gate_up_proj.weight": "pytorch_model-00002-of-00004.bin",
197
+ "model.layers.9.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
198
+ "model.layers.9.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
199
+ "model.layers.9.self_attn.qkv_proj.weight": "pytorch_model-00002-of-00004.bin",
200
+ "model.norm.weight": "pytorch_model-00004-of-00004.bin"
201
+ }
202
+ }
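Note: the index maps every parameter name to one of the four shards; `from_pretrained` resolves it automatically, but the mapping can also be read directly. A hedged sketch (local paths assumed):

```python
import json

with open("xtuner_model/llm/pytorch_model.bin.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])                   # 7641544704 bytes
print(index["weight_map"]["lm_head.weight"])             # pytorch_model-00004-of-00004.bin
print(index["weight_map"]["model.embed_tokens.weight"])  # pytorch_model-00001-of-00004.bin
```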
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/llm/tokenizer_config.json ADDED
@@ -0,0 +1,132 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": true,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "32000": {
31
+ "content": "<|endoftext|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "32001": {
39
+ "content": "<|assistant|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": true,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "32002": {
47
+ "content": "<|placeholder1|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": true,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "32003": {
55
+ "content": "<|placeholder2|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": true,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "32004": {
63
+ "content": "<|placeholder3|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": true,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "32005": {
71
+ "content": "<|placeholder4|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": true,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "32006": {
79
+ "content": "<|system|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": true,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "32007": {
87
+ "content": "<|end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": true,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "32008": {
95
+ "content": "<|placeholder5|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": true,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "32009": {
103
+ "content": "<|placeholder6|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": true,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "32010": {
111
+ "content": "<|user|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": true,
115
+ "single_word": false,
116
+ "special": true
117
+ }
118
+ },
119
+ "bos_token": "<s>",
120
+ "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
+ "clean_up_tokenization_spaces": false,
122
+ "eos_token": "<|endoftext|>",
123
+ "extra_special_tokens": {},
124
+ "legacy": false,
125
+ "model_max_length": 4096,
126
+ "pad_token": "<|endoftext|>",
127
+ "padding_side": "right",
128
+ "sp_model_kwargs": {},
129
+ "tokenizer_class": "LlamaTokenizer",
130
+ "unk_token": "<unk>",
131
+ "use_default_system_prompt": false
132
+ }
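Note: the `chat_template` above encodes the Phi-3 format (`<|user|>` / `<|assistant|>` turns terminated by `<|end|>`). A short sketch of rendering a prompt with it, assuming the local `xtuner_model/llm` folder:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xtuner_model/llm")
messages = [{"role": "user", "content": "Describe this image."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|user|>
# Describe this image.<|end|>
# <|assistant|>
```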
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/config.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "_name_or_path": "sam-vit-large",
3
+ "architectures": [
4
+ "XSegmentor"
5
+ ],
6
+ "initializer_range": 0.02,
7
+ "mask_decoder_config": {
8
+ "model_type": ""
9
+ },
10
+ "model_type": "sam",
11
+ "prompt_encoder_config": {
12
+ "model_type": ""
13
+ },
14
+ "torch_dtype": "bfloat16",
15
+ "transformers_version": "4.48.0",
16
+ "vision_config": {
17
+ "dropout": 0.0,
18
+ "global_attn_indexes": [
19
+ 5,
20
+ 11,
21
+ 17,
22
+ 23
23
+ ],
24
+ "hidden_size": 1024,
25
+ "initializer_factor": 1.0,
26
+ "intermediate_size": 6144,
27
+ "mlp_dim": 4096,
28
+ "model_type": "",
29
+ "num_attention_heads": 16,
30
+ "num_hidden_layers": 24,
31
+ "projection_dim": 512
32
+ }
33
+ }
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "crop_size": {
3
+ "height": 1024,
4
+ "width": 1024
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_crop": false,
8
+ "do_flip": false,
9
+ "do_normalize": true,
10
+ "do_pad": true,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "flip_direction": "horizontal",
14
+ "flip_ratio": 0.5,
15
+ "ignore_index": 0,
16
+ "image_mean": [
17
+ 0.485,
18
+ 0.456,
19
+ 0.406
20
+ ],
21
+ "image_processor_type": "SamImageProcessor",
22
+ "image_std": [
23
+ 0.229,
24
+ 0.224,
25
+ 0.225
26
+ ],
27
+ "mask_pad_size": {
28
+ "height": 1024,
29
+ "width": 1024
30
+ },
31
+ "mask_size": {
32
+ "longest_edge": 1024
33
+ },
34
+ "pad_size": {
35
+ "height": 1024,
36
+ "width": 1024
37
+ },
38
+ "processor_class": "SamProcessor",
39
+ "resample": 2,
40
+ "rescale_factor": 0.00392156862745098,
41
+ "size": {
42
+ "longest_edge": 1024
43
+ }
44
+ }
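Note: this processor resizes the longest edge to 1024, pads to 1024×1024, and normalizes with ImageNet mean/std. The training config imports a customized `SamImageProcessor` from `xsam.dataset.processors`; the sketch below uses the stock `transformers` class of the same name only to illustrate the output geometry, and is an approximation:

```python
from PIL import Image
from transformers import SamImageProcessor

processor = SamImageProcessor.from_pretrained("xtuner_model/segmentor_encoder")
image = Image.new("RGB", (640, 480))  # dummy image
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)   # torch.Size([1, 3, 1024, 1024]) after resize + pad
```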
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73ee1f35874aba42b79cf385e9e7f8bbbf619e3bb8f3ad27955c41cbf3e8dcb3
3
+ size 616667758
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "architectures": [
3
+ "DynamicProjectorModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_projector.DynamicProjectorConfig",
7
+ "AutoModel": "modeling_projector.DynamicProjectorModel"
8
+ },
9
+ "bias": true,
10
+ "depth": 2,
11
+ "downsample_ratio": 0.5,
12
+ "hidden_act": "gelu",
13
+ "llm_hidden_size": 3072,
14
+ "model_type": "dynamic_projector",
15
+ "torch_dtype": "bfloat16",
16
+ "transformers_version": "4.48.0",
17
+ "visual_hidden_size": 1024
18
+ }
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/configuration_projector.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from transformers import PretrainedConfig
3
+
4
+
5
+ class DynamicProjectorConfig(PretrainedConfig):
6
+ model_type = "dynamic_projector"
7
+ _auto_class = "AutoConfig"
8
+
9
+ def __init__(
10
+ self,
11
+ visual_hidden_size=4096,
12
+ llm_hidden_size=4096,
13
+ downsample_ratio=1.0,
14
+ depth=2,
15
+ hidden_act="gelu",
16
+ bias=True,
17
+ **kwargs,
18
+ ):
19
+ self.visual_hidden_size = visual_hidden_size
20
+ self.llm_hidden_size = llm_hidden_size
21
+ self.downsample_ratio = downsample_ratio
22
+ self.depth = depth
23
+ self.hidden_act = hidden_act
24
+ self.bias = bias
25
+ super().__init__(**kwargs)
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/modeling_projector.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel
4
+ from transformers.activations import ACT2FN
5
+
6
+ from xsam.model.utils import pixel_shuffle
7
+
8
+ from .configuration_projector import DynamicProjectorConfig
9
+
10
+
11
+ class DynamicProjectorModel(PreTrainedModel):
12
+ _auto_class = "AutoModel"
13
+ config_class = DynamicProjectorConfig
14
+ base_model_prefix = "model"
15
+ supports_gradient_checkpointing = True
16
+ _no_split_modules = ["model"]
17
+
18
+ def __init__(self, config: DynamicProjectorConfig) -> None:
19
+ super().__init__(config)
20
+ self.gradient_checkpointing = False
21
+
22
+ visual_hidden_size = config.visual_hidden_size * int(1 / config.downsample_ratio) ** 2
23
+ modules = [
24
+ nn.Linear(visual_hidden_size, config.llm_hidden_size, bias=config.bias),
25
+ ]
26
+ for _ in range(1, config.depth):
27
+ modules.append(ACT2FN[config.hidden_act])
28
+ modules.append(nn.Linear(config.llm_hidden_size, config.llm_hidden_size, bias=config.bias))
29
+ self.model = nn.Sequential(*modules)
30
+
31
+ def enable_input_require_grads(self):
32
+ def make_inputs_require_grad(module, input, output):
33
+ output.requires_grad_(True)
34
+
35
+ self.model.register_forward_hook(make_inputs_require_grad)
36
+
37
+ def forward(self, x):
38
+ if x.ndim == 4:
39
+ if self.config.downsample_ratio != 1:
40
+ x = pixel_shuffle(x, self.config.downsample_ratio)
41
+ x = x.view(x.shape[0], -1, x.shape[-1])
42
+
43
+ if self.gradient_checkpointing and self.training:
44
+ layer_outputs = self._gradient_checkpointing_func(self.model, x)
45
+ else:
46
+ layer_outputs = self.model(x)
47
+
48
+ return layer_outputs
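Note: with `downsample_ratio=0.5` (segmentor_projector/config.json), `pixel_shuffle` folds each 2×2 patch group into channels, so the first linear maps 1024 × 4 = 4096 → 3072. A self-contained sketch that mirrors those layer shapes without the `xsam` dependency (dummy token count, not the model's actual sequence length):

```python
import torch
import torch.nn as nn

visual_hidden_size = 1024   # SAM ViT-L feature dim (config.json)
downsample_ratio = 0.5      # pixel_shuffle packs 2x2 patches into channels
llm_hidden_size = 3072      # Phi-3-mini hidden size

in_dim = visual_hidden_size * int(1 / downsample_ratio) ** 2  # 1024 * 4 = 4096
projector = nn.Sequential(
    nn.Linear(in_dim, llm_hidden_size),
    nn.GELU(),
    nn.Linear(llm_hidden_size, llm_hidden_size),
)

x = torch.randn(2, 576, in_dim)  # (batch, tokens, channels), already pixel-shuffled
print(projector(x).shape)        # torch.Size([2, 576, 3072])
```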
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/segmentor_projector/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51a2a66f2a0cd1b54c160916a628821da09f366256b7d5c9f73a05b261c9f71e
3
+ size 44054528
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "_name_or_path": "siglip2-so400m-patch14-384",
3
+ "architectures": [
4
+ "SiglipVisionModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_size": 1152,
9
+ "image_size": 384,
10
+ "intermediate_size": 4304,
11
+ "layer_norm_eps": 1e-06,
12
+ "model_type": "siglip_vision_model",
13
+ "num_attention_heads": 16,
14
+ "num_channels": 3,
15
+ "num_hidden_layers": 27,
16
+ "patch_size": 14,
17
+ "torch_dtype": "bfloat16",
18
+ "transformers_version": "4.48.0"
19
+ }
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "SiglipImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "processor_class": "SiglipProcessor",
18
+ "resample": 2,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 384,
22
+ "width": 384
23
+ }
24
+ }
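Note: the SigLIP2 tower takes 384×384 inputs normalized with mean/std 0.5; at patch size 14 this should give a 27×27 = 729-token feature map of width 1152 (the token count is inferred from the config, not stated in the commit). A hedged sketch:

```python
import torch
from PIL import Image
from transformers import SiglipImageProcessor, SiglipVisionModel

enc_dir = "xtuner_model/visual_encoder"  # local path from this commit
processor = SiglipImageProcessor.from_pretrained(enc_dir)
model = SiglipVisionModel.from_pretrained(enc_dir, torch_dtype=torch.float32)

image = Image.new("RGB", (800, 600))  # dummy image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)
print(out.last_hidden_state.shape)  # expected torch.Size([1, 729, 1152])
```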
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d20f0e3b88fb7a553165f32ec37684da2d51f36e87ded7420d7ea3375b015e3
3
+ size 856600842
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "architectures": [
3
+ "DynamicProjectorModel"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_projector.DynamicProjectorConfig",
7
+ "AutoModel": "modeling_projector.DynamicProjectorModel"
8
+ },
9
+ "bias": true,
10
+ "depth": 2,
11
+ "downsample_ratio": 1.0,
12
+ "hidden_act": "gelu",
13
+ "llm_hidden_size": 3072,
14
+ "model_type": "dynamic_projector",
15
+ "torch_dtype": "bfloat16",
16
+ "transformers_version": "4.48.0",
17
+ "visual_hidden_size": 1152
18
+ }
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/configuration_projector.py ADDED
@@ -0,0 +1,25 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from transformers import PretrainedConfig
3
+
4
+
5
+ class DynamicProjectorConfig(PretrainedConfig):
6
+ model_type = "dynamic_projector"
7
+ _auto_class = "AutoConfig"
8
+
9
+ def __init__(
10
+ self,
11
+ visual_hidden_size=4096,
12
+ llm_hidden_size=4096,
13
+ downsample_ratio=1.0,
14
+ depth=2,
15
+ hidden_act="gelu",
16
+ bias=True,
17
+ **kwargs,
18
+ ):
19
+ self.visual_hidden_size = visual_hidden_size
20
+ self.llm_hidden_size = llm_hidden_size
21
+ self.downsample_ratio = downsample_ratio
22
+ self.depth = depth
23
+ self.hidden_act = hidden_act
24
+ self.bias = bias
25
+ super().__init__(**kwargs)
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/modeling_projector.py ADDED
@@ -0,0 +1,48 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import torch.nn as nn
3
+ from transformers import PreTrainedModel
4
+ from transformers.activations import ACT2FN
5
+
6
+ from xsam.model.utils import pixel_shuffle
7
+
8
+ from .configuration_projector import DynamicProjectorConfig
9
+
10
+
11
+ class DynamicProjectorModel(PreTrainedModel):
12
+ _auto_class = "AutoModel"
13
+ config_class = DynamicProjectorConfig
14
+ base_model_prefix = "model"
15
+ supports_gradient_checkpointing = True
16
+ _no_split_modules = ["model"]
17
+
18
+ def __init__(self, config: DynamicProjectorConfig) -> None:
19
+ super().__init__(config)
20
+ self.gradient_checkpointing = False
21
+
22
+ visual_hidden_size = config.visual_hidden_size * int(1 / config.downsample_ratio) ** 2
23
+ modules = [
24
+ nn.Linear(visual_hidden_size, config.llm_hidden_size, bias=config.bias),
25
+ ]
26
+ for _ in range(1, config.depth):
27
+ modules.append(ACT2FN[config.hidden_act])
28
+ modules.append(nn.Linear(config.llm_hidden_size, config.llm_hidden_size, bias=config.bias))
29
+ self.model = nn.Sequential(*modules)
30
+
31
+ def enable_input_require_grads(self):
32
+ def make_inputs_require_grad(module, input, output):
33
+ output.requires_grad_(True)
34
+
35
+ self.model.register_forward_hook(make_inputs_require_grad)
36
+
37
+ def forward(self, x):
38
+ if x.ndim == 4:
39
+ if self.config.downsample_ratio != 1:
40
+ x = pixel_shuffle(x, self.config.downsample_ratio)
41
+ x = x.view(x.shape[0], -1, x.shape[-1])
42
+
43
+ if self.gradient_checkpointing and self.training:
44
+ layer_outputs = self._gradient_checkpointing_func(self.model, x)
45
+ else:
46
+ layer_outputs = self.model(x)
47
+
48
+ return layer_outputs
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/visual_projector/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:509d21776557ab6566a4b2163e29df9de3299e5c7af9b2e906f6fdf447d91795
3
+ size 25966592
s3_mixed_finetune/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_m2f_gpu16_mixed_finetune/xtuner_model/xtuner_config.py ADDED
@@ -0,0 +1,703 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from copy import deepcopy
3
+ from os import getenv
4
+
5
+ import torch
6
+ from mmengine.hooks import CheckpointHook, DistSamplerSeedHook, IterTimerHook, LoggerHook, ParamSchedulerHook
7
+ from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
8
+ from torch.optim import AdamW
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, SiglipImageProcessor, SiglipVisionModel
10
+ from xsam.dataset import GenericSegDataset, VGDSegDataset
11
+ from xsam.dataset.map_fns import dataset_map_fn_factory, generic_seg_map_fn, template_map_fn_factory, vgd_seg_map_fn
12
+ from xsam.dataset.process_fns import (
13
+ gcg_seg_postprocess_fn,
14
+ generic_seg_postprocess_fn,
15
+ inter_seg_postprocess_fn,
16
+ process_map_fn_factory,
17
+ reason_seg_postprocess_fn,
18
+ refer_seg_postprocess_fn,
19
+ vgd_seg_postprocess_fn,
20
+ )
21
+ from xsam.dataset.processors import SamImageProcessor
22
+ from xsam.engine.hooks import DatasetInfoHook, EvaluateChatHook, ModelInfoHook, PTCheckpointHook
23
+ from xsam.engine.runners import TrainLoop
24
+ from xsam.evaluation.evaluators import GenericSegEvaluator, VGDSegEvaluator
25
+ from xsam.model import XSamModel
26
+ from xsam.model.segmentors import XSegmentor
27
+ from xsam.model.segmentors.mask2former import Mask2FormerConfig, Mask2FormerModel
28
+ from xsam.model.segmentors.sam import SamModel
29
+ from xsam.utils.visualizer import Visualizer
30
+ from xtuner.utils import PROMPT_TEMPLATE
31
+
32
+ #######################################################################
33
+ # PART 1 Settings #
34
+ #######################################################################
35
+ # Directories
36
+ code_dir = getenv("CODE_DIR", "./xsam/")
37
+ data_dir = getenv("DATA_DIR", "./datas/")
38
+ init_dir = getenv("INIT_DIR", "./inits/")
39
+ work_dir = getenv("WORK_DIR", "./wkdrs/")
40
+
41
+ # Model
42
+ llm_name_or_path = init_dir + "Phi-3-mini-4k-instruct"
43
+ visual_encoder_name_or_path = init_dir + "siglip2-so400m-patch14-384"
44
+ seg_encoder_name_or_path = init_dir + "sam-vit-large"
45
+ seg_decoder_name_or_path = init_dir + "mask2former-swin-large-coco-panoptic"
46
+
47
+ # Specify the pretrained pth
48
+ s1_pretrained_pth = work_dir + "s1_seg_finetune/xsam_sam_large_m2f_e36_gpu16_seg_finetune/pytorch_model.bin"
49
+ s2_pretrained_pth = (
50
+ work_dir
51
+ + "s2_align_pretrain/xsam_phi3_mini_4k_instruct_siglip2_so400m_p14_384_sam_large_e1_gpu16_align_pretrain/pytorch_model.bin"
52
+ ) # noqa: E501
53
+
54
+ # Prompt
55
+ prompt_template = PROMPT_TEMPLATE.phi3_chat
56
+ max_length = int(4096 - (384 / 14) ** 2 - 1024)
57
+
58
+ # Scheduler & Optimizer
59
+ batch_size = 4 # per_device
60
+ accumulative_counts = 1
61
+ dataloader_num_workers = 4
62
+ max_epochs = 1
63
+ optim_type = AdamW
64
+ lr = 4e-5
65
+ betas = (0.9, 0.999)
66
+ weight_decay = 0.05
67
+ max_norm = 1 # grad clip
68
+ warmup_ratio = 0.03
69
+
70
+ # Save
71
+ save_steps = 2000
72
+ save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
73
+
74
+ # Logging
75
+ logging_interval = 10
76
+
77
+ # Evaluate the generation performance during the training
78
+ evaluation_freq = 2000
79
+ SYSTEM = ""
80
+ evaluation_images = [
81
+ code_dir + "xsam/configs/xsam/images/llava_imgconv.jpg",
82
+ code_dir + "xsam/configs/xsam/images/panoptic_genseg.jpg",
83
+ code_dir + "xsam/configs/xsam/images/refcoco_refseg.jpg",
84
+ code_dir + "xsam/configs/xsam/images/lisa_reaseg.jpg",
85
+ code_dir + "xsam/configs/xsam/images/refcocog_gcgseg.jpg",
86
+ code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
87
+ code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
88
+ code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
89
+ code_dir + "xsam/configs/xsam/images/coco_interseg.jpg",
90
+ code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
91
+ code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
92
+ code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
93
+ code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
94
+ code_dir + "xsam/configs/xsam/images/coco_vgdseg.jpg",
95
+ ]
96
+ evaluation_inputs = [
97
+ "Can you describe this image in detail? Please elaborate in your response.",
98
+ "Can you generate segmentation masks for this image based on the specified categories: <p>person</p>, <p>bicycle</p>, <p>car</p>, <p>motorcycle</p>, <p>airplane</p>, <p>bus</p>, <p>train</p>, <p>truck</p>, <p>boat</p>, <p>traffic light</p>, <p>fire hydrant</p>, <p>stop sign</p>, <p>parking meter</p>, <p>bench</p>, <p>bird</p>, <p>cat</p>, <p>dog</p>, <p>horse</p>, <p>sheep</p>, <p>cow</p>, <p>elephant</p>, <p>bear</p>, <p>zebra</p>, <p>giraffe</p>, <p>backpack</p>, <p>umbrella</p>, <p>handbag</p>, <p>tie</p>, <p>suitcase</p>, <p>frisbee</p>, <p>skis</p>, <p>snowboard</p>, <p>sports ball</p>, <p>kite</p>, <p>baseball bat</p>, <p>baseball glove</p>, <p>skateboard</p>, <p>surfboard</p>, <p>tennis racket</p>, <p>bottle</p>, <p>wine glass</p>, <p>cup</p>, <p>fork</p>, <p>knife</p>, <p>spoon</p>, <p>bowl</p>, <p>banana</p>, <p>apple</p>, <p>sandwich</p>, <p>orange</p>, <p>broccoli</p>, <p>carrot</p>, <p>hot dog</p>, <p>pizza</p>, <p>donut</p>, <p>cake</p>, <p>chair</p>, <p>couch</p>, <p>potted plant</p>, <p>bed</p>, <p>dining table</p>, <p>toilet</p>, <p>tv</p>, <p>laptop</p>, <p>mouse</p>, <p>remote</p>, <p>keyboard</p>, <p>cell phone</p>, <p>microwave</p>, <p>oven</p>, <p>toaster</p>, <p>sink</p>, <p>refrigerator</p>, <p>book</p>, <p>clock</p>, <p>vase</p>, <p>scissors</p>, <p>teddy bear</p>, <p>hair drier</p>, <p>toothbrush</p>, <p>banner</p>, <p>blanket</p>, <p>bridge</p>, <p>cardboard</p>, <p>counter</p>, <p>curtain</p>, <p>door</p>, <p>floor wood</p>, <p>flower</p>, <p>fruit</p>, <p>gravel</p>, <p>house</p>, <p>light</p>, <p>mirror</p>, <p>net</p>, <p>pillow</p>, <p>platform</p>, <p>playingfield</p>, <p>railroad</p>, <p>river</p>, <p>road</p>, <p>roof</p>, <p>sand</p>, <p>sea</p>, <p>shelf</p>, <p>snow</p>, <p>stairs</p>, <p>tent</p>, <p>towel</p>, <p>wall brick</p>, <p>wall stone</p>, <p>wall tile</p>, <p>wall wood</p>, <p>water</p>, <p>window blind</p>, <p>window</p>, <p>tree</p>, <p>fence</p>, <p>ceiling</p>, <p>sky</p>, <p>cabinet</p>, <p>table</p>, <p>floor</p>, <p>pavement</p>, <p>mountain</p>, <p>grass</p>, <p>dirt</p>, <p>paper</p>, <p>food</p>, <p>building</p>, <p>rock</p>, <p>wall</p>, <p>rug</p>? Please output the segmentation mask.",
99
+ "Can you segment <p>the women with red coat</p> in this image? Please output the corresponding segmentation mask.",
100
+ "<p>when enjoying an ice cream sundae, what can we use to scoop up the whipped cream and place it on top of the ice cream?</p> Please output the corresponding segmentation mask.",
101
+ "Can you provide a brief description of the this image? Respond with interleaved segmentation masks for the corresponding phrases.",
102
+ "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
103
+ "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
104
+ "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
105
+ "Can you segment the <p><region></p> in this image? Please output the corresponding segmentation mask.",
106
+ "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
107
+ "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
108
+ "Can you segment the image based on the following regions: <p><region></p>? Please output the segmentation mask.",
109
+ "Can you segment the image based on the following regions: <p><region></p>, <p><region></p>? Please output the segmentation mask.",
110
+ "Can you segment the image based on the following regions: <p><region></p>, <p><region></p>? Please output the segmentation mask.",
111
+ ]
112
+ vprompt_masks = [
113
+ (None,),
114
+ (None,),
115
+ (None,),
116
+ (None,),
117
+ (None,),
118
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_point0.png",),
119
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_scribble1.png",),
120
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_box0.png",),
121
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_interseg_mask1.png",),
122
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point0.png",),
123
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_scribble1.png",),
124
+ (code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_box0.png",),
125
+ (
126
+ code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point0.png",
127
+ code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_scribble1.png",
128
+ ),
129
+ (
130
+ code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_box0.png",
131
+ code_dir + "xsam/configs/xsam/images/vprompt_masks/coco_vgdseg_point1.png",
132
+ ),
133
+ ]
134
+
135
+ #######################################################################
136
+ # PART 2 Model & Tokenizer & Image Processor #
137
+ #######################################################################
138
+ # TODO: add special tokens via import from xsam.utils
139
+ special_tokens = ["<SEG>", "<p>", "</p>"]
140
+ cond_type = "phrase" # "phrase" "cls" "all"
141
+ ignore_label = 255
142
+ tokenizer = dict(
143
+ type=AutoTokenizer.from_pretrained,
144
+ pretrained_model_name_or_path=llm_name_or_path,
145
+ trust_remote_code=True,
146
+ padding_side="right",
147
+ )
148
+
149
+ image_processor = dict(
150
+ type=SiglipImageProcessor.from_pretrained,
151
+ pretrained_model_name_or_path=visual_encoder_name_or_path,
152
+ trust_remote_code=True,
153
+ )
154
+
155
+ extra_image_processor = dict(
156
+ type=SamImageProcessor.from_pretrained,
157
+ pretrained_model_name_or_path=seg_encoder_name_or_path,
158
+ trust_remote_code=True,
159
+ ignore_index=0,
160
+ )
161
+
162
+ model = dict(
163
+ type=XSamModel,
164
+ freeze_llm=False,
165
+ freeze_visual_encoder=False,
166
+ freeze_segmentor_encoder=False,
167
+ use_dual_encoder=True,
168
+ use_vision_sampler=True,
169
+ connector_type="conv",
170
+ cond_type=cond_type,
171
+ seg_select_layers=[6, 12, 18, 24],
172
+ connector_hidden_dim=512,
173
+ connector_scale_factor=[4, 2, 1, 0.5],
174
+ sampler_input_feat="seg_pixel_values",
175
+ special_tokens=special_tokens,
176
+ s1_pretrained_pth=s1_pretrained_pth,
177
+ s2_pretrained_pth=s2_pretrained_pth,
178
+ tokenizer=tokenizer,
179
+ postprocess_fn=generic_seg_postprocess_fn,
180
+ llm=dict(
181
+ type=AutoModelForCausalLM.from_pretrained,
182
+ pretrained_model_name_or_path=llm_name_or_path,
183
+ trust_remote_code=False,
184
+ torch_dtype=torch.bfloat16,
185
+ attn_implementation="flash_attention_2",
186
+ ),
187
+ visual_encoder=dict(
188
+ type=SiglipVisionModel.from_pretrained,
189
+ pretrained_model_name_or_path=visual_encoder_name_or_path,
190
+ torch_dtype=torch.bfloat16,
191
+ ),
192
+ segmentor=dict(
193
+ type=XSegmentor,
194
+ encoder=dict(
195
+ type=SamModel.from_pretrained,
196
+ pretrained_model_name_or_path=seg_encoder_name_or_path,
197
+ trust_remote_code=True,
198
+ torch_dtype=torch.bfloat16,
199
+ ),
200
+ decoder=dict(
201
+ type=Mask2FormerModel._from_config,
202
+ config=dict(
203
+ type=Mask2FormerConfig.from_pretrained,
204
+ pretrained_model_name_or_path=seg_decoder_name_or_path,
205
+ use_backbone=False,
206
+ feature_channels=[512, 1024, 2048],
207
+ num_feature_levels=3,
208
+ trust_remote_code=True,
209
+ ),
210
+ torch_dtype=torch.bfloat16,
211
+ ),
212
+ torch_dtype=torch.bfloat16,
213
+ reinit_decoder=True,
214
+ open_cls=True,
215
+ ),
216
+ )
217
+
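special_tokens adds a segmentation trigger token and phrase delimiters to the vocabulary. A hedged sketch of the usual Hugging Face pattern for registering such tokens and resizing the LLM embedding table (XSamModel presumably handles this internally; this function is only illustrative):

from transformers import AutoModelForCausalLM, AutoTokenizer

def register_special_tokens(llm_path, special_tokens):
    # Illustrative only; the exact handling inside XSamModel may differ.
    tok = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
    llm = AutoModelForCausalLM.from_pretrained(llm_path)
    num_added = tok.add_tokens(special_tokens, special_tokens=True)
    if num_added > 0:
        llm.resize_token_embeddings(len(tok))  # new embedding rows are randomly initialized
    return tok, llm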
218
+ #######################################################################
219
+ # PART 3 Dataset & Dataloader #
220
+ #######################################################################
221
+ imgconv_data_root = data_dir + "llava_data/"
222
+ genseg_data_root = data_dir + "generic_seg_data/"
223
+ ovseg_data_root = data_dir + "ov_seg_data/"
224
+ refseg_data_root = data_dir + "refer_seg_data/"
225
+ reaseg_data_root = data_dir + "reason_seg_data/"
226
+ gcgseg_data_root = data_dir + "gcg_seg_data/"
227
+ vgdseg_data_root = data_dir + "vgd_seg_data/"
228
+ interseg_data_root = data_dir + "inter_seg_data/"
229
+
230
+ pannoptic_genseg_dataset = dict(
231
+ type=GenericSegDataset,
232
+ data_path=genseg_data_root + "coco/annotations/panoptic_train2017.json",
233
+ image_folder=genseg_data_root + "coco/train2017",
234
+ panseg_map_folder=genseg_data_root + "coco/panoptic_train2017",
235
+ tokenizer=tokenizer,
236
+ task_name="genseg",
237
+ data_name="panoptic_genseg",
238
+ cond_type=cond_type,
239
+ special_tokens=special_tokens,
240
+ extra_image_processor=extra_image_processor,
241
+ image_processor=image_processor,
242
+ dataset_map_fn=dict(
243
+ type=dataset_map_fn_factory,
244
+ fn=generic_seg_map_fn,
245
+ cond_type=cond_type,
246
+ ),
247
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
248
+ max_length=max_length,
249
+ use_variant_cat=True,
250
+ pad_image_to_square=True,
251
+ )
252
+
253
+ coco_vgdseg_dataset = dict(
254
+ type=VGDSegDataset,
255
+ source_data_path=vgdseg_data_root + "coco/annotations/instances_train2017.json",
256
+ data_path=vgdseg_data_root + "annotations/coco_vgdseg_train.json",
257
+ image_folder=vgdseg_data_root + "coco/train2017",
258
+ tokenizer=tokenizer,
259
+ data_mode="train",
260
+ task_name="vgdseg",
261
+ data_name="coco_vgdseg",
262
+ cond_type=cond_type,
263
+ special_tokens=special_tokens,
264
+ extra_image_processor=extra_image_processor,
265
+ image_processor=image_processor,
266
+ dataset_map_fn=dict(
267
+ type=dataset_map_fn_factory,
268
+ fn=vgd_seg_map_fn,
269
+ cond_type=cond_type,
270
+ ),
271
+ template_map_fn=dict(type=template_map_fn_factory, template=prompt_template),
272
+ use_negative_sample=True,
273
+ sample_num=5,
274
+ max_length=max_length,
275
+ pad_image_to_square=True,
276
+ )
277
+
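Both training datasets wrap their map functions with dataset_map_fn_factory, which binds extra keyword arguments (here cond_type) onto the raw map function. Conceptually this is a partial application; a sketch under that assumption (the real factory in xsam may add more logic):

from functools import partial

def dataset_map_fn_factory_sketch(fn, **bound_kwargs):
    # Assumed behaviour: return `fn` with the given kwargs pre-bound.
    return partial(fn, **bound_kwargs)

# e.g. map_fn = dataset_map_fn_factory_sketch(generic_seg_map_fn, cond_type="phrase")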
278
+ # output_ids_with_output: False for predict mode, True for tensor mode
279
+ output_ids_with_output = True
280
+ val_datasets = [
281
+ dict(
282
+ type=GenericSegDataset,
283
+ data_path=genseg_data_root + "coco/annotations/panoptic_val2017.json",
284
+ image_folder=genseg_data_root + "coco/val2017",
285
+ panseg_map_folder=genseg_data_root + "coco/panoptic_val2017",
286
+ semseg_map_folder=genseg_data_root + "coco/panoptic_semseg_val2017",
287
+ data_mode="eval",
288
+ tokenizer=tokenizer,
289
+ task_name="genseg",
290
+ data_name="panoptic_genseg",
291
+ cond_type=cond_type,
292
+ special_tokens=special_tokens,
293
+ extra_image_processor=extra_image_processor,
294
+ image_processor=image_processor,
295
+ output_ids_with_output=output_ids_with_output,
296
+ postprocess_fn=dict(
297
+ type=process_map_fn_factory,
298
+ fn=generic_seg_postprocess_fn,
299
+ task_name="panoptic_genseg",
300
+ threshold=0.0,
301
+ ),
302
+ dataset_map_fn=dict(
303
+ type=dataset_map_fn_factory,
304
+ fn=generic_seg_map_fn,
305
+ cond_type=cond_type,
306
+ ),
307
+ template_map_fn=dict(
308
+ type=template_map_fn_factory,
309
+ template=prompt_template,
310
+ output_suffix=output_ids_with_output,
311
+ ),
312
+ max_length=max_length,
313
+ pad_image_to_square=True,
314
+ ),
315
+ dict(
316
+ type=GenericSegDataset,
317
+ data_path=genseg_data_root + "coco/annotations/panoptic_val2017.json",
318
+ image_folder=genseg_data_root + "coco/val2017",
319
+ panseg_map_folder=genseg_data_root + "coco/panoptic_val2017",
320
+ semseg_map_folder=genseg_data_root + "coco/panoptic_semseg_val2017",
321
+ data_mode="eval",
322
+ tokenizer=tokenizer,
323
+ task_name="genseg",
324
+ data_name="panoptic_genseg",
325
+ output_ids_with_output=output_ids_with_output,
326
+ cond_type=cond_type,
327
+ special_tokens=special_tokens,
328
+ image_processor=image_processor,
329
+ extra_image_processor=extra_image_processor,
330
+ dataset_map_fn=dict(
331
+ type=dataset_map_fn_factory,
332
+ fn=generic_seg_map_fn,
333
+ cond_type=cond_type,
334
+ ),
335
+ postprocess_fn=dict(
336
+ type=process_map_fn_factory,
337
+ fn=generic_seg_postprocess_fn,
338
+ task_name="semantic_genseg",
339
+ ),
340
+ template_map_fn=dict(
341
+ type=template_map_fn_factory,
342
+ template=prompt_template,
343
+ output_suffix=output_ids_with_output,
344
+ ),
345
+ max_length=max_length,
346
+ pad_image_to_square=True,
347
+ ),
348
+ dict(
349
+ type=GenericSegDataset,
350
+ data_path=genseg_data_root + "coco/annotations/instances_val2017.json",
351
+ image_folder=genseg_data_root + "coco/val2017",
352
+ task_name="genseg",
353
+ data_name="instance_genseg",
354
+ data_mode="eval",
355
+ tokenizer=tokenizer,
356
+ output_ids_with_output=output_ids_with_output,
357
+ cond_type=cond_type,
358
+ special_tokens=special_tokens,
359
+ image_processor=image_processor,
360
+ extra_image_processor=extra_image_processor,
361
+ postprocess_fn=dict(
362
+ type=process_map_fn_factory,
363
+ fn=generic_seg_postprocess_fn,
364
+ task_name="instance_genseg",
365
+ threshold=0.0,
366
+ ),
367
+ dataset_map_fn=dict(
368
+ type=dataset_map_fn_factory,
369
+ fn=generic_seg_map_fn,
370
+ cond_type=cond_type,
371
+ ),
372
+ template_map_fn=dict(
373
+ type=template_map_fn_factory,
374
+ template=prompt_template,
375
+ output_suffix=output_ids_with_output,
376
+ ),
377
+ max_length=max_length,
378
+ pad_image_to_square=True,
379
+ ),
380
+ dict(
381
+ type=VGDSegDataset,
382
+ source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
383
+ data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
384
+ image_folder=vgdseg_data_root + "coco/val2017",
385
+ tokenizer=tokenizer,
386
+ task_name="vgdseg",
387
+ data_name="coco_vgdseg_point",
388
+ data_mode="eval",
389
+ visual_prompt_type="point_visual_prompt",
390
+ output_ids_with_output=output_ids_with_output,
391
+ cond_type=cond_type,
392
+ special_tokens=special_tokens,
393
+ extra_image_processor=extra_image_processor,
394
+ image_processor=image_processor,
395
+ postprocess_fn=dict(
396
+ type=process_map_fn_factory,
397
+ fn=vgd_seg_postprocess_fn,
398
+ threshold=0.0,
399
+ return_contiguous_labels=True,
400
+ ),
401
+ dataset_map_fn=dict(
402
+ type=dataset_map_fn_factory,
403
+ fn=vgd_seg_map_fn,
404
+ cond_type=cond_type,
405
+ ),
406
+ template_map_fn=dict(
407
+ type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
408
+ ),
409
+ use_negative_sample=False,
410
+ sample_num=5,
411
+ max_length=max_length,
412
+ pad_image_to_square=True,
413
+ ),
414
+ dict(
415
+ type=VGDSegDataset,
416
+ source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
417
+ data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
418
+ image_folder=vgdseg_data_root + "coco/val2017",
419
+ tokenizer=tokenizer,
420
+ task_name="vgdseg",
421
+ data_name="coco_vgdseg_scribble",
422
+ data_mode="eval",
423
+ visual_prompt_type="scribble_visual_prompt",
424
+ output_ids_with_output=output_ids_with_output,
425
+ cond_type=cond_type,
426
+ special_tokens=special_tokens,
427
+ extra_image_processor=extra_image_processor,
428
+ image_processor=image_processor,
429
+ postprocess_fn=dict(
430
+ type=process_map_fn_factory,
431
+ fn=vgd_seg_postprocess_fn,
432
+ threshold=0.0,
433
+ return_contiguous_labels=True,
434
+ ),
435
+ dataset_map_fn=dict(
436
+ type=dataset_map_fn_factory,
437
+ fn=vgd_seg_map_fn,
438
+ cond_type=cond_type,
439
+ ),
440
+ template_map_fn=dict(
441
+ type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
442
+ ),
443
+ use_negative_sample=False,
444
+ sample_num=5,
445
+ max_length=max_length,
446
+ pad_image_to_square=True,
447
+ ),
448
+ dict(
449
+ type=VGDSegDataset,
450
+ source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
451
+ data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
452
+ image_folder=vgdseg_data_root + "coco/val2017",
453
+ tokenizer=tokenizer,
454
+ task_name="vgdseg",
455
+ data_name="coco_vgdseg_box",
456
+ data_mode="eval",
457
+ visual_prompt_type="box_visual_prompt",
458
+ output_ids_with_output=output_ids_with_output,
459
+ cond_type=cond_type,
460
+ special_tokens=special_tokens,
461
+ extra_image_processor=extra_image_processor,
462
+ image_processor=image_processor,
463
+ postprocess_fn=dict(
464
+ type=process_map_fn_factory,
465
+ fn=vgd_seg_postprocess_fn,
466
+ threshold=0.0,
467
+ return_contiguous_labels=True,
468
+ ),
469
+ dataset_map_fn=dict(
470
+ type=dataset_map_fn_factory,
471
+ fn=vgd_seg_map_fn,
472
+ cond_type=cond_type,
473
+ ),
474
+ template_map_fn=dict(
475
+ type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
476
+ ),
477
+ use_negative_sample=False,
478
+ sample_num=5,
479
+ max_length=max_length,
480
+ pad_image_to_square=True,
481
+ ),
482
+ dict(
483
+ type=VGDSegDataset,
484
+ source_data_path=vgdseg_data_root + "coco/annotations/instances_val2017.json",
485
+ data_path=vgdseg_data_root + "annotations/coco_vgdseg_val.json",
486
+ image_folder=vgdseg_data_root + "coco/val2017",
487
+ tokenizer=tokenizer,
488
+ task_name="vgdseg",
489
+ data_name="coco_vgdseg_mask",
490
+ data_mode="eval",
491
+ visual_prompt_type="mask_visual_prompt",
492
+ output_ids_with_output=output_ids_with_output,
493
+ cond_type=cond_type,
494
+ special_tokens=special_tokens,
495
+ extra_image_processor=extra_image_processor,
496
+ image_processor=image_processor,
497
+ postprocess_fn=dict(
498
+ type=process_map_fn_factory,
499
+ fn=vgd_seg_postprocess_fn,
500
+ threshold=0.0,
501
+ return_contiguous_labels=True,
502
+ ),
503
+ dataset_map_fn=dict(
504
+ type=dataset_map_fn_factory,
505
+ fn=vgd_seg_map_fn,
506
+ cond_type=cond_type,
507
+ ),
508
+ template_map_fn=dict(
509
+ type=template_map_fn_factory, template=prompt_template, output_suffix=output_ids_with_output
510
+ ),
511
+ use_negative_sample=False,
512
+ sample_num=5,
513
+ max_length=max_length,
514
+ pad_image_to_square=True,
515
+ ),
516
+ ]
517
+
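The four VGDSegDataset evaluation entries differ only in data_name and visual_prompt_type. If one wanted to avoid the repetition, they could be generated from a single base dict; a sketch of that alternative (illustrative only, not how the shipped config is written):

from copy import deepcopy

def expand_vgdseg_evals(base_cfg, prompt_types=("point", "scribble", "box", "mask")):
    cfgs = []
    for p in prompt_types:
        cfg = deepcopy(base_cfg)
        cfg["data_name"] = f"coco_vgdseg_{p}"
        cfg["visual_prompt_type"] = f"{p}_visual_prompt"
        cfgs.append(cfg)
    return cfgs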
518
+ val_evaluators = [
519
+ dict(
520
+ type=GenericSegEvaluator,
521
+ distributed=True,
522
+ data_name="panoptic_genseg",
523
+ ),
524
+ dict(
525
+ type=GenericSegEvaluator,
526
+ data_name="semantic_genseg",
527
+ distributed=True,
528
+ ),
529
+ dict(
530
+ type=GenericSegEvaluator,
531
+ data_name="instance_genseg",
532
+ distributed=True,
533
+ ),
534
+ dict(
535
+ type=VGDSegEvaluator,
536
+ data_name="coco_vgdseg_point",
537
+ distributed=True,
538
+ ),
539
+ dict(
540
+ type=VGDSegEvaluator,
541
+ data_name="coco_vgdseg_scribble",
542
+ distributed=True,
543
+ ),
544
+ dict(
545
+ type=VGDSegEvaluator,
546
+ data_name="coco_vgdseg_box",
547
+ distributed=True,
548
+ ),
549
+ dict(
550
+ type=VGDSegEvaluator,
551
+ data_name="coco_vgdseg_mask",
552
+ distributed=True,
553
+ ),
554
+ ]
555
+
556
+ vis_datasets = deepcopy(val_datasets)
559
+ for dataset in vis_datasets:
560
+ if dataset["task_name"] in ["genseg", "ovseg", "vgdseg", "interseg"]:
561
+ dataset["postprocess_fn"]["threshold"] = 0.5 # type: ignore
562
+
563
+ #######################################################################
564
+ # PART 4 Scheduler & Optimizer #
565
+ #######################################################################
566
+ # optimizer
567
+ optim_wrapper = dict(
568
+ type=AmpOptimWrapper,
569
+ optimizer=dict(type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
570
+ clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
571
+ accumulative_counts=accumulative_counts,
572
+ loss_scale="dynamic",
573
+ dtype="float16",
574
+ paramwise_cfg=dict(
575
+ custom_keys={
576
+ "segmentor.encoder": dict(lr_mult=0.1, decay_mult=1.0),
577
+ "visual_encoder": dict(lr_mult=0.1, decay_mult=1.0),
578
+ },
579
+ ),
580
+ )
581
+
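paramwise_cfg keeps the SAM encoder and the SigLIP encoder on a 10x smaller learning rate than the rest of the model. A generic PyTorch illustration of what lr_mult amounts to (mmengine's optimizer wrapper performs this grouping itself; this is not its actual code):

def build_param_groups(model, base_lr, lr_mults):
    # Group parameters by substring match on their name, scaling the LR per group.
    scaled = {key: [] for key in lr_mults}
    default = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        for key in lr_mults:
            if key in name:
                scaled[key].append(param)
                break
        else:
            default.append(param)
    groups = [{"params": default, "lr": base_lr}]
    groups += [{"params": ps, "lr": base_lr * lr_mults[k]} for k, ps in scaled.items() if ps]
    return groups

# e.g. AdamW(build_param_groups(model, lr, {"segmentor.encoder": 0.1, "visual_encoder": 0.1}), ...)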
582
+ # learning policy
583
+ # More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
584
+ param_scheduler = [
585
+ dict(
586
+ type=LinearLR,
587
+ start_factor=1e-5,
588
+ by_epoch=True,
589
+ begin=0,
590
+ end=warmup_ratio * max_epochs,
591
+ convert_to_iter_based=True,
592
+ ),
593
+ dict(
594
+ type=CosineAnnealingLR,
595
+ eta_min=0.0,
596
+ by_epoch=True,
597
+ begin=warmup_ratio * max_epochs,
598
+ end=max_epochs,
599
+ convert_to_iter_based=True,
600
+ ),
601
+ ]
602
+
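The two scheduler entries describe a linear warmup over the first warmup_ratio * max_epochs epochs followed by cosine decay to zero. A small sketch of the resulting learning-rate curve as a function of training progress (assumes the lr and warmup_ratio variables defined earlier in this config):

import math

def lr_at(progress, base_lr, warmup_ratio, start_factor=1e-5):
    # progress in [0, 1] is the fraction of max_epochs completed.
    if progress < warmup_ratio:
        t = progress / warmup_ratio
        return base_lr * (start_factor + (1.0 - start_factor) * t)   # LinearLR warmup
    t = (progress - warmup_ratio) / (1.0 - warmup_ratio)
    return 0.5 * base_lr * (1.0 + math.cos(math.pi * t))             # cosine decay to eta_min=0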
603
+ # train, val, test setting
604
+ train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
605
+
606
+ #######################################################################
607
+ # PART 5 Runtime #
608
+ #######################################################################
609
+ # set visualizer
610
+ visualizer = dict(
611
+ type=Visualizer,
612
+ scale=1.0,
613
+ font_size_scale=1.0,
614
+ )
615
+
616
+ # Custom hooks: model/dataset info reporting, periodic dialogue logging during training (optional), and PT checkpointing
617
+ custom_hooks = [
618
+ dict(
619
+ type=ModelInfoHook,
620
+ module_names=["llm", "visual_encoder", "projector", "connector", "segmentor"],
621
+ display_params=True,
622
+ ),
623
+ dict(type=DatasetInfoHook, tokenizer=tokenizer, special_tokens=special_tokens),
624
+ dict(
625
+ type=EvaluateChatHook,
626
+ tokenizer=tokenizer,
627
+ special_tokens=special_tokens,
628
+ image_processor=image_processor,
629
+ postprocess_fns=[
630
+ None,
631
+ generic_seg_postprocess_fn,
632
+ refer_seg_postprocess_fn,
633
+ reason_seg_postprocess_fn,
634
+ gcg_seg_postprocess_fn,
635
+ inter_seg_postprocess_fn,
636
+ inter_seg_postprocess_fn,
637
+ inter_seg_postprocess_fn,
638
+ inter_seg_postprocess_fn,
639
+ vgd_seg_postprocess_fn,
640
+ vgd_seg_postprocess_fn,
641
+ vgd_seg_postprocess_fn,
642
+ vgd_seg_postprocess_fn,
643
+ vgd_seg_postprocess_fn,
644
+ ],
645
+ extra_image_processor=extra_image_processor,
646
+ visualizer=visualizer,
647
+ every_n_iters=evaluation_freq,
648
+ evaluation_inputs=evaluation_inputs,
649
+ evaluation_images=evaluation_images,
650
+ vprompt_masks=vprompt_masks,
651
+ system=SYSTEM,
652
+ prompt_template=prompt_template,
653
+ ),
654
+ dict(type=PTCheckpointHook),
655
+ ]
656
+
657
+ # configure default hooks
658
+ default_hooks = dict(
659
+ # record the time of every iteration.
660
+ timer=dict(type=IterTimerHook),
661
+ # print log every logging_interval iterations.
662
+ logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=logging_interval),
663
+ # enable the parameter scheduler.
664
+ param_scheduler=dict(type=ParamSchedulerHook),
665
+ # save checkpoint per `save_steps`.
666
+ checkpoint=dict(
667
+ type=CheckpointHook,
668
+ by_epoch=False,
669
+ interval=save_steps,
670
+ max_keep_ckpts=save_total_limit,
671
+ ),
672
+ # set sampler seed in distributed environment.
673
+ sampler_seed=dict(type=DistSamplerSeedHook),
674
+ )
675
+
676
+ # configure environment
677
+ env_cfg = dict(
678
+ # whether to enable cudnn benchmark
679
+ cudnn_benchmark=False,
680
+ # set multi process parameters
681
+ mp_cfg=dict(mp_start_method="fork", opencv_num_threads=0),
682
+ # set distributed parameters
683
+ dist_cfg=dict(backend="nccl"),
684
+ )
685
+
686
+ # set log level
687
+ log_level = "INFO"
688
+
689
+ # load from which checkpoint
690
+ load_from = None
691
+
692
+ # whether to resume training from the loaded checkpoint
693
+ resume = False
694
+
695
+ # Default to a random seed and disable `deterministic`
696
+ randomness = dict(seed=None, deterministic=False)
697
+
698
+ # set log processor
699
+ log_processor = dict(
700
+ by_epoch=False,
701
+ window_size=1,
702
+ mean_pattern=r".*(loss|time|data_time|grad_norm|tflops).*",
703
+ )
vgdseg_annotations/coco_vgdseg_train.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cd3675dff40773835bb8bcc0af2a33855f5bda6e15f873320a5667147934a92
3
+ size 1388731793
vgdseg_annotations/coco_vgdseg_val.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39089126330dc2e72fd03f472e37ffab6273ce605b9c6415a4e6edd53a645f21
3
+ size 58943447