| 0 |
bert.embeddings.word_embeddings |
Embedding |
weight |
[30522, 768] |
23440896 |
23440896 |
0 |
| 1 |
bert.embeddings.position_embeddings |
Embedding |
weight |
[512, 768] |
393216 |
393216 |
0 |
| 2 |
bert.embeddings.token_type_embeddings |
Embedding |
weight |
[2, 768] |
1536 |
1536 |
0 |
| 3 |
bert.embeddings.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 4 |
bert.embeddings.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 5 |
bert.encoder.layer.0.attention.self.query |
Linear |
weight |
[320, 768] |
245760 |
135168 |
0.45 |
| 6 |
bert.encoder.layer.0.attention.self.query |
Linear |
bias |
[320] |
320 |
256 |
0.2 |
| 7 |
bert.encoder.layer.0.attention.self.key |
Linear |
weight |
[320, 768] |
245760 |
149504 |
0.391667 |
| 8 |
bert.encoder.layer.0.attention.self.key |
Linear |
bias |
[320] |
320 |
256 |
0.2 |
| 9 |
bert.encoder.layer.0.attention.self.value |
Linear |
weight |
[320, 768] |
245760 |
173056 |
0.295833 |
| 10 |
bert.encoder.layer.0.attention.self.value |
Linear |
bias |
[320] |
320 |
256 |
0.2 |
| 11 |
bert.encoder.layer.0.attention.output.dense |
Linear |
weight |
[768, 320] |
245760 |
181248 |
0.2625 |
| 12 |
bert.encoder.layer.0.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 13 |
bert.encoder.layer.0.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 14 |
bert.encoder.layer.0.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 15 |
bert.encoder.layer.0.intermediate.dense |
Linear |
weight |
[185, 768] |
142080 |
142080 |
0 |
| 16 |
bert.encoder.layer.0.intermediate.dense |
Linear |
bias |
[185] |
185 |
185 |
0 |
| 17 |
bert.encoder.layer.0.output.dense |
Linear |
weight |
[768, 185] |
142080 |
142080 |
0 |
| 18 |
bert.encoder.layer.0.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 19 |
bert.encoder.layer.0.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 20 |
bert.encoder.layer.0.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 21 |
bert.encoder.layer.1.attention.self.query |
Linear |
weight |
[320, 768] |
245760 |
175104 |
0.2875 |
| 22 |
bert.encoder.layer.1.attention.self.query |
Linear |
bias |
[320] |
320 |
288 |
0.1 |
| 23 |
bert.encoder.layer.1.attention.self.key |
Linear |
weight |
[320, 768] |
245760 |
177152 |
0.279167 |
| 24 |
bert.encoder.layer.1.attention.self.key |
Linear |
bias |
[320] |
320 |
288 |
0.1 |
| 25 |
bert.encoder.layer.1.attention.self.value |
Linear |
weight |
[320, 768] |
245760 |
166912 |
0.320833 |
| 26 |
bert.encoder.layer.1.attention.self.value |
Linear |
bias |
[320] |
320 |
288 |
0.1 |
| 27 |
bert.encoder.layer.1.attention.output.dense |
Linear |
weight |
[768, 320] |
245760 |
167936 |
0.316667 |
| 28 |
bert.encoder.layer.1.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 29 |
bert.encoder.layer.1.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 30 |
bert.encoder.layer.1.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 31 |
bert.encoder.layer.1.intermediate.dense |
Linear |
weight |
[315, 768] |
241920 |
241920 |
0 |
| 32 |
bert.encoder.layer.1.intermediate.dense |
Linear |
bias |
[315] |
315 |
315 |
0 |
| 33 |
bert.encoder.layer.1.output.dense |
Linear |
weight |
[768, 315] |
241920 |
241920 |
0 |
| 34 |
bert.encoder.layer.1.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 35 |
bert.encoder.layer.1.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 36 |
bert.encoder.layer.1.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 37 |
bert.encoder.layer.2.attention.self.query |
Linear |
weight |
[576, 768] |
442368 |
285696 |
0.354167 |
| 38 |
bert.encoder.layer.2.attention.self.query |
Linear |
bias |
[576] |
576 |
480 |
0.166667 |
| 39 |
bert.encoder.layer.2.attention.self.key |
Linear |
weight |
[576, 768] |
442368 |
297984 |
0.326389 |
| 40 |
bert.encoder.layer.2.attention.self.key |
Linear |
bias |
[576] |
576 |
480 |
0.166667 |
| 41 |
bert.encoder.layer.2.attention.self.value |
Linear |
weight |
[576, 768] |
442368 |
226304 |
0.488426 |
| 42 |
bert.encoder.layer.2.attention.self.value |
Linear |
bias |
[576] |
576 |
384 |
0.333333 |
| 43 |
bert.encoder.layer.2.attention.output.dense |
Linear |
weight |
[768, 576] |
442368 |
237568 |
0.462963 |
| 44 |
bert.encoder.layer.2.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 45 |
bert.encoder.layer.2.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 46 |
bert.encoder.layer.2.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 47 |
bert.encoder.layer.2.intermediate.dense |
Linear |
weight |
[339, 768] |
260352 |
260352 |
0 |
| 48 |
bert.encoder.layer.2.intermediate.dense |
Linear |
bias |
[339] |
339 |
339 |
0 |
| 49 |
bert.encoder.layer.2.output.dense |
Linear |
weight |
[768, 339] |
260352 |
260352 |
0 |
| 50 |
bert.encoder.layer.2.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 51 |
bert.encoder.layer.2.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 52 |
bert.encoder.layer.2.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 53 |
bert.encoder.layer.3.attention.self.query |
Linear |
weight |
[576, 768] |
442368 |
277504 |
0.372685 |
| 54 |
bert.encoder.layer.3.attention.self.query |
Linear |
bias |
[576] |
576 |
512 |
0.111111 |
| 55 |
bert.encoder.layer.3.attention.self.key |
Linear |
weight |
[576, 768] |
442368 |
303104 |
0.314815 |
| 56 |
bert.encoder.layer.3.attention.self.key |
Linear |
bias |
[576] |
576 |
512 |
0.111111 |
| 57 |
bert.encoder.layer.3.attention.self.value |
Linear |
weight |
[576, 768] |
442368 |
297984 |
0.326389 |
| 58 |
bert.encoder.layer.3.attention.self.value |
Linear |
bias |
[576] |
576 |
512 |
0.111111 |
| 59 |
bert.encoder.layer.3.attention.output.dense |
Linear |
weight |
[768, 576] |
442368 |
308224 |
0.303241 |
| 60 |
bert.encoder.layer.3.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 61 |
bert.encoder.layer.3.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 62 |
bert.encoder.layer.3.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 63 |
bert.encoder.layer.3.intermediate.dense |
Linear |
weight |
[368, 768] |
282624 |
282624 |
0 |
| 64 |
bert.encoder.layer.3.intermediate.dense |
Linear |
bias |
[368] |
368 |
368 |
0 |
| 65 |
bert.encoder.layer.3.output.dense |
Linear |
weight |
[768, 368] |
282624 |
282624 |
0 |
| 66 |
bert.encoder.layer.3.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 67 |
bert.encoder.layer.3.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 68 |
bert.encoder.layer.3.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 69 |
bert.encoder.layer.4.attention.self.query |
Linear |
weight |
[576, 768] |
442368 |
291840 |
0.340278 |
| 70 |
bert.encoder.layer.4.attention.self.query |
Linear |
bias |
[576] |
576 |
544 |
0.0555555 |
| 71 |
bert.encoder.layer.4.attention.self.key |
Linear |
weight |
[576, 768] |
442368 |
310272 |
0.298611 |
| 72 |
bert.encoder.layer.4.attention.self.key |
Linear |
bias |
[576] |
576 |
544 |
0.0555555 |
| 73 |
bert.encoder.layer.4.attention.self.value |
Linear |
weight |
[576, 768] |
442368 |
272384 |
0.384259 |
| 74 |
bert.encoder.layer.4.attention.self.value |
Linear |
bias |
[576] |
576 |
480 |
0.166667 |
| 75 |
bert.encoder.layer.4.attention.output.dense |
Linear |
weight |
[768, 576] |
442368 |
263168 |
0.405093 |
| 76 |
bert.encoder.layer.4.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 77 |
bert.encoder.layer.4.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 78 |
bert.encoder.layer.4.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 79 |
bert.encoder.layer.4.intermediate.dense |
Linear |
weight |
[386, 768] |
296448 |
296448 |
0 |
| 80 |
bert.encoder.layer.4.intermediate.dense |
Linear |
bias |
[386] |
386 |
386 |
0 |
| 81 |
bert.encoder.layer.4.output.dense |
Linear |
weight |
[768, 386] |
296448 |
296448 |
0 |
| 82 |
bert.encoder.layer.4.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 83 |
bert.encoder.layer.4.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 84 |
bert.encoder.layer.4.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 85 |
bert.encoder.layer.5.attention.self.query |
Linear |
weight |
[384, 768] |
294912 |
171008 |
0.420139 |
| 86 |
bert.encoder.layer.5.attention.self.query |
Linear |
bias |
[384] |
384 |
352 |
0.0833333 |
| 87 |
bert.encoder.layer.5.attention.self.key |
Linear |
weight |
[384, 768] |
294912 |
205824 |
0.302083 |
| 88 |
bert.encoder.layer.5.attention.self.key |
Linear |
bias |
[384] |
384 |
352 |
0.0833333 |
| 89 |
bert.encoder.layer.5.attention.self.value |
Linear |
weight |
[384, 768] |
294912 |
217088 |
0.263889 |
| 90 |
bert.encoder.layer.5.attention.self.value |
Linear |
bias |
[384] |
384 |
384 |
0 |
| 91 |
bert.encoder.layer.5.attention.output.dense |
Linear |
weight |
[768, 384] |
294912 |
223232 |
0.243056 |
| 92 |
bert.encoder.layer.5.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 93 |
bert.encoder.layer.5.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 94 |
bert.encoder.layer.5.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 95 |
bert.encoder.layer.5.intermediate.dense |
Linear |
weight |
[336, 768] |
258048 |
258048 |
0 |
| 96 |
bert.encoder.layer.5.intermediate.dense |
Linear |
bias |
[336] |
336 |
336 |
0 |
| 97 |
bert.encoder.layer.5.output.dense |
Linear |
weight |
[768, 336] |
258048 |
258048 |
0 |
| 98 |
bert.encoder.layer.5.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 99 |
bert.encoder.layer.5.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 100 |
bert.encoder.layer.5.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 101 |
bert.encoder.layer.6.attention.self.query |
Linear |
weight |
[448, 768] |
344064 |
192512 |
0.440476 |
| 102 |
bert.encoder.layer.6.attention.self.query |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 103 |
bert.encoder.layer.6.attention.self.key |
Linear |
weight |
[448, 768] |
344064 |
224256 |
0.348214 |
| 104 |
bert.encoder.layer.6.attention.self.key |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 105 |
bert.encoder.layer.6.attention.self.value |
Linear |
weight |
[448, 768] |
344064 |
209920 |
0.389881 |
| 106 |
bert.encoder.layer.6.attention.self.value |
Linear |
bias |
[448] |
448 |
352 |
0.214286 |
| 107 |
bert.encoder.layer.6.attention.output.dense |
Linear |
weight |
[768, 448] |
344064 |
199680 |
0.419643 |
| 108 |
bert.encoder.layer.6.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 109 |
bert.encoder.layer.6.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 110 |
bert.encoder.layer.6.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 111 |
bert.encoder.layer.6.intermediate.dense |
Linear |
weight |
[280, 768] |
215040 |
215040 |
0 |
| 112 |
bert.encoder.layer.6.intermediate.dense |
Linear |
bias |
[280] |
280 |
280 |
0 |
| 113 |
bert.encoder.layer.6.output.dense |
Linear |
weight |
[768, 280] |
215040 |
215040 |
0 |
| 114 |
bert.encoder.layer.6.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 115 |
bert.encoder.layer.6.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 116 |
bert.encoder.layer.6.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 117 |
bert.encoder.layer.7.attention.self.query |
Linear |
weight |
[448, 768] |
344064 |
201728 |
0.41369 |
| 118 |
bert.encoder.layer.7.attention.self.query |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 119 |
bert.encoder.layer.7.attention.self.key |
Linear |
weight |
[448, 768] |
344064 |
237568 |
0.309524 |
| 120 |
bert.encoder.layer.7.attention.self.key |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 121 |
bert.encoder.layer.7.attention.self.value |
Linear |
weight |
[448, 768] |
344064 |
218112 |
0.366071 |
| 122 |
bert.encoder.layer.7.attention.self.value |
Linear |
bias |
[448] |
448 |
352 |
0.214286 |
| 123 |
bert.encoder.layer.7.attention.output.dense |
Linear |
weight |
[768, 448] |
344064 |
202752 |
0.410714 |
| 124 |
bert.encoder.layer.7.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 125 |
bert.encoder.layer.7.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 126 |
bert.encoder.layer.7.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 127 |
bert.encoder.layer.7.intermediate.dense |
Linear |
weight |
[211, 768] |
162048 |
162048 |
0 |
| 128 |
bert.encoder.layer.7.intermediate.dense |
Linear |
bias |
[211] |
211 |
211 |
0 |
| 129 |
bert.encoder.layer.7.output.dense |
Linear |
weight |
[768, 211] |
162048 |
162048 |
0 |
| 130 |
bert.encoder.layer.7.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 131 |
bert.encoder.layer.7.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 132 |
bert.encoder.layer.7.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 133 |
bert.encoder.layer.8.attention.self.query |
Linear |
weight |
[448, 768] |
344064 |
186368 |
0.458333 |
| 134 |
bert.encoder.layer.8.attention.self.query |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 135 |
bert.encoder.layer.8.attention.self.key |
Linear |
weight |
[448, 768] |
344064 |
197632 |
0.425595 |
| 136 |
bert.encoder.layer.8.attention.self.key |
Linear |
bias |
[448] |
448 |
416 |
0.0714285 |
| 137 |
bert.encoder.layer.8.attention.self.value |
Linear |
weight |
[448, 768] |
344064 |
154624 |
0.550595 |
| 138 |
bert.encoder.layer.8.attention.self.value |
Linear |
bias |
[448] |
448 |
288 |
0.357143 |
| 139 |
bert.encoder.layer.8.attention.output.dense |
Linear |
weight |
[768, 448] |
344064 |
148480 |
0.568452 |
| 140 |
bert.encoder.layer.8.attention.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 141 |
bert.encoder.layer.8.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 142 |
bert.encoder.layer.8.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 143 |
bert.encoder.layer.8.intermediate.dense |
Linear |
weight |
[108, 768] |
82944 |
82944 |
0 |
| 144 |
bert.encoder.layer.8.intermediate.dense |
Linear |
bias |
[108] |
108 |
108 |
0 |
| 145 |
bert.encoder.layer.8.output.dense |
Linear |
weight |
[768, 108] |
82944 |
82944 |
0 |
| 146 |
bert.encoder.layer.8.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 147 |
bert.encoder.layer.8.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 148 |
bert.encoder.layer.8.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 149 |
bert.encoder.layer.9.attention.self.query |
Linear |
weight |
[320, 768] |
245760 |
144384 |
0.4125 |
| 150 |
bert.encoder.layer.9.attention.self.query |
Linear |
bias |
[320] |
320 |
288 |
0.1 |
| 151 |
bert.encoder.layer.9.attention.self.key |
Linear |
weight |
[320, 768] |
245760 |
155648 |
0.366667 |
| 152 |
bert.encoder.layer.9.attention.self.key |
Linear |
bias |
[320] |
320 |
288 |
0.1 |
| 153 |
bert.encoder.layer.9.attention.self.value |
Linear |
weight |
[320, 768] |
245760 |
63488 |
0.741667 |
| 154 |
bert.encoder.layer.9.attention.self.value |
Linear |
bias |
[320] |
320 |
160 |
0.5 |
| 155 |
bert.encoder.layer.9.attention.output.dense |
Linear |
weight |
[768, 320] |
245760 |
65536 |
0.733333 |
| 156 |
bert.encoder.layer.9.attention.output.dense |
Linear |
bias |
[768] |
768 |
704 |
0.0833333 |
| 157 |
bert.encoder.layer.9.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 158 |
bert.encoder.layer.9.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 159 |
bert.encoder.layer.9.intermediate.dense |
Linear |
weight |
[53, 768] |
40704 |
40704 |
0 |
| 160 |
bert.encoder.layer.9.intermediate.dense |
Linear |
bias |
[53] |
53 |
53 |
0 |
| 161 |
bert.encoder.layer.9.output.dense |
Linear |
weight |
[768, 53] |
40704 |
40704 |
0 |
| 162 |
bert.encoder.layer.9.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 163 |
bert.encoder.layer.9.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 164 |
bert.encoder.layer.9.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 165 |
bert.encoder.layer.10.attention.self.query |
Linear |
weight |
[384, 768] |
294912 |
158720 |
0.461806 |
| 166 |
bert.encoder.layer.10.attention.self.query |
Linear |
bias |
[384] |
384 |
320 |
0.166667 |
| 167 |
bert.encoder.layer.10.attention.self.key |
Linear |
weight |
[384, 768] |
294912 |
158720 |
0.461806 |
| 168 |
bert.encoder.layer.10.attention.self.key |
Linear |
bias |
[384] |
384 |
320 |
0.166667 |
| 169 |
bert.encoder.layer.10.attention.self.value |
Linear |
weight |
[384, 768] |
294912 |
77824 |
0.736111 |
| 170 |
bert.encoder.layer.10.attention.self.value |
Linear |
bias |
[384] |
384 |
192 |
0.5 |
| 171 |
bert.encoder.layer.10.attention.output.dense |
Linear |
weight |
[768, 384] |
294912 |
78848 |
0.732639 |
| 172 |
bert.encoder.layer.10.attention.output.dense |
Linear |
bias |
[768] |
768 |
736 |
0.0416666 |
| 173 |
bert.encoder.layer.10.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 174 |
bert.encoder.layer.10.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 175 |
bert.encoder.layer.10.intermediate.dense |
Linear |
weight |
[86, 768] |
66048 |
66048 |
0 |
| 176 |
bert.encoder.layer.10.intermediate.dense |
Linear |
bias |
[86] |
86 |
86 |
0 |
| 177 |
bert.encoder.layer.10.output.dense |
Linear |
weight |
[768, 86] |
66048 |
66048 |
0 |
| 178 |
bert.encoder.layer.10.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 179 |
bert.encoder.layer.10.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 180 |
bert.encoder.layer.10.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 181 |
bert.encoder.layer.11.attention.self.query |
Linear |
weight |
[384, 768] |
294912 |
107520 |
0.635417 |
| 182 |
bert.encoder.layer.11.attention.self.query |
Linear |
bias |
[384] |
384 |
256 |
0.333333 |
| 183 |
bert.encoder.layer.11.attention.self.key |
Linear |
weight |
[384, 768] |
294912 |
118784 |
0.597222 |
| 184 |
bert.encoder.layer.11.attention.self.key |
Linear |
bias |
[384] |
384 |
256 |
0.333333 |
| 185 |
bert.encoder.layer.11.attention.self.value |
Linear |
weight |
[384, 768] |
294912 |
62464 |
0.788194 |
| 186 |
bert.encoder.layer.11.attention.self.value |
Linear |
bias |
[384] |
384 |
192 |
0.5 |
| 187 |
bert.encoder.layer.11.attention.output.dense |
Linear |
weight |
[768, 384] |
294912 |
54272 |
0.815972 |
| 188 |
bert.encoder.layer.11.attention.output.dense |
Linear |
bias |
[768] |
768 |
672 |
0.125 |
| 189 |
bert.encoder.layer.11.attention.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 190 |
bert.encoder.layer.11.attention.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 191 |
bert.encoder.layer.11.intermediate.dense |
Linear |
weight |
[105, 768] |
80640 |
80640 |
0 |
| 192 |
bert.encoder.layer.11.intermediate.dense |
Linear |
bias |
[105] |
105 |
105 |
0 |
| 193 |
bert.encoder.layer.11.output.dense |
Linear |
weight |
[768, 105] |
80640 |
80640 |
0 |
| 194 |
bert.encoder.layer.11.output.dense |
Linear |
bias |
[768] |
768 |
768 |
0 |
| 195 |
bert.encoder.layer.11.output.LayerNorm |
LayerNorm |
weight |
[768] |
768 |
768 |
0 |
| 196 |
bert.encoder.layer.11.output.LayerNorm |
LayerNorm |
bias |
[768] |
768 |
768 |
0 |
| 197 |
qa_outputs |
Linear |
weight |
[2, 768] |
1536 |
1536 |
0 |
| 198 |
qa_outputs |
Linear |
bias |
[2] |
2 |
2 |
0 |