alibabasglab commited on
Commit
0948da4
·
verified ·
1 Parent(s): 4ea35be

Delete checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk

Browse files
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/config.yaml DELETED
@@ -1,59 +0,0 @@
1
- ## Config file
2
-
3
- # Log
4
- seed: 777
5
- use_cuda: 1 # 1 for True, 0 for False
6
-
7
- # dataset
8
- speaker_no: 2
9
- mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
10
- audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
11
- reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
12
- # mix_lst_path: ./data/LRS2/mixture_data_list_2mix.csv
13
- # audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/audio_clean/
14
- # reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/mvlrs_v1/
15
- # mix_lst_path: ./data/LRS3/mixture_data_list_2mix.csv
16
- # audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/audio_clean/
17
- # reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/orig/
18
- audio_sr: 16000
19
- ref_sr: 25
20
-
21
- # dataloader
22
- num_workers: 2
23
- batch_size: 1 # 4-GPU training with a total effective batch size of 8
24
- accu_grad: 1
25
- effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
26
- max_length: 3 # truncate the utterances in dataloader, in seconds
27
-
28
- # network settings
29
- init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
30
- causal: 0 # 1 for True, 0 for False
31
- network_reference:
32
- cue: lip # lip or speech or gesture or EEG
33
- backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
34
- emb_size: 256 # resnet18:256
35
- network_audio:
36
- backbone: av_tfgridnet_isam
37
- n_fft: 256
38
- stride: 128
39
- window: "hann"
40
- use_builtin_complex: False
41
- n_srcs: 1
42
- n_imics: 1
43
- n_layers: 6
44
- lstm_hidden_units: 192
45
- attn_n_head: 4
46
- attn_qk_output_channel: 4
47
- emb_dim: 48
48
- emb_ks: 4
49
- emb_hs: 1
50
- activation: "prelu"
51
- isam: 1
52
-
53
- # optimizer
54
- spk_att_dropout: 1 # 0 for always use speaker attention
55
- loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
56
- lr_warmup: 1
57
- init_learning_rate: 0.0005
58
- max_epoch: 150
59
- clip_grad_norm: 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_best_checkpoint.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:99ab899c261721b450ae9c494b71cf33c3e08f962c458760c14cfc5cf4659c89
3
- size 162585486
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_checkpoint.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2bd23023c6ba194a49e46a5c16798184fe1a5b7f38b224914f4a49a6f5a0cb1
3
- size 162576686
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-23(15:50:05).txt DELETED
@@ -1,762 +0,0 @@
1
- ## Config file
2
-
3
- # Log
4
- seed: 777
5
- use_cuda: 1 # 1 for True, 0 for False
6
-
7
- # dataset
8
- speaker_no: 2
9
- mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
10
- audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
11
- reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
12
- audio_sr: 16000
13
- ref_sr: 25
14
-
15
- # dataloader
16
- num_workers: 2
17
- batch_size: 1 # 4-GPU training with a total effective batch size of 8
18
- accu_grad: 1
19
- effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
20
- max_length: 3 # truncate the utterances in dataloader, in seconds
21
-
22
- # network settings
23
- init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
24
- causal: 0 # 1 for True, 0 for False
25
- network_reference:
26
- cue: lip # lip or speech or gesture or EEG
27
- backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
28
- emb_size: 256 # resnet18:256
29
- network_audio:
30
- backbone: av_tfgridnet_att_ss
31
- n_fft: 256
32
- stride: 128
33
- window: "hann"
34
- use_builtin_complex: False
35
- n_srcs: 1
36
- n_imics: 1
37
- n_layers: 6
38
- lstm_hidden_units: 192
39
- attn_n_head: 4
40
- attn_qk_output_channel: 4
41
- emb_dim: 48
42
- emb_ks: 4
43
- emb_hs: 1
44
- activation: "prelu"
45
-
46
- # optimizer
47
- spk_att_dropout: 1 # 0 for always use speaker attention
48
- loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
49
- lr_warmup: 1
50
- init_learning_rate: 0.0005
51
- max_epoch: 150
52
- clip_grad_norm: 5
53
- W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779]
54
- W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] *****************************************
55
- W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
56
- W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] *****************************************
57
- [W1223 15:50:57.605401514 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
58
- [W1223 15:50:57.606194396 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
59
- [W1223 15:50:57.605414589 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
60
- [W1223 15:50:57.606215894 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
61
- [W1223 15:50:57.605418359 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
62
- [W1223 15:50:57.606241166 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
63
- [W1223 15:50:57.605453172 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
64
- [W1223 15:50:57.606286008 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
65
- started on checkpoints/log_2024-12-23(15:50:05)
66
-
67
- namespace(accu_grad=1, audio_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/', audio_sr=16000, batch_size=1, causal=0, checkpoint_dir='checkpoints/log_2024-12-23(15:50:05)', clip_grad_norm=5.0, config=[<yamlargparse.Path object at 0x7fdc6754ee80>], device=device(type='cuda'), distributed=True, effec_batch_size=2, evaluate_only=0, init_from='checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk', init_learning_rate=0.0005, local_rank=0, loss_type='ss_sisdr', lr_warmup=1, max_epoch=150, max_length=3, mix_lst_path='./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv', network_audio=namespace(activation='prelu', attn_n_head=4, attn_qk_output_channel=4, backbone='av_tfgridnet_att_ss', emb_dim=48, emb_hs=1, emb_ks=4, lstm_hidden_units=192, n_fft=256, n_imics=1, n_layers=6, n_srcs=1, stride=128, use_builtin_complex=False, window='hann'), network_reference=namespace(backbone='resnet18', cue='lip', emb_size=256), num_workers=2, ref_sr=25, reference_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/', seed=777, speaker_no=2, spk_att_dropout=1, train_from_last_checkpoint=0, use_cuda=1, world_size=4)
68
- network_wrapper(
69
- (sep_network): av_TFGridNetV3_att_ss(
70
- (enc): STFTEncoder(
71
- (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
72
- )
73
- (dec): STFTDecoder(
74
- (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
75
- )
76
- (conv): Sequential(
77
- (0): Conv2d(2, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
78
- (1): GroupNorm(1, 48, eps=1e-05, affine=True)
79
- )
80
- (blocks): ModuleList(
81
- (0-5): 6 x GridNetV3Block(
82
- (intra_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
83
- (intra_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
84
- (intra_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
85
- (inter_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
86
- (inter_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
87
- (inter_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
88
- (attn_conv_Q): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
89
- (attn_norm_Q): AllHeadPReLULayerNormalization4DC(
90
- (act): PReLU(num_parameters=4)
91
- )
92
- (attn_conv_K): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
93
- (attn_norm_K): AllHeadPReLULayerNormalization4DC(
94
- (act): PReLU(num_parameters=4)
95
- )
96
- (attn_conv_V): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
97
- (attn_norm_V): AllHeadPReLULayerNormalization4DC(
98
- (act): PReLU(num_parameters=4)
99
- )
100
- (attn_concat_proj): Sequential(
101
- (0): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
102
- (1): PReLU(num_parameters=1)
103
- (2): LayerNormalization()
104
- )
105
- (spk_att): TransformerEncoder(
106
- (layers): ModuleList(
107
- (0): TransformerEncoderLayer(
108
- (self_attn): MultiheadAttention(
109
- (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
110
- )
111
- (linear1): Linear(in_features=48, out_features=192, bias=True)
112
- (dropout): Dropout(p=0.1, inplace=False)
113
- (linear2): Linear(in_features=192, out_features=48, bias=True)
114
- (norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
115
- (norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
116
- (dropout1): Dropout(p=0.1, inplace=False)
117
- (dropout2): Dropout(p=0.1, inplace=False)
118
- )
119
- )
120
- )
121
- (spk_norm): GroupNorm(1, 48, eps=1e-08, affine=True)
122
- )
123
- )
124
- (deconv): ConvTranspose2d(48, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
125
- (av_conv): ModuleList(
126
- (0-5): 6 x Linear(in_features=304, out_features=48, bias=True)
127
- )
128
- )
129
- (ref_encoder): Visual_encoder(
130
- (v_frontend): VisualFrontend(
131
- (frontend3D): Sequential(
132
- (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
133
- (1): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
134
- (2): ReLU()
135
- (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
136
- )
137
- (resnet): ResNet(
138
- (layer1): ResNetLayer(
139
- (conv1a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
140
- (bn1a): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
141
- (conv2a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
142
- (downsample): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
143
- (outbna): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
144
- (conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
145
- (bn1b): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
146
- (conv2b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
147
- (outbnb): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
148
- )
149
- (layer2): ResNetLayer(
150
- (conv1a): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
151
- (bn1a): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
152
- (conv2a): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
153
- (downsample): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
154
- (outbna): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
155
- (conv1b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
156
- (bn1b): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
157
- (conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
158
- (outbnb): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
159
- )
160
- (layer3): ResNetLayer(
161
- (conv1a): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
162
- (bn1a): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
163
- (conv2a): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
164
- (downsample): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
165
- (outbna): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
166
- (conv1b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
167
- (bn1b): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
168
- (conv2b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
169
- (outbnb): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
170
- )
171
- (layer4): ResNetLayer(
172
- (conv1a): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
173
- (bn1a): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
174
- (conv2a): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
175
- (downsample): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
176
- (outbna): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
177
- (conv1b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
178
- (bn1b): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
179
- (conv2b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
180
- (outbnb): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
181
- )
182
- (avgpool): AvgPool2d(kernel_size=(4, 4), stride=(1, 1), padding=0)
183
- )
184
- )
185
- (v_ds): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
186
- (visual_conv): Sequential(
187
- (0): VisualConv1D(
188
- (relu_0): ReLU()
189
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
190
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
191
- (relu): ReLU()
192
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
193
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
194
- (prelu): PReLU(num_parameters=1)
195
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
196
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
197
- )
198
- (1): VisualConv1D(
199
- (relu_0): ReLU()
200
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
201
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
202
- (relu): ReLU()
203
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
204
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
205
- (prelu): PReLU(num_parameters=1)
206
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
207
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
208
- )
209
- (2): VisualConv1D(
210
- (relu_0): ReLU()
211
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
212
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
213
- (relu): ReLU()
214
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
215
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
216
- (prelu): PReLU(num_parameters=1)
217
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
218
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
219
- )
220
- (3): VisualConv1D(
221
- (relu_0): ReLU()
222
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
223
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
224
- (relu): ReLU()
225
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
226
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
227
- (prelu): PReLU(num_parameters=1)
228
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
229
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
230
- )
231
- (4): VisualConv1D(
232
- (relu_0): ReLU()
233
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
234
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
235
- (relu): ReLU()
236
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
237
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
238
- (prelu): PReLU(num_parameters=1)
239
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
240
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
241
- )
242
- )
243
- )
244
- )
245
-
246
- Total number of parameters: 20950309
247
-
248
-
249
- Total number of trainable parameters: 9765221
250
-
251
- dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
252
- dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
253
- dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
254
- dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
255
- dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO cudaDriverVersion 11040
256
- dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO cudaDriverVersion 11040
257
- dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO cudaDriverVersion 11040
258
- dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO cudaDriverVersion 11040
259
- NCCL version 2.20.5+cuda11.8
260
- dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
261
- dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
262
- dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
263
- dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
264
- dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
265
- dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
266
- dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
267
- dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
268
- dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
269
- dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
270
- dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
271
- dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
272
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
273
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
274
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
275
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
276
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_IB_HCA set to mlx5
277
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_IB_HCA set to mlx5
278
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_IB_HCA set to mlx5
279
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_IB_HCA set to mlx5
280
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
281
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
282
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
283
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
284
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
285
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
286
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
287
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
288
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
289
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
290
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
291
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
292
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
293
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
294
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
295
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
296
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
297
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
298
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
299
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
300
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
301
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
302
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
303
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
304
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
305
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
306
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
307
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
308
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
309
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
310
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
311
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
312
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
313
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
314
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
315
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
316
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
317
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
318
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
319
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
320
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
321
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
322
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
323
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
324
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
325
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
326
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
327
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
328
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
329
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
330
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
331
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
332
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
333
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
334
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
335
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
336
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
337
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Using non-device net plugin version 0
338
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Using network IB
339
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
340
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Using non-device net plugin version 0
341
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Using network IB
342
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
343
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Using non-device net plugin version 0
344
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Using network IB
345
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
346
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Using non-device net plugin version 0
347
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Using network IB
348
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xafe80e26c24ecacf - Init START
349
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xafe80e26c24ecacf - Init START
350
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xafe80e26c24ecacf - Init START
351
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xafe80e26c24ecacf - Init START
352
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
353
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
354
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
355
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
356
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
357
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
358
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
359
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
360
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
361
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
362
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
363
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 00/12 : 0 1 2 3
364
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1
365
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 01/12 : 0 1 2 3
366
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO P2P Chunksize set to 524288
367
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0
368
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 02/12 : 0 1 2 3
369
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2
370
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 03/12 : 0 1 2 3
371
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO P2P Chunksize set to 524288
372
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO P2P Chunksize set to 524288
373
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 04/12 : 0 1 2 3
374
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 05/12 : 0 1 2 3
375
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 06/12 : 0 1 2 3
376
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 07/12 : 0 1 2 3
377
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 08/12 : 0 1 2 3
378
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 09/12 : 0 1 2 3
379
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 10/12 : 0 1 2 3
380
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 11/12 : 0 1 2 3
381
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1
382
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO P2P Chunksize set to 524288
383
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
384
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
385
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read
386
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
387
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/IPC/read
388
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
389
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read
390
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
391
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/IPC/read
392
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
393
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read
394
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
395
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/IPC/read
396
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
397
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read
398
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
399
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/IPC/read
400
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
401
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read
402
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
403
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/IPC/read
404
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
405
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
406
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read
407
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/IPC/read
408
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
409
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
410
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/IPC/read
411
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read
412
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
413
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
414
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/IPC/read
415
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read
416
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
417
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
418
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/IPC/read
419
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/IPC/read
420
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
421
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
422
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/IPC/read
423
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/IPC/read
424
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
425
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
426
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/IPC/read
427
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/IPC/read
428
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
429
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/IPC/read
430
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/IPC/read
431
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Connected all rings
432
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Connected all rings
433
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read
434
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Connected all rings
435
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Connected all rings
436
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read
437
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read
438
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read
439
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read
440
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read
441
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
442
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read
443
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/IPC/read
444
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/IPC/read
445
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/IPC/read
446
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/IPC/read
447
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read
448
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read
449
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC/read
450
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read
451
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read
452
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read
453
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read
454
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read
455
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read
456
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read
457
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/IPC/read
458
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read
459
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
460
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read
461
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read
462
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read
463
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/IPC/read
464
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/IPC/read
465
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/IPC/read
466
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/IPC/read
467
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/IPC/read
468
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/IPC/read
469
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/IPC/read
470
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/IPC/read
471
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Connected all trees
472
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
473
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
474
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Connected all trees
475
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
476
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Connected all trees
477
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
478
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
479
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
480
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Connected all trees
481
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
482
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
483
- dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xafe80e26c24ecacf - Init COMPLETE
484
- dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xafe80e26c24ecacf - Init COMPLETE
485
- dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xafe80e26c24ecacf - Init COMPLETE
486
- dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xafe80e26c24ecacf - Init COMPLETE
487
- [rank1]:[W1223 15:51:28.897884818 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
488
- [rank0]:[W1223 15:51:28.897922529 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
489
- [rank2]:[W1223 15:51:28.897929604 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
490
- [rank3]:[W1223 15:51:28.898014217 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
491
- module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_weight not loaded
492
- module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_bias not loaded
493
- module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.weight not loaded
494
- module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.bias not loaded
495
- module.sep_network.blocks.0.spk_att.layers.0.linear1.weight not loaded
496
- module.sep_network.blocks.0.spk_att.layers.0.linear1.bias not loaded
497
- module.sep_network.blocks.0.spk_att.layers.0.linear2.weight not loaded
498
- module.sep_network.blocks.0.spk_att.layers.0.linear2.bias not loaded
499
- module.sep_network.blocks.0.spk_att.layers.0.norm1.weight not loaded
500
- module.sep_network.blocks.0.spk_att.layers.0.norm1.bias not loaded
501
- module.sep_network.blocks.0.spk_att.layers.0.norm2.weight not loaded
502
- module.sep_network.blocks.0.spk_att.layers.0.norm2.bias not loaded
503
- module.sep_network.blocks.0.spk_norm.weight not loaded
504
- module.sep_network.blocks.0.spk_norm.bias not loaded
505
- module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_weight not loaded
506
- module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_bias not loaded
507
- module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.weight not loaded
508
- module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.bias not loaded
509
- module.sep_network.blocks.1.spk_att.layers.0.linear1.weight not loaded
510
- module.sep_network.blocks.1.spk_att.layers.0.linear1.bias not loaded
511
- module.sep_network.blocks.1.spk_att.layers.0.linear2.weight not loaded
512
- module.sep_network.blocks.1.spk_att.layers.0.linear2.bias not loaded
513
- module.sep_network.blocks.1.spk_att.layers.0.norm1.weight not loaded
514
- module.sep_network.blocks.1.spk_att.layers.0.norm1.bias not loaded
515
- module.sep_network.blocks.1.spk_att.layers.0.norm2.weight not loaded
516
- module.sep_network.blocks.1.spk_att.layers.0.norm2.bias not loaded
517
- module.sep_network.blocks.1.spk_norm.weight not loaded
518
- module.sep_network.blocks.1.spk_norm.bias not loaded
519
- module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_weight not loaded
520
- module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_bias not loaded
521
- module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.weight not loaded
522
- module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.bias not loaded
523
- module.sep_network.blocks.2.spk_att.layers.0.linear1.weight not loaded
524
- module.sep_network.blocks.2.spk_att.layers.0.linear1.bias not loaded
525
- module.sep_network.blocks.2.spk_att.layers.0.linear2.weight not loaded
526
- module.sep_network.blocks.2.spk_att.layers.0.linear2.bias not loaded
527
- module.sep_network.blocks.2.spk_att.layers.0.norm1.weight not loaded
528
- module.sep_network.blocks.2.spk_att.layers.0.norm1.bias not loaded
529
- module.sep_network.blocks.2.spk_att.layers.0.norm2.weight not loaded
530
- module.sep_network.blocks.2.spk_att.layers.0.norm2.bias not loaded
531
- module.sep_network.blocks.2.spk_norm.weight not loaded
532
- module.sep_network.blocks.2.spk_norm.bias not loaded
533
- module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_weight not loaded
534
- module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_bias not loaded
535
- module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.weight not loaded
536
- module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.bias not loaded
537
- module.sep_network.blocks.3.spk_att.layers.0.linear1.weight not loaded
538
- module.sep_network.blocks.3.spk_att.layers.0.linear1.bias not loaded
539
- module.sep_network.blocks.3.spk_att.layers.0.linear2.weight not loaded
540
- module.sep_network.blocks.3.spk_att.layers.0.linear2.bias not loaded
541
- module.sep_network.blocks.3.spk_att.layers.0.norm1.weight not loaded
542
- module.sep_network.blocks.3.spk_att.layers.0.norm1.bias not loaded
543
- module.sep_network.blocks.3.spk_att.layers.0.norm2.weight not loaded
544
- module.sep_network.blocks.3.spk_att.layers.0.norm2.bias not loaded
545
- module.sep_network.blocks.3.spk_norm.weight not loaded
546
- module.sep_network.blocks.3.spk_norm.bias not loaded
547
- module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_weight not loaded
548
- module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_bias not loaded
549
- module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.weight not loaded
550
- module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.bias not loaded
551
- module.sep_network.blocks.4.spk_att.layers.0.linear1.weight not loaded
552
- module.sep_network.blocks.4.spk_att.layers.0.linear1.bias not loaded
553
- module.sep_network.blocks.4.spk_att.layers.0.linear2.weight not loaded
554
- module.sep_network.blocks.4.spk_att.layers.0.linear2.bias not loaded
555
- module.sep_network.blocks.4.spk_att.layers.0.norm1.weight not loaded
556
- module.sep_network.blocks.4.spk_att.layers.0.norm1.bias not loaded
557
- module.sep_network.blocks.4.spk_att.layers.0.norm2.weight not loaded
558
- module.sep_network.blocks.4.spk_att.layers.0.norm2.bias not loaded
559
- module.sep_network.blocks.4.spk_norm.weight not loaded
560
- module.sep_network.blocks.4.spk_norm.bias not loaded
561
- module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_weight not loaded
562
- module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_bias not loaded
563
- module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.weight not loaded
564
- module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.bias not loaded
565
- module.sep_network.blocks.5.spk_att.layers.0.linear1.weight not loaded
566
- module.sep_network.blocks.5.spk_att.layers.0.linear1.bias not loaded
567
- module.sep_network.blocks.5.spk_att.layers.0.linear2.weight not loaded
568
- module.sep_network.blocks.5.spk_att.layers.0.linear2.bias not loaded
569
- module.sep_network.blocks.5.spk_att.layers.0.norm1.weight not loaded
570
- module.sep_network.blocks.5.spk_att.layers.0.norm1.bias not loaded
571
- module.sep_network.blocks.5.spk_att.layers.0.norm2.weight not loaded
572
- module.sep_network.blocks.5.spk_att.layers.0.norm2.bias not loaded
573
- module.sep_network.blocks.5.spk_norm.weight not loaded
574
- module.sep_network.blocks.5.spk_norm.bias not loaded
575
- Init model from checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk, and start new training
576
- [rank1]:[W1223 15:53:04.527524171 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
577
- [rank2]:[W1223 15:53:04.529782679 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
578
- [rank0]:[W1223 15:53:04.529824121 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
579
- [rank3]:[W1223 15:53:04.530453400 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
580
- Train Summary | End of Epoch 1 | Time 8862.50s | Train Loss -15.642
581
- Valid Summary | End of Epoch 1 | Time 1299.28s | Valid Loss -14.021
582
- Test Summary | End of Epoch 1 | Time 779.08s | Test Loss -13.495
583
- Fund new best model, dict saved
584
- Train Summary | End of Epoch 2 | Time 8868.78s | Train Loss -15.693
585
- Valid Summary | End of Epoch 2 | Time 1297.45s | Valid Loss -13.954
586
- Test Summary | End of Epoch 2 | Time 778.92s | Test Loss -13.553
587
- Train Summary | End of Epoch 3 | Time 8857.29s | Train Loss -15.673
588
- Valid Summary | End of Epoch 3 | Time 1297.92s | Valid Loss -13.922
589
- Test Summary | End of Epoch 3 | Time 778.84s | Test Loss -13.479
590
- Train Summary | End of Epoch 4 | Time 8841.24s | Train Loss -15.672
591
- Valid Summary | End of Epoch 4 | Time 1297.81s | Valid Loss -14.050
592
- Test Summary | End of Epoch 4 | Time 778.69s | Test Loss -13.570
593
- Fund new best model, dict saved
594
- Train Summary | End of Epoch 5 | Time 8842.92s | Train Loss -15.658
595
- Valid Summary | End of Epoch 5 | Time 1297.78s | Valid Loss -13.980
596
- Test Summary | End of Epoch 5 | Time 778.76s | Test Loss -13.672
597
- Train Summary | End of Epoch 6 | Time 8863.22s | Train Loss -15.625
598
- Valid Summary | End of Epoch 6 | Time 1297.63s | Valid Loss -14.042
599
- Test Summary | End of Epoch 6 | Time 778.93s | Test Loss -13.626
600
- Train Summary | End of Epoch 7 | Time 8874.34s | Train Loss -15.628
601
- Valid Summary | End of Epoch 7 | Time 1297.79s | Valid Loss -14.223
602
- Test Summary | End of Epoch 7 | Time 778.60s | Test Loss -13.703
603
- Fund new best model, dict saved
604
- Train Summary | End of Epoch 8 | Time 8878.31s | Train Loss -15.633
605
- Valid Summary | End of Epoch 8 | Time 1297.68s | Valid Loss -14.197
606
- Test Summary | End of Epoch 8 | Time 778.55s | Test Loss -13.741
607
- Train Summary | End of Epoch 9 | Time 8879.85s | Train Loss -15.644
608
- Valid Summary | End of Epoch 9 | Time 1297.92s | Valid Loss -14.241
609
- Test Summary | End of Epoch 9 | Time 778.76s | Test Loss -13.799
610
- Fund new best model, dict saved
611
- Train Summary | End of Epoch 10 | Time 8885.21s | Train Loss -15.669
612
- Valid Summary | End of Epoch 10 | Time 1297.27s | Valid Loss -14.229
613
- Test Summary | End of Epoch 10 | Time 778.51s | Test Loss -13.791
614
- Train Summary | End of Epoch 11 | Time 8884.38s | Train Loss -15.687
615
- Valid Summary | End of Epoch 11 | Time 1297.56s | Valid Loss -14.257
616
- Test Summary | End of Epoch 11 | Time 778.75s | Test Loss -13.766
617
- Fund new best model, dict saved
618
- Train Summary | End of Epoch 12 | Time 8861.05s | Train Loss -15.689
619
- Valid Summary | End of Epoch 12 | Time 1297.83s | Valid Loss -14.225
620
- Test Summary | End of Epoch 12 | Time 778.37s | Test Loss -13.740
621
- Train Summary | End of Epoch 13 | Time 8854.73s | Train Loss -15.715
622
- Valid Summary | End of Epoch 13 | Time 1297.07s | Valid Loss -14.338
623
- Test Summary | End of Epoch 13 | Time 778.98s | Test Loss -13.852
624
- Fund new best model, dict saved
625
- Train Summary | End of Epoch 14 | Time 8861.60s | Train Loss -15.720
626
- Valid Summary | End of Epoch 14 | Time 1297.61s | Valid Loss -14.320
627
- Test Summary | End of Epoch 14 | Time 778.73s | Test Loss -13.842
628
- Train Summary | End of Epoch 15 | Time 8867.45s | Train Loss -15.740
629
- Valid Summary | End of Epoch 15 | Time 1297.15s | Valid Loss -14.264
630
- Test Summary | End of Epoch 15 | Time 778.45s | Test Loss -13.831
631
- Train Summary | End of Epoch 16 | Time 8871.73s | Train Loss -15.750
632
- Valid Summary | End of Epoch 16 | Time 1297.58s | Valid Loss -14.301
633
- Test Summary | End of Epoch 16 | Time 778.59s | Test Loss -13.901
634
- Train Summary | End of Epoch 17 | Time 8874.12s | Train Loss -15.739
635
- Valid Summary | End of Epoch 17 | Time 1297.21s | Valid Loss -14.338
636
- Test Summary | End of Epoch 17 | Time 778.18s | Test Loss -13.897
637
- Fund new best model, dict saved
638
- Train Summary | End of Epoch 18 | Time 8866.66s | Train Loss -15.756
639
- Valid Summary | End of Epoch 18 | Time 1297.23s | Valid Loss -14.248
640
- Test Summary | End of Epoch 18 | Time 778.35s | Test Loss -13.823
641
- Train Summary | End of Epoch 19 | Time 8866.16s | Train Loss -15.784
642
- Valid Summary | End of Epoch 19 | Time 1296.46s | Valid Loss -14.318
643
- Test Summary | End of Epoch 19 | Time 778.56s | Test Loss -13.898
644
- Train Summary | End of Epoch 20 | Time 8861.71s | Train Loss -15.780
645
- Valid Summary | End of Epoch 20 | Time 1297.90s | Valid Loss -14.337
646
- Test Summary | End of Epoch 20 | Time 778.23s | Test Loss -13.892
647
- Train Summary | End of Epoch 21 | Time 8861.37s | Train Loss -15.811
648
- Valid Summary | End of Epoch 21 | Time 1296.68s | Valid Loss -14.383
649
- Test Summary | End of Epoch 21 | Time 778.57s | Test Loss -13.913
650
- Fund new best model, dict saved
651
- Train Summary | End of Epoch 22 | Time 8868.58s | Train Loss -15.815
652
- Valid Summary | End of Epoch 22 | Time 1297.38s | Valid Loss -14.341
653
- Test Summary | End of Epoch 22 | Time 778.10s | Test Loss -13.883
654
- Train Summary | End of Epoch 23 | Time 8876.67s | Train Loss -15.816
655
- Valid Summary | End of Epoch 23 | Time 1297.54s | Valid Loss -14.315
656
- Test Summary | End of Epoch 23 | Time 778.88s | Test Loss -13.974
657
- Train Summary | End of Epoch 24 | Time 8873.42s | Train Loss -15.821
658
- Valid Summary | End of Epoch 24 | Time 1297.77s | Valid Loss -14.415
659
- Test Summary | End of Epoch 24 | Time 778.40s | Test Loss -13.877
660
- Fund new best model, dict saved
661
- Train Summary | End of Epoch 25 | Time 8853.35s | Train Loss -15.834
662
- Valid Summary | End of Epoch 25 | Time 1297.67s | Valid Loss -14.356
663
- Test Summary | End of Epoch 25 | Time 778.28s | Test Loss -13.917
664
- Train Summary | End of Epoch 26 | Time 8879.37s | Train Loss -15.844
665
- Valid Summary | End of Epoch 26 | Time 1297.45s | Valid Loss -14.384
666
- Test Summary | End of Epoch 26 | Time 778.64s | Test Loss -13.973
667
- Train Summary | End of Epoch 27 | Time 8872.84s | Train Loss -15.854
668
- Valid Summary | End of Epoch 27 | Time 1297.92s | Valid Loss -14.402
669
- Test Summary | End of Epoch 27 | Time 779.00s | Test Loss -14.017
670
- Train Summary | End of Epoch 28 | Time 8875.05s | Train Loss -15.856
671
- Valid Summary | End of Epoch 28 | Time 1297.66s | Valid Loss -14.351
672
- Test Summary | End of Epoch 28 | Time 778.75s | Test Loss -13.939
673
- Train Summary | End of Epoch 29 | Time 8869.04s | Train Loss -15.884
674
- Valid Summary | End of Epoch 29 | Time 1297.87s | Valid Loss -14.430
675
- Test Summary | End of Epoch 29 | Time 778.77s | Test Loss -13.954
676
- Fund new best model, dict saved
677
- Train Summary | End of Epoch 30 | Time 8864.22s | Train Loss -15.890
678
- Valid Summary | End of Epoch 30 | Time 1297.79s | Valid Loss -14.398
679
- Test Summary | End of Epoch 30 | Time 778.88s | Test Loss -14.004
680
- Train Summary | End of Epoch 31 | Time 8862.20s | Train Loss -15.906
681
- Valid Summary | End of Epoch 31 | Time 1297.63s | Valid Loss -14.396
682
- Test Summary | End of Epoch 31 | Time 778.78s | Test Loss -13.985
683
- Train Summary | End of Epoch 32 | Time 8872.91s | Train Loss -15.911
684
- Valid Summary | End of Epoch 32 | Time 1297.77s | Valid Loss -14.425
685
- Test Summary | End of Epoch 32 | Time 778.33s | Test Loss -13.933
686
- Train Summary | End of Epoch 33 | Time 8867.16s | Train Loss -15.911
687
- Valid Summary | End of Epoch 33 | Time 1297.78s | Valid Loss -14.423
688
- Test Summary | End of Epoch 33 | Time 778.53s | Test Loss -13.952
689
- Train Summary | End of Epoch 34 | Time 8863.09s | Train Loss -15.916
690
- Valid Summary | End of Epoch 34 | Time 1297.82s | Valid Loss -14.470
691
- Test Summary | End of Epoch 34 | Time 778.80s | Test Loss -13.885
692
- Fund new best model, dict saved
693
- Train Summary | End of Epoch 35 | Time 8865.77s | Train Loss -15.936
694
- Valid Summary | End of Epoch 35 | Time 1297.80s | Valid Loss -14.486
695
- Test Summary | End of Epoch 35 | Time 778.59s | Test Loss -13.990
696
- Fund new best model, dict saved
697
- Train Summary | End of Epoch 36 | Time 8864.53s | Train Loss -15.931
698
- Valid Summary | End of Epoch 36 | Time 1297.82s | Valid Loss -14.413
699
- Test Summary | End of Epoch 36 | Time 778.79s | Test Loss -13.959
700
- Train Summary | End of Epoch 37 | Time 8860.11s | Train Loss -15.934
701
- Valid Summary | End of Epoch 37 | Time 1298.28s | Valid Loss -14.503
702
- Test Summary | End of Epoch 37 | Time 778.33s | Test Loss -13.980
703
- Fund new best model, dict saved
704
- Train Summary | End of Epoch 38 | Time 8855.04s | Train Loss -15.957
705
- Valid Summary | End of Epoch 38 | Time 1298.32s | Valid Loss -14.387
706
- Test Summary | End of Epoch 38 | Time 778.98s | Test Loss -13.980
707
- Train Summary | End of Epoch 39 | Time 8862.85s | Train Loss -15.964
708
- Valid Summary | End of Epoch 39 | Time 1297.98s | Valid Loss -14.462
709
- Test Summary | End of Epoch 39 | Time 779.50s | Test Loss -14.014
710
- Train Summary | End of Epoch 40 | Time 8861.48s | Train Loss -15.956
711
- Valid Summary | End of Epoch 40 | Time 1297.55s | Valid Loss -14.344
712
- Test Summary | End of Epoch 40 | Time 778.53s | Test Loss -13.998
713
- Train Summary | End of Epoch 41 | Time 8861.38s | Train Loss -15.955
714
- Valid Summary | End of Epoch 41 | Time 1297.55s | Valid Loss -14.369
715
- Test Summary | End of Epoch 41 | Time 779.16s | Test Loss -13.946
716
- Train Summary | End of Epoch 42 | Time 8853.68s | Train Loss -15.969
717
- Valid Summary | End of Epoch 42 | Time 1297.52s | Valid Loss -14.472
718
- Test Summary | End of Epoch 42 | Time 778.78s | Test Loss -14.000
719
- reload weights and optimizer from last best checkpoint
720
- Learning rate adjusted to: 0.000255
721
- Train Summary | End of Epoch 43 | Time 8855.22s | Train Loss -16.093
722
- Valid Summary | End of Epoch 43 | Time 1297.53s | Valid Loss -14.584
723
- Test Summary | End of Epoch 43 | Time 778.81s | Test Loss -14.144
724
- Fund new best model, dict saved
725
- Train Summary | End of Epoch 44 | Time 8839.27s | Train Loss -16.155
726
- Valid Summary | End of Epoch 44 | Time 1297.71s | Valid Loss -14.520
727
- Test Summary | End of Epoch 44 | Time 778.86s | Test Loss -14.054
728
- Train Summary | End of Epoch 45 | Time 8840.93s | Train Loss -16.158
729
- Valid Summary | End of Epoch 45 | Time 1298.07s | Valid Loss -14.566
730
- Test Summary | End of Epoch 45 | Time 778.17s | Test Loss -14.039
731
- Train Summary | End of Epoch 46 | Time 8846.57s | Train Loss -16.194
732
- Valid Summary | End of Epoch 46 | Time 1297.09s | Valid Loss -14.602
733
- Test Summary | End of Epoch 46 | Time 779.00s | Test Loss -14.058
734
- Fund new best model, dict saved
735
- Train Summary | End of Epoch 47 | Time 8848.27s | Train Loss -16.203
736
- Valid Summary | End of Epoch 47 | Time 1297.44s | Valid Loss -14.561
737
- Test Summary | End of Epoch 47 | Time 778.99s | Test Loss -14.038
738
- Train Summary | End of Epoch 48 | Time 8855.13s | Train Loss -16.208
739
- Valid Summary | End of Epoch 48 | Time 1298.17s | Valid Loss -14.567
740
- Test Summary | End of Epoch 48 | Time 779.06s | Test Loss -14.059
741
- Train Summary | End of Epoch 49 | Time 8860.44s | Train Loss -16.230
742
- Valid Summary | End of Epoch 49 | Time 1297.85s | Valid Loss -14.600
743
- Test Summary | End of Epoch 49 | Time 778.57s | Test Loss -14.109
744
- Train Summary | End of Epoch 50 | Time 8849.69s | Train Loss -16.234
745
- Valid Summary | End of Epoch 50 | Time 1297.78s | Valid Loss -14.520
746
- Test Summary | End of Epoch 50 | Time 778.36s | Test Loss -14.044
747
- Train Summary | End of Epoch 51 | Time 8845.04s | Train Loss -16.262
748
- Valid Summary | End of Epoch 51 | Time 1296.91s | Valid Loss -14.605
749
- Test Summary | End of Epoch 51 | Time 778.81s | Test Loss -14.112
750
- Fund new best model, dict saved
751
- Train Summary | End of Epoch 52 | Time 8846.18s | Train Loss -16.262
752
- Valid Summary | End of Epoch 52 | Time 1297.45s | Valid Loss -14.570
753
- Test Summary | End of Epoch 52 | Time 778.92s | Test Loss -14.142
754
- Train Summary | End of Epoch 53 | Time 8848.28s | Train Loss -16.273
755
- Valid Summary | End of Epoch 53 | Time 1297.74s | Valid Loss -14.576
756
- Test Summary | End of Epoch 53 | Time 778.71s | Test Loss -14.089
757
- Train Summary | End of Epoch 54 | Time 8827.56s | Train Loss -16.303
758
- Valid Summary | End of Epoch 54 | Time 1298.30s | Valid Loss -14.529
759
- Test Summary | End of Epoch 54 | Time 779.41s | Test Loss -14.063
760
- Train Summary | End of Epoch 55 | Time 8853.03s | Train Loss -16.310
761
- Valid Summary | End of Epoch 55 | Time 1297.30s | Valid Loss -14.512
762
- Test Summary | End of Epoch 55 | Time 778.55s | Test Loss -14.077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-31(09:48:03).txt DELETED
@@ -1,632 +0,0 @@
1
- ## Config file
2
-
3
- # Log
4
- seed: 777
5
- use_cuda: 1 # 1 for True, 0 for False
6
-
7
- # dataset
8
- speaker_no: 2
9
- mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
10
- audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
11
- reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
12
- audio_sr: 16000
13
- ref_sr: 25
14
-
15
- # dataloader
16
- num_workers: 2
17
- batch_size: 1 # 4-GPU training with a total effective batch size of 8
18
- accu_grad: 1
19
- effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
20
- max_length: 3 # truncate the utterances in dataloader, in seconds
21
-
22
- # network settings
23
- init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
24
- causal: 0 # 1 for True, 0 for False
25
- network_reference:
26
- cue: lip # lip or speech or gesture or EEG
27
- backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
28
- emb_size: 256 # resnet18:256
29
- network_audio:
30
- backbone: av_tfgridnet_att_ss
31
- n_fft: 256
32
- stride: 128
33
- window: "hann"
34
- use_builtin_complex: False
35
- n_srcs: 1
36
- n_imics: 1
37
- n_layers: 6
38
- lstm_hidden_units: 192
39
- attn_n_head: 4
40
- attn_qk_output_channel: 4
41
- emb_dim: 48
42
- emb_ks: 4
43
- emb_hs: 1
44
- activation: "prelu"
45
-
46
- # optimizer
47
- spk_att_dropout: 1 # 0 for always use speaker attention
48
- loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
49
- lr_warmup: 1
50
- init_learning_rate: 0.0005
51
- max_epoch: 150
52
- clip_grad_norm: 5
53
- W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779]
54
- W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] *****************************************
55
- W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
56
- W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] *****************************************
57
- [W1231 09:49:05.069551764 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
58
- [W1231 09:49:05.069566273 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
59
- [W1231 09:49:05.070374617 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
60
- [W1231 09:49:05.070402090 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
61
- [W1231 09:49:05.069566736 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
62
- [W1231 09:49:05.070424006 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
63
- [W1231 09:49:05.069610510 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
64
- [W1231 09:49:05.070456802 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
65
- started on checkpoints/log_2024-12-23(15:50:05)
66
-
67
- namespace(accu_grad=1, audio_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/', audio_sr=16000, batch_size=1, causal=0, checkpoint_dir='checkpoints/log_2024-12-23(15:50:05)', clip_grad_norm=5.0, config=[<yamlargparse.Path object at 0x7f87241d2c40>], device=device(type='cuda'), distributed=True, effec_batch_size=2, evaluate_only=0, init_from='checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk', init_learning_rate=0.0005, local_rank=0, loss_type='ss_sisdr', lr_warmup=1, max_epoch=150, max_length=3, mix_lst_path='./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv', network_audio=namespace(activation='prelu', attn_n_head=4, attn_qk_output_channel=4, backbone='av_tfgridnet_att_ss', emb_dim=48, emb_hs=1, emb_ks=4, lstm_hidden_units=192, n_fft=256, n_imics=1, n_layers=6, n_srcs=1, stride=128, use_builtin_complex=False, window='hann'), network_reference=namespace(backbone='resnet18', cue='lip', emb_size=256), num_workers=2, ref_sr=25, reference_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/', seed=777, speaker_no=2, spk_att_dropout=1, train_from_last_checkpoint=1, use_cuda=1, world_size=4)
68
- network_wrapper(
69
- (sep_network): av_TFGridNetV3_att_ss(
70
- (enc): STFTEncoder(
71
- (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
72
- )
73
- (dec): STFTDecoder(
74
- (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
75
- )
76
- (conv): Sequential(
77
- (0): Conv2d(2, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
78
- (1): GroupNorm(1, 48, eps=1e-05, affine=True)
79
- )
80
- (blocks): ModuleList(
81
- (0-5): 6 x GridNetV3Block(
82
- (intra_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
83
- (intra_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
84
- (intra_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
85
- (inter_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
86
- (inter_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
87
- (inter_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
88
- (attn_conv_Q): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
89
- (attn_norm_Q): AllHeadPReLULayerNormalization4DC(
90
- (act): PReLU(num_parameters=4)
91
- )
92
- (attn_conv_K): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
93
- (attn_norm_K): AllHeadPReLULayerNormalization4DC(
94
- (act): PReLU(num_parameters=4)
95
- )
96
- (attn_conv_V): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
97
- (attn_norm_V): AllHeadPReLULayerNormalization4DC(
98
- (act): PReLU(num_parameters=4)
99
- )
100
- (attn_concat_proj): Sequential(
101
- (0): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
102
- (1): PReLU(num_parameters=1)
103
- (2): LayerNormalization()
104
- )
105
- (spk_att): TransformerEncoder(
106
- (layers): ModuleList(
107
- (0): TransformerEncoderLayer(
108
- (self_attn): MultiheadAttention(
109
- (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
110
- )
111
- (linear1): Linear(in_features=48, out_features=192, bias=True)
112
- (dropout): Dropout(p=0.1, inplace=False)
113
- (linear2): Linear(in_features=192, out_features=48, bias=True)
114
- (norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
115
- (norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
116
- (dropout1): Dropout(p=0.1, inplace=False)
117
- (dropout2): Dropout(p=0.1, inplace=False)
118
- )
119
- )
120
- )
121
- (spk_norm): GroupNorm(1, 48, eps=1e-08, affine=True)
122
- )
123
- )
124
- (deconv): ConvTranspose2d(48, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
125
- (av_conv): ModuleList(
126
- (0-5): 6 x Linear(in_features=304, out_features=48, bias=True)
127
- )
128
- )
129
- (ref_encoder): Visual_encoder(
130
- (v_frontend): VisualFrontend(
131
- (frontend3D): Sequential(
132
- (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
133
- (1): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
134
- (2): ReLU()
135
- (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
136
- )
137
- (resnet): ResNet(
138
- (layer1): ResNetLayer(
139
- (conv1a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
140
- (bn1a): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
141
- (conv2a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
142
- (downsample): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
143
- (outbna): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
144
- (conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
145
- (bn1b): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
146
- (conv2b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
147
- (outbnb): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
148
- )
149
- (layer2): ResNetLayer(
150
- (conv1a): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
151
- (bn1a): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
152
- (conv2a): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
153
- (downsample): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
154
- (outbna): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
155
- (conv1b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
156
- (bn1b): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
157
- (conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
158
- (outbnb): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
159
- )
160
- (layer3): ResNetLayer(
161
- (conv1a): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
162
- (bn1a): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
163
- (conv2a): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
164
- (downsample): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
165
- (outbna): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
166
- (conv1b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
167
- (bn1b): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
168
- (conv2b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
169
- (outbnb): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
170
- )
171
- (layer4): ResNetLayer(
172
- (conv1a): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
173
- (bn1a): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
174
- (conv2a): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
175
- (downsample): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
176
- (outbna): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
177
- (conv1b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
178
- (bn1b): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
179
- (conv2b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
180
- (outbnb): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
181
- )
182
- (avgpool): AvgPool2d(kernel_size=(4, 4), stride=(1, 1), padding=0)
183
- )
184
- )
185
- (v_ds): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
186
- (visual_conv): Sequential(
187
- (0): VisualConv1D(
188
- (relu_0): ReLU()
189
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
190
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
191
- (relu): ReLU()
192
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
193
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
194
- (prelu): PReLU(num_parameters=1)
195
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
196
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
197
- )
198
- (1): VisualConv1D(
199
- (relu_0): ReLU()
200
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
201
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
202
- (relu): ReLU()
203
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
204
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
205
- (prelu): PReLU(num_parameters=1)
206
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
207
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
208
- )
209
- (2): VisualConv1D(
210
- (relu_0): ReLU()
211
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
212
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
213
- (relu): ReLU()
214
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
215
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
216
- (prelu): PReLU(num_parameters=1)
217
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
218
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
219
- )
220
- (3): VisualConv1D(
221
- (relu_0): ReLU()
222
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
223
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
224
- (relu): ReLU()
225
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
226
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
227
- (prelu): PReLU(num_parameters=1)
228
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
229
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
230
- )
231
- (4): VisualConv1D(
232
- (relu_0): ReLU()
233
- (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
234
- (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
235
- (relu): ReLU()
236
- (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
237
- (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
238
- (prelu): PReLU(num_parameters=1)
239
- (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
240
- (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
241
- )
242
- )
243
- )
244
- )
245
-
246
- Total number of parameters: 20950309
247
-
248
-
249
- Total number of trainable parameters: 9765221
250
-
251
- dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
252
- dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
253
- dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
254
- dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
255
- dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO cudaDriverVersion 11040
256
- dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO cudaDriverVersion 11040
257
- dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO cudaDriverVersion 11040
258
- dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO cudaDriverVersion 11040
259
- NCCL version 2.20.5+cuda11.8
260
- dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
261
- dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
262
- dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
263
- dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
264
- dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
265
- dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
266
- dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
267
- dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
268
- dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
269
- dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
270
- dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
271
- dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
272
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
273
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
274
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
275
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
276
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_IB_HCA set to mlx5
277
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_IB_HCA set to mlx5
278
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_IB_HCA set to mlx5
279
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_IB_HCA set to mlx5
280
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
281
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
282
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
283
- libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
284
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
285
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
286
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
287
- libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
288
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
289
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
290
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
291
- libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
292
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
293
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
294
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
295
- libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
296
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
297
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
298
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
299
- libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
300
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
301
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
302
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
303
- libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
304
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
305
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
306
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
307
- libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
308
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
309
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
310
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
311
- libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
312
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
313
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
314
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
315
- libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
316
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
317
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
318
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
319
- libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
320
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
321
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
322
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
323
- libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
324
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
325
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
326
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
327
- libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
328
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
329
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
330
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
331
- libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
332
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
333
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
334
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
335
- libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
336
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
337
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Using non-device net plugin version 0
338
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Using network IB
339
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
340
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
341
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Using non-device net plugin version 0
342
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Using network IB
343
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Using non-device net plugin version 0
344
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Using network IB
345
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
346
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Using non-device net plugin version 0
347
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Using network IB
348
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xf279881ae65b16f2 - Init START
349
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xf279881ae65b16f2 - Init START
350
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xf279881ae65b16f2 - Init START
351
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xf279881ae65b16f2 - Init START
352
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
353
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
354
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
355
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
356
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
357
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
358
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
359
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
360
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
361
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
362
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
363
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 00/12 : 0 1 2 3
364
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0
365
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 01/12 : 0 1 2 3
366
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO P2P Chunksize set to 524288
367
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2
368
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 02/12 : 0 1 2 3
369
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1
370
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO P2P Chunksize set to 524288
371
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO P2P Chunksize set to 524288
372
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 03/12 : 0 1 2 3
373
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 04/12 : 0 1 2 3
374
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 05/12 : 0 1 2 3
375
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 06/12 : 0 1 2 3
376
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 07/12 : 0 1 2 3
377
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 08/12 : 0 1 2 3
378
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 09/12 : 0 1 2 3
379
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 10/12 : 0 1 2 3
380
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 11/12 : 0 1 2 3
381
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1
382
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO P2P Chunksize set to 524288
383
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
384
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/IPC/read
385
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
386
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
387
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read
388
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/IPC/read
389
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
390
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
391
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read
392
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/IPC/read
393
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
394
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
395
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read
396
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/IPC/read
397
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
398
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
399
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read
400
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/IPC/read
401
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
402
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
403
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read
404
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/IPC/read
405
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
406
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
407
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read
408
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/IPC/read
409
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
410
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
411
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read
412
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/IPC/read
413
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
414
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
415
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read
416
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/IPC/read
417
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
418
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
419
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/IPC/read
420
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/IPC/read
421
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
422
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
423
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/IPC/read
424
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/IPC/read
425
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
426
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
427
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/IPC/read
428
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/IPC/read
429
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
430
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/IPC/read
431
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Connected all rings
432
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Connected all rings
433
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read
434
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Connected all rings
435
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Connected all rings
436
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read
437
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read
438
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read
439
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read
440
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read
441
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
442
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read
443
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/IPC/read
444
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/IPC/read
445
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/IPC/read
446
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/IPC/read
447
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read
448
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read
449
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read
450
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC/read
451
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read
452
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read
453
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read
454
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read
455
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read
456
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read
457
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read
458
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/IPC/read
459
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read
460
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
461
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read
462
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read
463
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/IPC/read
464
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/IPC/read
465
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/IPC/read
466
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/IPC/read
467
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/IPC/read
468
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/IPC/read
469
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/IPC/read
470
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/IPC/read
471
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Connected all trees
472
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
473
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
474
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Connected all trees
475
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
476
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
477
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Connected all trees
478
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Connected all trees
479
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
480
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
481
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
482
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
483
- dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xf279881ae65b16f2 - Init COMPLETE
484
- dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xf279881ae65b16f2 - Init COMPLETE
485
- dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xf279881ae65b16f2 - Init COMPLETE
486
- dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xf279881ae65b16f2 - Init COMPLETE
487
- [rank0]:[W1231 09:49:44.279868977 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
488
- [rank2]:[W1231 09:49:44.279870560 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
489
- [rank3]:[W1231 09:49:44.280285689 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
490
- [rank1]:[W1231 09:49:44.280413742 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
491
- Resume training from epoch: 56
492
- [rank1]:[W1231 09:51:26.949995860 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
493
- [rank2]:[W1231 09:51:26.954996096 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
494
- [rank0]:[W1231 09:51:26.955118250 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
495
- [rank3]:[W1231 09:51:26.957005274 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
496
- Train Summary | End of Epoch 56 | Time 8839.06s | Train Loss -16.312
497
- Valid Summary | End of Epoch 56 | Time 1303.07s | Valid Loss -14.529
498
- Test Summary | End of Epoch 56 | Time 781.34s | Test Loss -14.061
499
- reload weights and optimizer from last best checkpoint
500
- Learning rate adjusted to: 0.000128
501
- Learning rate is: 0.000128
502
- Train Summary | End of Epoch 57 | Time 8856.34s | Train Loss -16.327
503
- Valid Summary | End of Epoch 57 | Time 1301.12s | Valid Loss -14.620
504
- Test Summary | End of Epoch 57 | Time 781.09s | Test Loss -14.115
505
- Learning rate is: 0.000128
506
- Fund new best model, dict saved
507
- Train Summary | End of Epoch 58 | Time 8854.23s | Train Loss -16.346
508
- Valid Summary | End of Epoch 58 | Time 1301.64s | Valid Loss -14.590
509
- Test Summary | End of Epoch 58 | Time 780.97s | Test Loss -14.113
510
- Learning rate is: 0.000128
511
- Train Summary | End of Epoch 59 | Time 8854.80s | Train Loss -16.353
512
- Valid Summary | End of Epoch 59 | Time 1301.66s | Valid Loss -14.589
513
- Test Summary | End of Epoch 59 | Time 781.22s | Test Loss -14.091
514
- Learning rate is: 0.000128
515
- Train Summary | End of Epoch 60 | Time 8863.27s | Train Loss -16.371
516
- Valid Summary | End of Epoch 60 | Time 1301.48s | Valid Loss -14.578
517
- Test Summary | End of Epoch 60 | Time 781.07s | Test Loss -14.179
518
- Learning rate is: 0.000128
519
- Train Summary | End of Epoch 61 | Time 8878.44s | Train Loss -16.381
520
- Valid Summary | End of Epoch 61 | Time 1301.09s | Valid Loss -14.561
521
- Test Summary | End of Epoch 61 | Time 781.19s | Test Loss -14.117
522
- Learning rate is: 0.000128
523
- Train Summary | End of Epoch 62 | Time 8874.16s | Train Loss -16.381
524
- Valid Summary | End of Epoch 62 | Time 1301.83s | Valid Loss -14.627
525
- Test Summary | End of Epoch 62 | Time 781.20s | Test Loss -14.076
526
- Learning rate is: 0.000128
527
- Fund new best model, dict saved
528
- Train Summary | End of Epoch 63 | Time 8872.86s | Train Loss -16.386
529
- Valid Summary | End of Epoch 63 | Time 1301.73s | Valid Loss -14.573
530
- Test Summary | End of Epoch 63 | Time 781.22s | Test Loss -14.034
531
- Learning rate is: 0.000128
532
- Train Summary | End of Epoch 64 | Time 8872.70s | Train Loss -16.407
533
- Valid Summary | End of Epoch 64 | Time 1302.35s | Valid Loss -14.552
534
- Test Summary | End of Epoch 64 | Time 781.45s | Test Loss -14.107
535
- Learning rate is: 0.000128
536
- Train Summary | End of Epoch 65 | Time 8871.09s | Train Loss -16.411
537
- Valid Summary | End of Epoch 65 | Time 1301.10s | Valid Loss -14.493
538
- Test Summary | End of Epoch 65 | Time 781.20s | Test Loss -14.088
539
- Learning rate is: 0.000128
540
- Train Summary | End of Epoch 66 | Time 8868.35s | Train Loss -16.424
541
- Valid Summary | End of Epoch 66 | Time 1301.48s | Valid Loss -14.576
542
- Test Summary | End of Epoch 66 | Time 780.88s | Test Loss -14.107
543
- Learning rate is: 0.000128
544
- Train Summary | End of Epoch 67 | Time 8872.21s | Train Loss -16.428
545
- Valid Summary | End of Epoch 67 | Time 1302.63s | Valid Loss -14.600
546
- Test Summary | End of Epoch 67 | Time 780.67s | Test Loss -14.124
547
- reload weights and optimizer from last best checkpoint
548
- Learning rate adjusted to: 0.000064
549
- Learning rate is: 0.000064
550
- Train Summary | End of Epoch 68 | Time 8866.46s | Train Loss -16.420
551
- Valid Summary | End of Epoch 68 | Time 1301.06s | Valid Loss -14.629
552
- Test Summary | End of Epoch 68 | Time 781.03s | Test Loss -14.045
553
- Learning rate is: 0.000064
554
- Fund new best model, dict saved
555
- Train Summary | End of Epoch 69 | Time 8869.51s | Train Loss -16.423
556
- Valid Summary | End of Epoch 69 | Time 1301.42s | Valid Loss -14.610
557
- Test Summary | End of Epoch 69 | Time 781.07s | Test Loss -14.122
558
- Learning rate is: 0.000064
559
- Train Summary | End of Epoch 70 | Time 8870.52s | Train Loss -16.439
560
- Valid Summary | End of Epoch 70 | Time 1301.36s | Valid Loss -14.595
561
- Test Summary | End of Epoch 70 | Time 781.50s | Test Loss -14.017
562
- Learning rate is: 0.000064
563
- Train Summary | End of Epoch 71 | Time 8872.92s | Train Loss -16.443
564
- Valid Summary | End of Epoch 71 | Time 1302.03s | Valid Loss -14.576
565
- Test Summary | End of Epoch 71 | Time 781.03s | Test Loss -14.149
566
- Learning rate is: 0.000064
567
- Train Summary | End of Epoch 72 | Time 8868.30s | Train Loss -16.452
568
- Valid Summary | End of Epoch 72 | Time 1300.98s | Valid Loss -14.580
569
- Test Summary | End of Epoch 72 | Time 780.55s | Test Loss -14.096
570
- Learning rate is: 0.000064
571
- Train Summary | End of Epoch 73 | Time 8867.89s | Train Loss -16.449
572
- Valid Summary | End of Epoch 73 | Time 1301.70s | Valid Loss -14.556
573
- Test Summary | End of Epoch 73 | Time 780.97s | Test Loss -14.083
574
- reload weights and optimizer from last best checkpoint
575
- Learning rate adjusted to: 0.000032
576
- Learning rate is: 0.000032
577
- Train Summary | End of Epoch 74 | Time 8867.13s | Train Loss -16.444
578
- Valid Summary | End of Epoch 74 | Time 1301.02s | Valid Loss -14.603
579
- Test Summary | End of Epoch 74 | Time 781.03s | Test Loss -14.174
580
- Learning rate is: 0.000032
581
- Train Summary | End of Epoch 75 | Time 8848.91s | Train Loss -16.446
582
- Valid Summary | End of Epoch 75 | Time 1302.16s | Valid Loss -14.586
583
- Test Summary | End of Epoch 75 | Time 781.19s | Test Loss -14.069
584
- Learning rate is: 0.000032
585
- Train Summary | End of Epoch 76 | Time 8854.80s | Train Loss -16.460
586
- Valid Summary | End of Epoch 76 | Time 1300.81s | Valid Loss -14.595
587
- Test Summary | End of Epoch 76 | Time 781.01s | Test Loss -14.063
588
- Learning rate is: 0.000032
589
- Train Summary | End of Epoch 77 | Time 8863.14s | Train Loss -16.455
590
- Valid Summary | End of Epoch 77 | Time 1300.81s | Valid Loss -14.602
591
- Test Summary | End of Epoch 77 | Time 780.15s | Test Loss -14.056
592
- Learning rate is: 0.000032
593
- Train Summary | End of Epoch 78 | Time 8867.28s | Train Loss -16.456
594
- Valid Summary | End of Epoch 78 | Time 1300.89s | Valid Loss -14.569
595
- Test Summary | End of Epoch 78 | Time 780.83s | Test Loss -14.124
596
- No imporvement for 10 epochs, early stopping.
597
- Start evaluation
598
- Avg SISNR:i tensor([13.8622], device='cuda:0')
599
- Avg SNRi: 14.180312131890753
600
- Avg PESQi: 1.4813443135023117
601
- Avg STOIi: 0.27852386519253225
602
- # 2spk
603
- Avg SISNR:i tensor([14.4913], device='cuda:0')
604
- Avg SNRi: 14.766350478813006
605
- Avg PESQi: 1.566092278043429
606
- Avg STOIi: 0.2884288031001122
607
-
608
-
609
- # lrs2
610
- # 1spk
611
- Avg SISNR:i tensor([14.3695], device='cuda:0')
612
- Avg SNRi: 14.712913455144225
613
- Avg PESQi: 1.4528894378741581
614
- Avg STOIi: 0.27694796861127857
615
- # 2spk
616
- Avg SISNR:i tensor([15.0343], device='cuda:0')
617
- Avg SNRi: 15.362836239165572
618
- Avg PESQi: 1.5522074782848359
619
- Avg STOIi: 0.28548176812938764
620
-
621
-
622
- # lrs3
623
- # 1spk
624
- Avg SISNR:i tensor([16.2440], device='cuda:0')
625
- Avg SNRi: 16.48747834483003
626
- Avg PESQi: 1.7686368883450827
627
- Avg STOIi: 0.2731847089622733
628
- # 2spk
629
- Avg SISNR:i tensor([16.9063], device='cuda:0')
630
- Avg SNRi: 17.114254503063478
631
- Avg PESQi: 1.8624962186813354
632
- Avg STOIi: 0.279682808481817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1734940287.dlcdanw1zq2cucwx-master-0.28.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e29766adb4a58b0df338860869285d8dbc93e0fa0c642ade14dd326772d9706
3
- size 8228
 
 
 
 
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1735609783.dlc10xm9l399lwkq-master-0.26.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e7ae671ffc26ca42f44f0ce092ff6f6fb160e081580b2a680bfbe4e5761751f
3
- size 3344