Delete checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/config.yaml +0 -59
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_best_checkpoint.pt +0 -3
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_checkpoint.pt +0 -3
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-23(15:50:05).txt +0 -762
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-31(09:48:03).txt +0 -632
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1734940287.dlcdanw1zq2cucwx-master-0.28.0 +0 -3
- checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1735609783.dlc10xm9l399lwkq-master-0.26.0 +0 -3
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/config.yaml
DELETED
@@ -1,59 +0,0 @@
-## Config file
-
-# Log
-seed: 777
-use_cuda: 1 # 1 for True, 0 for False
-
-# dataset
-speaker_no: 2
-mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
-audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
-reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
-# mix_lst_path: ./data/LRS2/mixture_data_list_2mix.csv
-# audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/audio_clean/
-# reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/mvlrs_v1/
-# mix_lst_path: ./data/LRS3/mixture_data_list_2mix.csv
-# audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/audio_clean/
-# reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/orig/
-audio_sr: 16000
-ref_sr: 25
-
-# dataloader
-num_workers: 2
-batch_size: 1 # 4-GPU training with a total effective batch size of 8
-accu_grad: 1
-effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
-max_length: 3 # truncate the utterances in dataloader, in seconds
-
-# network settings
-init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
-causal: 0 # 1 for True, 0 for False
-network_reference:
-  cue: lip # lip or speech or gesture or EEG
-  backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
-  emb_size: 256 # resnet18:256
-network_audio:
-  backbone: av_tfgridnet_isam
-  n_fft: 256
-  stride: 128
-  window: "hann"
-  use_builtin_complex: False
-  n_srcs: 1
-  n_imics: 1
-  n_layers: 6
-  lstm_hidden_units: 192
-  attn_n_head: 4
-  attn_qk_output_channel: 4
-  emb_dim: 48
-  emb_ks: 4
-  emb_hs: 1
-  activation: "prelu"
-  isam: 1
-
-# optimizer
-spk_att_dropout: 1 # 0 for always use speaker attention
-loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
-lr_warmup: 1
-init_learning_rate: 0.0005
-max_epoch: 150
-clip_grad_norm: 5
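The deleted config ties `batch_size`, `accu_grad`, and `effec_batch_size` to the inline comment about "4-GPU training with a total effective batch size of 8". A minimal sketch of how these fields plausibly combine, assuming the usual gradient-accumulation arithmetic rather than the repo's exact training code:

```python
# Sketch (assumption): how the dataloader settings above could yield the
# "total effective batch size of 8" mentioned in the batch_size comment.
batch_size = 1        # samples per forward pass, per GPU
effec_batch_size = 2  # target samples per optimizer step, per GPU (used when accu_grad == 1)
world_size = 4        # number of GPUs

accum_steps = effec_batch_size // batch_size              # gradient-accumulation steps per GPU -> 2
total_effective = batch_size * accum_steps * world_size   # 1 * 2 * 4 = 8
print(accum_steps, total_effective)                       # 2 8
```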
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_best_checkpoint.pt
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:99ab899c261721b450ae9c494b71cf33c3e08f962c458760c14cfc5cf4659c89
-size 162585486
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/last_checkpoint.pt
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e2bd23023c6ba194a49e46a5c16798184fe1a5b7f38b224914f4a49a6f5a0cb1
-size 162576686
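The two `.pt` entries above are Git LFS pointer files (spec version, SHA-256 oid, byte size), not the checkpoints themselves. A small sketch, assuming the real checkpoint has already been fetched (e.g. via `git lfs pull`), of checking a local file against the oid and size recorded in such a pointer; the helper names and paths are illustrative, not repo code:

```python
# Illustrative verification of a fetched file against a Git LFS pointer.
import hashlib
from pathlib import Path

def parse_lfs_pointer(pointer_text: str) -> dict:
    """Parse the 'key value' lines of a Git LFS pointer into a dict."""
    return dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())

def matches_pointer(local_file: Path, pointer: dict) -> bool:
    data = local_file.read_bytes()
    oid = "sha256:" + hashlib.sha256(data).hexdigest()
    return oid == pointer["oid"] and len(data) == int(pointer["size"])

# Example usage (file names are hypothetical):
# ptr = parse_lfs_pointer(Path("last_best_checkpoint.pt.pointer").read_text())
# print(matches_pointer(Path("last_best_checkpoint.pt"), ptr))
```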
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-23(15:50:05).txt
DELETED
@@ -1,762 +0,0 @@
-## Config file
-
-# Log
-seed: 777
-use_cuda: 1 # 1 for True, 0 for False
-
-# dataset
-speaker_no: 2
-mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
-audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
-reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
-audio_sr: 16000
-ref_sr: 25
-
-# dataloader
-num_workers: 2
-batch_size: 1 # 4-GPU training with a total effective batch size of 8
-accu_grad: 1
-effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
-max_length: 3 # truncate the utterances in dataloader, in seconds
-
-# network settings
-init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
-causal: 0 # 1 for True, 0 for False
-network_reference:
-  cue: lip # lip or speech or gesture or EEG
-  backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
-  emb_size: 256 # resnet18:256
-network_audio:
-  backbone: av_tfgridnet_att_ss
-  n_fft: 256
-  stride: 128
-  window: "hann"
-  use_builtin_complex: False
-  n_srcs: 1
-  n_imics: 1
-  n_layers: 6
-  lstm_hidden_units: 192
-  attn_n_head: 4
-  attn_qk_output_channel: 4
-  emb_dim: 48
-  emb_ks: 4
-  emb_hs: 1
-  activation: "prelu"
-
-# optimizer
-spk_att_dropout: 1 # 0 for always use speaker attention
-loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
-lr_warmup: 1
-init_learning_rate: 0.0005
-max_epoch: 150
-clip_grad_norm: 5
-W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779]
-W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] *****************************************
-W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
-W1223 15:50:36.040409 140547448362816 torch/distributed/run.py:779] *****************************************
-[W1223 15:50:57.605401514 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[W1223 15:50:57.606194396 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
-[W1223 15:50:57.605414589 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[W1223 15:50:57.606215894 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
-[W1223 15:50:57.605418359 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[W1223 15:50:57.606241166 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
-[W1223 15:50:57.605453172 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[W1223 15:50:57.606286008 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
-started on checkpoints/log_2024-12-23(15:50:05)
-
-namespace(accu_grad=1, audio_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/', audio_sr=16000, batch_size=1, causal=0, checkpoint_dir='checkpoints/log_2024-12-23(15:50:05)', clip_grad_norm=5.0, config=[<yamlargparse.Path object at 0x7fdc6754ee80>], device=device(type='cuda'), distributed=True, effec_batch_size=2, evaluate_only=0, init_from='checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk', init_learning_rate=0.0005, local_rank=0, loss_type='ss_sisdr', lr_warmup=1, max_epoch=150, max_length=3, mix_lst_path='./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv', network_audio=namespace(activation='prelu', attn_n_head=4, attn_qk_output_channel=4, backbone='av_tfgridnet_att_ss', emb_dim=48, emb_hs=1, emb_ks=4, lstm_hidden_units=192, n_fft=256, n_imics=1, n_layers=6, n_srcs=1, stride=128, use_builtin_complex=False, window='hann'), network_reference=namespace(backbone='resnet18', cue='lip', emb_size=256), num_workers=2, ref_sr=25, reference_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/', seed=777, speaker_no=2, spk_att_dropout=1, train_from_last_checkpoint=0, use_cuda=1, world_size=4)
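The `torch/distributed/run.py` warnings, the four NCCL ranks, and `world_size=4` in the namespace dump indicate a torchrun launch across 4 GPUs with the NCCL backend. A sketch of the DDP setup pattern this implies; the script and flag names in the comment are illustrative assumptions, not taken from this repo:

```python
# Sketch (not repo code): the per-worker DDP setup implied by the log,
# where torchrun spawns 4 processes and each binds one GPU via NCCL.
import os
import torch
import torch.distributed as dist

def setup_distributed() -> tuple[int, int]:
    local_rank = int(os.environ["LOCAL_RANK"])   # set by torchrun for each worker
    world_size = int(os.environ["WORLD_SIZE"])   # 4 in this log
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")      # matches "NCCL version 2.20.5+cuda11.8" below
    return local_rank, world_size

# Typically launched with something like:
#   torchrun --nproc_per_node=4 train.py --config config.yaml
# (script name and flags here are hypothetical)
```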
-network_wrapper(
-  (sep_network): av_TFGridNetV3_att_ss(
-    (enc): STFTEncoder(
-      (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
-    )
-    (dec): STFTDecoder(
-      (stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
-    )
-    (conv): Sequential(
-      (0): Conv2d(2, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
-      (1): GroupNorm(1, 48, eps=1e-05, affine=True)
-    )
-    (blocks): ModuleList(
-      (0-5): 6 x GridNetV3Block(
-        (intra_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
-        (intra_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
-        (intra_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
-        (inter_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
-        (inter_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
-        (inter_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
-        (attn_conv_Q): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
-        (attn_norm_Q): AllHeadPReLULayerNormalization4DC(
-          (act): PReLU(num_parameters=4)
-        )
-        (attn_conv_K): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
-        (attn_norm_K): AllHeadPReLULayerNormalization4DC(
-          (act): PReLU(num_parameters=4)
-        )
-        (attn_conv_V): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
-        (attn_norm_V): AllHeadPReLULayerNormalization4DC(
-          (act): PReLU(num_parameters=4)
-        )
-        (attn_concat_proj): Sequential(
-          (0): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
-          (1): PReLU(num_parameters=1)
-          (2): LayerNormalization()
-        )
-        (spk_att): TransformerEncoder(
-          (layers): ModuleList(
-            (0): TransformerEncoderLayer(
-              (self_attn): MultiheadAttention(
-                (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
-              )
-              (linear1): Linear(in_features=48, out_features=192, bias=True)
-              (dropout): Dropout(p=0.1, inplace=False)
-              (linear2): Linear(in_features=192, out_features=48, bias=True)
-              (norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
-              (norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
-              (dropout1): Dropout(p=0.1, inplace=False)
-              (dropout2): Dropout(p=0.1, inplace=False)
-            )
-          )
-        )
-        (spk_norm): GroupNorm(1, 48, eps=1e-08, affine=True)
-      )
-    )
-    (deconv): ConvTranspose2d(48, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
-    (av_conv): ModuleList(
-      (0-5): 6 x Linear(in_features=304, out_features=48, bias=True)
-    )
-  )
-  (ref_encoder): Visual_encoder(
-    (v_frontend): VisualFrontend(
-      (frontend3D): Sequential(
-        (0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
-        (1): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-        (2): ReLU()
-        (3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
-      )
-      (resnet): ResNet(
-        (layer1): ResNetLayer(
-          (conv1a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (bn1a): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (downsample): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
-          (outbna): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (bn1b): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (outbnb): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-        )
-        (layer2): ResNetLayer(
-          (conv1a): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
-          (bn1a): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2a): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (downsample): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
-          (outbna): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv1b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (bn1b): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (outbnb): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-        )
-        (layer3): ResNetLayer(
-          (conv1a): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
-          (bn1a): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2a): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (downsample): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
-          (outbna): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv1b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (bn1b): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (outbnb): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-        )
-        (layer4): ResNetLayer(
-          (conv1a): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
-          (bn1a): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2a): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (downsample): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
-          (outbna): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv1b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (bn1b): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-          (conv2b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
-          (outbnb): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
-        )
-        (avgpool): AvgPool2d(kernel_size=(4, 4), stride=(1, 1), padding=0)
-      )
-    )
-    (v_ds): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-    (visual_conv): Sequential(
-      (0): VisualConv1D(
-        (relu_0): ReLU()
-        (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
-        (relu): ReLU()
-        (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
-        (prelu): PReLU(num_parameters=1)
-        (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-      )
-      (1): VisualConv1D(
-        (relu_0): ReLU()
-        (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
-        (relu): ReLU()
-        (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
-        (prelu): PReLU(num_parameters=1)
-        (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-      )
-      (2): VisualConv1D(
-        (relu_0): ReLU()
-        (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
-        (relu): ReLU()
-        (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
-        (prelu): PReLU(num_parameters=1)
-        (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-      )
-      (3): VisualConv1D(
-        (relu_0): ReLU()
-        (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
-        (relu): ReLU()
-        (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
-        (prelu): PReLU(num_parameters=1)
-        (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-      )
-      (4): VisualConv1D(
-        (relu_0): ReLU()
-        (norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
-        (relu): ReLU()
-        (norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
-        (prelu): PReLU(num_parameters=1)
-        (norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
-        (pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
-      )
-    )
-  )
-)
-
-Total number of parameters: 20950309
-
-
-Total number of trainable parameters: 9765221
-
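The two totals above are the standard PyTorch parameter counts for the wrapped model. A minimal sketch of how such numbers are typically computed, assuming a constructed `model`; this is the common idiom, not necessarily the exact code in this repo:

```python
# Common idiom for the two totals printed above (illustrative, not repo code).
import torch.nn as nn

def count_parameters(model: nn.Module) -> tuple[int, int]:
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total, trainable

# total, trainable = count_parameters(model)
# print(f"Total number of parameters: {total}")
# print(f"Total number of trainable parameters: {trainable}")
```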
-dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
-dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
-dlcdanw1zq2cucwx-master-0:28:28 [0] NCCL INFO cudaDriverVersion 11040
-dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO cudaDriverVersion 11040
-dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO cudaDriverVersion 11040
-dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO cudaDriverVersion 11040
-NCCL version 2.20.5+cuda11.8
-dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
-dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO Bootstrap : Using eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
-dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
-dlcdanw1zq2cucwx-master-0:30:30 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
-dlcdanw1zq2cucwx-master-0:31:31 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
-dlcdanw1zq2cucwx-master-0:29:29 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_IB_HCA set to mlx5
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_IB_HCA set to mlx5
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_IB_HCA set to mlx5
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_IB_HCA set to mlx5
-libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
-libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Using non-device net plugin version 0
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Using network IB
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Using non-device net plugin version 0
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Using network IB
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Using non-device net plugin version 0
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Using network IB
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.5.96.241<0>
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Using non-device net plugin version 0
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Using network IB
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xafe80e26c24ecacf - Init START
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xafe80e26c24ecacf - Init START
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xafe80e26c24ecacf - Init START
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xafe80e26c24ecacf - Init START
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 00/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 01/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO P2P Chunksize set to 524288
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 02/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 03/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO P2P Chunksize set to 524288
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO P2P Chunksize set to 524288
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 04/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 05/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 06/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 07/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 08/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 09/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 10/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 11/12 : 0 1 2 3
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO P2P Chunksize set to 524288
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Connected all rings
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Connected all rings
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Connected all rings
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Connected all rings
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/IPC/read
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO Connected all trees
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO Connected all trees
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO Connected all trees
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO Connected all trees
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
-dlcdanw1zq2cucwx-master-0:28:57 [0] NCCL INFO comm 0x87bffd0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xafe80e26c24ecacf - Init COMPLETE
-dlcdanw1zq2cucwx-master-0:29:60 [1] NCCL INFO comm 0x866fc40 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xafe80e26c24ecacf - Init COMPLETE
-dlcdanw1zq2cucwx-master-0:30:58 [2] NCCL INFO comm 0x9d94400 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xafe80e26c24ecacf - Init COMPLETE
-dlcdanw1zq2cucwx-master-0:31:59 [3] NCCL INFO comm 0x6be36a0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xafe80e26c24ecacf - Init COMPLETE
-[rank1]:[W1223 15:51:28.897884818 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[rank0]:[W1223 15:51:28.897922529 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[rank2]:[W1223 15:51:28.897929604 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-[rank3]:[W1223 15:51:28.898014217 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
-module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.0.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.0.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.0.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.0.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.0.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.0.spk_norm.weight not loaded
-module.sep_network.blocks.0.spk_norm.bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.1.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.1.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.1.spk_norm.weight not loaded
-module.sep_network.blocks.1.spk_norm.bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.2.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.2.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.2.spk_norm.weight not loaded
-module.sep_network.blocks.2.spk_norm.bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.3.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.3.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.3.spk_norm.weight not loaded
-module.sep_network.blocks.3.spk_norm.bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.4.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.4.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.4.spk_norm.weight not loaded
-module.sep_network.blocks.4.spk_norm.bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.linear1.weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.linear1.bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.linear2.weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.linear2.bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.norm1.weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.norm1.bias not loaded
-module.sep_network.blocks.5.spk_att.layers.0.norm2.weight not loaded
-module.sep_network.blocks.5.spk_att.layers.0.norm2.bias not loaded
-module.sep_network.blocks.5.spk_norm.weight not loaded
-module.sep_network.blocks.5.spk_norm.bias not loaded
-Init model from checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk, and start new training
-[rank1]:[W1223 15:53:04.527524171 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
-[rank2]:[W1223 15:53:04.529782679 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
-[rank0]:[W1223 15:53:04.529824121 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
-[rank3]:[W1223 15:53:04.530453400 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
580 |
-
Train Summary | End of Epoch 1 | Time 8862.50s | Train Loss -15.642
|
581 |
-
Valid Summary | End of Epoch 1 | Time 1299.28s | Valid Loss -14.021
|
582 |
-
Test Summary | End of Epoch 1 | Time 779.08s | Test Loss -13.495
|
583 |
-
Fund new best model, dict saved
|
584 |
-
Train Summary | End of Epoch 2 | Time 8868.78s | Train Loss -15.693
|
585 |
-
Valid Summary | End of Epoch 2 | Time 1297.45s | Valid Loss -13.954
|
586 |
-
Test Summary | End of Epoch 2 | Time 778.92s | Test Loss -13.553
|
587 |
-
Train Summary | End of Epoch 3 | Time 8857.29s | Train Loss -15.673
|
588 |
-
Valid Summary | End of Epoch 3 | Time 1297.92s | Valid Loss -13.922
|
589 |
-
Test Summary | End of Epoch 3 | Time 778.84s | Test Loss -13.479
|
590 |
-
Train Summary | End of Epoch 4 | Time 8841.24s | Train Loss -15.672
|
591 |
-
Valid Summary | End of Epoch 4 | Time 1297.81s | Valid Loss -14.050
|
592 |
-
Test Summary | End of Epoch 4 | Time 778.69s | Test Loss -13.570
|
593 |
-
Fund new best model, dict saved
|
594 |
-
Train Summary | End of Epoch 5 | Time 8842.92s | Train Loss -15.658
|
595 |
-
Valid Summary | End of Epoch 5 | Time 1297.78s | Valid Loss -13.980
|
596 |
-
Test Summary | End of Epoch 5 | Time 778.76s | Test Loss -13.672
|
597 |
-
Train Summary | End of Epoch 6 | Time 8863.22s | Train Loss -15.625
|
598 |
-
Valid Summary | End of Epoch 6 | Time 1297.63s | Valid Loss -14.042
|
599 |
-
Test Summary | End of Epoch 6 | Time 778.93s | Test Loss -13.626
|
600 |
-
Train Summary | End of Epoch 7 | Time 8874.34s | Train Loss -15.628
|
601 |
-
Valid Summary | End of Epoch 7 | Time 1297.79s | Valid Loss -14.223
|
602 |
-
Test Summary | End of Epoch 7 | Time 778.60s | Test Loss -13.703
|
603 |
-
Fund new best model, dict saved
|
604 |
-
Train Summary | End of Epoch 8 | Time 8878.31s | Train Loss -15.633
|
605 |
-
Valid Summary | End of Epoch 8 | Time 1297.68s | Valid Loss -14.197
|
606 |
-
Test Summary | End of Epoch 8 | Time 778.55s | Test Loss -13.741
|
607 |
-
Train Summary | End of Epoch 9 | Time 8879.85s | Train Loss -15.644
|
608 |
-
Valid Summary | End of Epoch 9 | Time 1297.92s | Valid Loss -14.241
|
609 |
-
Test Summary | End of Epoch 9 | Time 778.76s | Test Loss -13.799
|
610 |
-
Fund new best model, dict saved
|
611 |
-
Train Summary | End of Epoch 10 | Time 8885.21s | Train Loss -15.669
|
612 |
-
Valid Summary | End of Epoch 10 | Time 1297.27s | Valid Loss -14.229
|
613 |
-
Test Summary | End of Epoch 10 | Time 778.51s | Test Loss -13.791
|
614 |
-
Train Summary | End of Epoch 11 | Time 8884.38s | Train Loss -15.687
|
615 |
-
Valid Summary | End of Epoch 11 | Time 1297.56s | Valid Loss -14.257
|
616 |
-
Test Summary | End of Epoch 11 | Time 778.75s | Test Loss -13.766
|
617 |
-
Fund new best model, dict saved
|
618 |
-
Train Summary | End of Epoch 12 | Time 8861.05s | Train Loss -15.689
|
619 |
-
Valid Summary | End of Epoch 12 | Time 1297.83s | Valid Loss -14.225
|
620 |
-
Test Summary | End of Epoch 12 | Time 778.37s | Test Loss -13.740
|
621 |
-
Train Summary | End of Epoch 13 | Time 8854.73s | Train Loss -15.715
|
622 |
-
Valid Summary | End of Epoch 13 | Time 1297.07s | Valid Loss -14.338
|
623 |
-
Test Summary | End of Epoch 13 | Time 778.98s | Test Loss -13.852
|
624 |
-
Fund new best model, dict saved
|
625 |
-
Train Summary | End of Epoch 14 | Time 8861.60s | Train Loss -15.720
|
626 |
-
Valid Summary | End of Epoch 14 | Time 1297.61s | Valid Loss -14.320
|
627 |
-
Test Summary | End of Epoch 14 | Time 778.73s | Test Loss -13.842
|
628 |
-
Train Summary | End of Epoch 15 | Time 8867.45s | Train Loss -15.740
|
629 |
-
Valid Summary | End of Epoch 15 | Time 1297.15s | Valid Loss -14.264
|
630 |
-
Test Summary | End of Epoch 15 | Time 778.45s | Test Loss -13.831
|
631 |
-
Train Summary | End of Epoch 16 | Time 8871.73s | Train Loss -15.750
|
632 |
-
Valid Summary | End of Epoch 16 | Time 1297.58s | Valid Loss -14.301
|
633 |
-
Test Summary | End of Epoch 16 | Time 778.59s | Test Loss -13.901
|
634 |
-
Train Summary | End of Epoch 17 | Time 8874.12s | Train Loss -15.739
|
635 |
-
Valid Summary | End of Epoch 17 | Time 1297.21s | Valid Loss -14.338
|
636 |
-
Test Summary | End of Epoch 17 | Time 778.18s | Test Loss -13.897
|
637 |
-
Fund new best model, dict saved
|
638 |
-
Train Summary | End of Epoch 18 | Time 8866.66s | Train Loss -15.756
|
639 |
-
Valid Summary | End of Epoch 18 | Time 1297.23s | Valid Loss -14.248
|
640 |
-
Test Summary | End of Epoch 18 | Time 778.35s | Test Loss -13.823
|
641 |
-
Train Summary | End of Epoch 19 | Time 8866.16s | Train Loss -15.784
|
642 |
-
Valid Summary | End of Epoch 19 | Time 1296.46s | Valid Loss -14.318
|
643 |
-
Test Summary | End of Epoch 19 | Time 778.56s | Test Loss -13.898
|
644 |
-
Train Summary | End of Epoch 20 | Time 8861.71s | Train Loss -15.780
|
645 |
-
Valid Summary | End of Epoch 20 | Time 1297.90s | Valid Loss -14.337
|
646 |
-
Test Summary | End of Epoch 20 | Time 778.23s | Test Loss -13.892
|
647 |
-
Train Summary | End of Epoch 21 | Time 8861.37s | Train Loss -15.811
|
648 |
-
Valid Summary | End of Epoch 21 | Time 1296.68s | Valid Loss -14.383
|
649 |
-
Test Summary | End of Epoch 21 | Time 778.57s | Test Loss -13.913
|
650 |
-
Fund new best model, dict saved
|
651 |
-
Train Summary | End of Epoch 22 | Time 8868.58s | Train Loss -15.815
|
652 |
-
Valid Summary | End of Epoch 22 | Time 1297.38s | Valid Loss -14.341
|
653 |
-
Test Summary | End of Epoch 22 | Time 778.10s | Test Loss -13.883
|
654 |
-
Train Summary | End of Epoch 23 | Time 8876.67s | Train Loss -15.816
|
655 |
-
Valid Summary | End of Epoch 23 | Time 1297.54s | Valid Loss -14.315
|
656 |
-
Test Summary | End of Epoch 23 | Time 778.88s | Test Loss -13.974
|
657 |
-
Train Summary | End of Epoch 24 | Time 8873.42s | Train Loss -15.821
|
658 |
-
Valid Summary | End of Epoch 24 | Time 1297.77s | Valid Loss -14.415
|
659 |
-
Test Summary | End of Epoch 24 | Time 778.40s | Test Loss -13.877
|
660 |
-
Fund new best model, dict saved
|
661 |
-
Train Summary | End of Epoch 25 | Time 8853.35s | Train Loss -15.834
|
662 |
-
Valid Summary | End of Epoch 25 | Time 1297.67s | Valid Loss -14.356
|
663 |
-
Test Summary | End of Epoch 25 | Time 778.28s | Test Loss -13.917
|
664 |
-
Train Summary | End of Epoch 26 | Time 8879.37s | Train Loss -15.844
|
665 |
-
Valid Summary | End of Epoch 26 | Time 1297.45s | Valid Loss -14.384
|
666 |
-
Test Summary | End of Epoch 26 | Time 778.64s | Test Loss -13.973
|
667 |
-
Train Summary | End of Epoch 27 | Time 8872.84s | Train Loss -15.854
|
668 |
-
Valid Summary | End of Epoch 27 | Time 1297.92s | Valid Loss -14.402
|
669 |
-
Test Summary | End of Epoch 27 | Time 779.00s | Test Loss -14.017
|
670 |
-
Train Summary | End of Epoch 28 | Time 8875.05s | Train Loss -15.856
|
671 |
-
Valid Summary | End of Epoch 28 | Time 1297.66s | Valid Loss -14.351
|
672 |
-
Test Summary | End of Epoch 28 | Time 778.75s | Test Loss -13.939
|
673 |
-
Train Summary | End of Epoch 29 | Time 8869.04s | Train Loss -15.884
|
674 |
-
Valid Summary | End of Epoch 29 | Time 1297.87s | Valid Loss -14.430
|
675 |
-
Test Summary | End of Epoch 29 | Time 778.77s | Test Loss -13.954
|
676 |
-
Fund new best model, dict saved
|
677 |
-
Train Summary | End of Epoch 30 | Time 8864.22s | Train Loss -15.890
|
678 |
-
Valid Summary | End of Epoch 30 | Time 1297.79s | Valid Loss -14.398
|
679 |
-
Test Summary | End of Epoch 30 | Time 778.88s | Test Loss -14.004
|
680 |
-
Train Summary | End of Epoch 31 | Time 8862.20s | Train Loss -15.906
|
681 |
-
Valid Summary | End of Epoch 31 | Time 1297.63s | Valid Loss -14.396
|
682 |
-
Test Summary | End of Epoch 31 | Time 778.78s | Test Loss -13.985
|
683 |
-
Train Summary | End of Epoch 32 | Time 8872.91s | Train Loss -15.911
|
684 |
-
Valid Summary | End of Epoch 32 | Time 1297.77s | Valid Loss -14.425
|
685 |
-
Test Summary | End of Epoch 32 | Time 778.33s | Test Loss -13.933
|
686 |
-
Train Summary | End of Epoch 33 | Time 8867.16s | Train Loss -15.911
|
687 |
-
Valid Summary | End of Epoch 33 | Time 1297.78s | Valid Loss -14.423
|
688 |
-
Test Summary | End of Epoch 33 | Time 778.53s | Test Loss -13.952
|
689 |
-
Train Summary | End of Epoch 34 | Time 8863.09s | Train Loss -15.916
|
690 |
-
Valid Summary | End of Epoch 34 | Time 1297.82s | Valid Loss -14.470
|
691 |
-
Test Summary | End of Epoch 34 | Time 778.80s | Test Loss -13.885
|
692 |
-
Fund new best model, dict saved
|
693 |
-
Train Summary | End of Epoch 35 | Time 8865.77s | Train Loss -15.936
|
694 |
-
Valid Summary | End of Epoch 35 | Time 1297.80s | Valid Loss -14.486
|
695 |
-
Test Summary | End of Epoch 35 | Time 778.59s | Test Loss -13.990
|
696 |
-
Fund new best model, dict saved
|
697 |
-
Train Summary | End of Epoch 36 | Time 8864.53s | Train Loss -15.931
|
698 |
-
Valid Summary | End of Epoch 36 | Time 1297.82s | Valid Loss -14.413
|
699 |
-
Test Summary | End of Epoch 36 | Time 778.79s | Test Loss -13.959
|
700 |
-
Train Summary | End of Epoch 37 | Time 8860.11s | Train Loss -15.934
|
701 |
-
Valid Summary | End of Epoch 37 | Time 1298.28s | Valid Loss -14.503
|
702 |
-
Test Summary | End of Epoch 37 | Time 778.33s | Test Loss -13.980
|
703 |
-
Fund new best model, dict saved
|
704 |
-
Train Summary | End of Epoch 38 | Time 8855.04s | Train Loss -15.957
|
705 |
-
Valid Summary | End of Epoch 38 | Time 1298.32s | Valid Loss -14.387
|
706 |
-
Test Summary | End of Epoch 38 | Time 778.98s | Test Loss -13.980
|
707 |
-
Train Summary | End of Epoch 39 | Time 8862.85s | Train Loss -15.964
|
708 |
-
Valid Summary | End of Epoch 39 | Time 1297.98s | Valid Loss -14.462
|
709 |
-
Test Summary | End of Epoch 39 | Time 779.50s | Test Loss -14.014
|
710 |
-
Train Summary | End of Epoch 40 | Time 8861.48s | Train Loss -15.956
|
711 |
-
Valid Summary | End of Epoch 40 | Time 1297.55s | Valid Loss -14.344
|
712 |
-
Test Summary | End of Epoch 40 | Time 778.53s | Test Loss -13.998
|
713 |
-
Train Summary | End of Epoch 41 | Time 8861.38s | Train Loss -15.955
|
714 |
-
Valid Summary | End of Epoch 41 | Time 1297.55s | Valid Loss -14.369
|
715 |
-
Test Summary | End of Epoch 41 | Time 779.16s | Test Loss -13.946
|
716 |
-
Train Summary | End of Epoch 42 | Time 8853.68s | Train Loss -15.969
|
717 |
-
Valid Summary | End of Epoch 42 | Time 1297.52s | Valid Loss -14.472
|
718 |
-
Test Summary | End of Epoch 42 | Time 778.78s | Test Loss -14.000
|
719 |
-
reload weights and optimizer from last best checkpoint
|
720 |
-
Learning rate adjusted to: 0.000255
|
721 |
-
Train Summary | End of Epoch 43 | Time 8855.22s | Train Loss -16.093
|
722 |
-
Valid Summary | End of Epoch 43 | Time 1297.53s | Valid Loss -14.584
|
723 |
-
Test Summary | End of Epoch 43 | Time 778.81s | Test Loss -14.144
|
724 |
-
Fund new best model, dict saved
|
725 |
-
Train Summary | End of Epoch 44 | Time 8839.27s | Train Loss -16.155
|
726 |
-
Valid Summary | End of Epoch 44 | Time 1297.71s | Valid Loss -14.520
|
727 |
-
Test Summary | End of Epoch 44 | Time 778.86s | Test Loss -14.054
|
728 |
-
Train Summary | End of Epoch 45 | Time 8840.93s | Train Loss -16.158
|
729 |
-
Valid Summary | End of Epoch 45 | Time 1298.07s | Valid Loss -14.566
|
730 |
-
Test Summary | End of Epoch 45 | Time 778.17s | Test Loss -14.039
|
731 |
-
Train Summary | End of Epoch 46 | Time 8846.57s | Train Loss -16.194
|
732 |
-
Valid Summary | End of Epoch 46 | Time 1297.09s | Valid Loss -14.602
|
733 |
-
Test Summary | End of Epoch 46 | Time 779.00s | Test Loss -14.058
|
734 |
-
Fund new best model, dict saved
|
735 |
-
Train Summary | End of Epoch 47 | Time 8848.27s | Train Loss -16.203
|
736 |
-
Valid Summary | End of Epoch 47 | Time 1297.44s | Valid Loss -14.561
|
737 |
-
Test Summary | End of Epoch 47 | Time 778.99s | Test Loss -14.038
|
738 |
-
Train Summary | End of Epoch 48 | Time 8855.13s | Train Loss -16.208
|
739 |
-
Valid Summary | End of Epoch 48 | Time 1298.17s | Valid Loss -14.567
|
740 |
-
Test Summary | End of Epoch 48 | Time 779.06s | Test Loss -14.059
|
741 |
-
Train Summary | End of Epoch 49 | Time 8860.44s | Train Loss -16.230
|
742 |
-
Valid Summary | End of Epoch 49 | Time 1297.85s | Valid Loss -14.600
|
743 |
-
Test Summary | End of Epoch 49 | Time 778.57s | Test Loss -14.109
|
744 |
-
Train Summary | End of Epoch 50 | Time 8849.69s | Train Loss -16.234
|
745 |
-
Valid Summary | End of Epoch 50 | Time 1297.78s | Valid Loss -14.520
|
746 |
-
Test Summary | End of Epoch 50 | Time 778.36s | Test Loss -14.044
|
747 |
-
Train Summary | End of Epoch 51 | Time 8845.04s | Train Loss -16.262
|
748 |
-
Valid Summary | End of Epoch 51 | Time 1296.91s | Valid Loss -14.605
|
749 |
-
Test Summary | End of Epoch 51 | Time 778.81s | Test Loss -14.112
|
750 |
-
Fund new best model, dict saved
|
751 |
-
Train Summary | End of Epoch 52 | Time 8846.18s | Train Loss -16.262
|
752 |
-
Valid Summary | End of Epoch 52 | Time 1297.45s | Valid Loss -14.570
|
753 |
-
Test Summary | End of Epoch 52 | Time 778.92s | Test Loss -14.142
|
754 |
-
Train Summary | End of Epoch 53 | Time 8848.28s | Train Loss -16.273
|
755 |
-
Valid Summary | End of Epoch 53 | Time 1297.74s | Valid Loss -14.576
|
756 |
-
Test Summary | End of Epoch 53 | Time 778.71s | Test Loss -14.089
|
757 |
-
Train Summary | End of Epoch 54 | Time 8827.56s | Train Loss -16.303
|
758 |
-
Valid Summary | End of Epoch 54 | Time 1298.30s | Valid Loss -14.529
|
759 |
-
Test Summary | End of Epoch 54 | Time 779.41s | Test Loss -14.063
|
760 |
-
Train Summary | End of Epoch 55 | Time 8853.03s | Train Loss -16.310
|
761 |
-
Valid Summary | End of Epoch 55 | Time 1297.30s | Valid Loss -14.512
|
762 |
-
Test Summary | End of Epoch 55 | Time 778.55s | Test Loss -14.077
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/log_2024-12-31(09:48:03).txt
DELETED
@@ -1,632 +0,0 @@
|
|
1 |
-
## Config file
|
2 |
-
|
3 |
-
# Log
|
4 |
-
seed: 777
|
5 |
-
use_cuda: 1 # 1 for True, 0 for False
|
6 |
-
|
7 |
-
# dataset
|
8 |
-
speaker_no: 2
|
9 |
-
mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv
|
10 |
-
audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
|
11 |
-
reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
|
12 |
-
audio_sr: 16000
|
13 |
-
ref_sr: 25
|
14 |
-
|
15 |
-
# dataloader
|
16 |
-
num_workers: 2
|
17 |
-
batch_size: 1 # 4-GPU training with a total effective batch size of 8
|
18 |
-
accu_grad: 1
|
19 |
-
effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
|
20 |
-
max_length: 3 # truncate the utterances in dataloader, in seconds
|
21 |
-
|
22 |
-
# network settings
|
23 |
-
init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
|
24 |
-
causal: 0 # 1 for True, 0 for False
|
25 |
-
network_reference:
|
26 |
-
cue: lip # lip or speech or gesture or EEG
|
27 |
-
backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
|
28 |
-
emb_size: 256 # resnet18:256
|
29 |
-
network_audio:
|
30 |
-
backbone: av_tfgridnet_att_ss
|
31 |
-
n_fft: 256
|
32 |
-
stride: 128
|
33 |
-
window: "hann"
|
34 |
-
use_builtin_complex: False
|
35 |
-
n_srcs: 1
|
36 |
-
n_imics: 1
|
37 |
-
n_layers: 6
|
38 |
-
lstm_hidden_units: 192
|
39 |
-
attn_n_head: 4
|
40 |
-
attn_qk_output_channel: 4
|
41 |
-
emb_dim: 48
|
42 |
-
emb_ks: 4
|
43 |
-
emb_hs: 1
|
44 |
-
activation: "prelu"
|
45 |
-
|
46 |
-
# optimizer
|
47 |
-
spk_att_dropout: 1 # 0 for always use speaker attention
|
48 |
-
loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
|
49 |
-
lr_warmup: 1
|
50 |
-
init_learning_rate: 0.0005
|
51 |
-
max_epoch: 150
|
52 |
-
clip_grad_norm: 5
|
53 |
-
W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779]
|
54 |
-
W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] *****************************************
|
55 |
-
W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
56 |
-
W1231 09:48:39.901772 139973816584000 torch/distributed/run.py:779] *****************************************
|
57 |
-
[W1231 09:49:05.069551764 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
58 |
-
[W1231 09:49:05.069566273 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
59 |
-
[W1231 09:49:05.070374617 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
60 |
-
[W1231 09:49:05.070402090 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
61 |
-
[W1231 09:49:05.069566736 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
62 |
-
[W1231 09:49:05.070424006 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
63 |
-
[W1231 09:49:05.069610510 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
64 |
-
[W1231 09:49:05.070456802 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
65 |
-
started on checkpoints/log_2024-12-23(15:50:05)
|
66 |
-
|
67 |
-
namespace(accu_grad=1, audio_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/', audio_sr=16000, batch_size=1, causal=0, checkpoint_dir='checkpoints/log_2024-12-23(15:50:05)', clip_grad_norm=5.0, config=[<yamlargparse.Path object at 0x7f87241d2c40>], device=device(type='cuda'), distributed=True, effec_batch_size=2, evaluate_only=0, init_from='checkpoints/log_VoxCeleb2_lip_tfgridnet_2spk', init_learning_rate=0.0005, local_rank=0, loss_type='ss_sisdr', lr_warmup=1, max_epoch=150, max_length=3, mix_lst_path='./data/VoxCeleb2_non_repeat/mixture_data_list_2mix.csv', network_audio=namespace(activation='prelu', attn_n_head=4, attn_qk_output_channel=4, backbone='av_tfgridnet_att_ss', emb_dim=48, emb_hs=1, emb_ks=4, lstm_hidden_units=192, n_fft=256, n_imics=1, n_layers=6, n_srcs=1, stride=128, use_builtin_complex=False, window='hann'), network_reference=namespace(backbone='resnet18', cue='lip', emb_size=256), num_workers=2, ref_sr=25, reference_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/', seed=777, speaker_no=2, spk_att_dropout=1, train_from_last_checkpoint=1, use_cuda=1, world_size=4)
|
68 |
-
network_wrapper(
|
69 |
-
(sep_network): av_TFGridNetV3_att_ss(
|
70 |
-
(enc): STFTEncoder(
|
71 |
-
(stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
|
72 |
-
)
|
73 |
-
(dec): STFTDecoder(
|
74 |
-
(stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
|
75 |
-
)
|
76 |
-
(conv): Sequential(
|
77 |
-
(0): Conv2d(2, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
78 |
-
(1): GroupNorm(1, 48, eps=1e-05, affine=True)
|
79 |
-
)
|
80 |
-
(blocks): ModuleList(
|
81 |
-
(0-5): 6 x GridNetV3Block(
|
82 |
-
(intra_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
83 |
-
(intra_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
|
84 |
-
(intra_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
|
85 |
-
(inter_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
86 |
-
(inter_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
|
87 |
-
(inter_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
|
88 |
-
(attn_conv_Q): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
|
89 |
-
(attn_norm_Q): AllHeadPReLULayerNormalization4DC(
|
90 |
-
(act): PReLU(num_parameters=4)
|
91 |
-
)
|
92 |
-
(attn_conv_K): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
|
93 |
-
(attn_norm_K): AllHeadPReLULayerNormalization4DC(
|
94 |
-
(act): PReLU(num_parameters=4)
|
95 |
-
)
|
96 |
-
(attn_conv_V): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
|
97 |
-
(attn_norm_V): AllHeadPReLULayerNormalization4DC(
|
98 |
-
(act): PReLU(num_parameters=4)
|
99 |
-
)
|
100 |
-
(attn_concat_proj): Sequential(
|
101 |
-
(0): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
|
102 |
-
(1): PReLU(num_parameters=1)
|
103 |
-
(2): LayerNormalization()
|
104 |
-
)
|
105 |
-
(spk_att): TransformerEncoder(
|
106 |
-
(layers): ModuleList(
|
107 |
-
(0): TransformerEncoderLayer(
|
108 |
-
(self_attn): MultiheadAttention(
|
109 |
-
(out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
|
110 |
-
)
|
111 |
-
(linear1): Linear(in_features=48, out_features=192, bias=True)
|
112 |
-
(dropout): Dropout(p=0.1, inplace=False)
|
113 |
-
(linear2): Linear(in_features=192, out_features=48, bias=True)
|
114 |
-
(norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
115 |
-
(norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
116 |
-
(dropout1): Dropout(p=0.1, inplace=False)
|
117 |
-
(dropout2): Dropout(p=0.1, inplace=False)
|
118 |
-
)
|
119 |
-
)
|
120 |
-
)
|
121 |
-
(spk_norm): GroupNorm(1, 48, eps=1e-08, affine=True)
|
122 |
-
)
|
123 |
-
)
|
124 |
-
(deconv): ConvTranspose2d(48, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
125 |
-
(av_conv): ModuleList(
|
126 |
-
(0-5): 6 x Linear(in_features=304, out_features=48, bias=True)
|
127 |
-
)
|
128 |
-
)
|
129 |
-
(ref_encoder): Visual_encoder(
|
130 |
-
(v_frontend): VisualFrontend(
|
131 |
-
(frontend3D): Sequential(
|
132 |
-
(0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
|
133 |
-
(1): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
134 |
-
(2): ReLU()
|
135 |
-
(3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
|
136 |
-
)
|
137 |
-
(resnet): ResNet(
|
138 |
-
(layer1): ResNetLayer(
|
139 |
-
(conv1a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
140 |
-
(bn1a): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
141 |
-
(conv2a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
142 |
-
(downsample): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
|
143 |
-
(outbna): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
144 |
-
(conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
145 |
-
(bn1b): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
146 |
-
(conv2b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
147 |
-
(outbnb): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
148 |
-
)
|
149 |
-
(layer2): ResNetLayer(
|
150 |
-
(conv1a): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
151 |
-
(bn1a): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
152 |
-
(conv2a): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
153 |
-
(downsample): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
154 |
-
(outbna): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
155 |
-
(conv1b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
156 |
-
(bn1b): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
157 |
-
(conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
158 |
-
(outbnb): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
159 |
-
)
|
160 |
-
(layer3): ResNetLayer(
|
161 |
-
(conv1a): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
162 |
-
(bn1a): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
163 |
-
(conv2a): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
164 |
-
(downsample): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
165 |
-
(outbna): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
166 |
-
(conv1b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
167 |
-
(bn1b): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
168 |
-
(conv2b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
169 |
-
(outbnb): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
170 |
-
)
|
171 |
-
(layer4): ResNetLayer(
|
172 |
-
(conv1a): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
173 |
-
(bn1a): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
174 |
-
(conv2a): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
175 |
-
(downsample): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
176 |
-
(outbna): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
177 |
-
(conv1b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
178 |
-
(bn1b): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
179 |
-
(conv2b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
180 |
-
(outbnb): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
181 |
-
)
|
182 |
-
(avgpool): AvgPool2d(kernel_size=(4, 4), stride=(1, 1), padding=0)
|
183 |
-
)
|
184 |
-
)
|
185 |
-
(v_ds): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
186 |
-
(visual_conv): Sequential(
|
187 |
-
(0): VisualConv1D(
|
188 |
-
(relu_0): ReLU()
|
189 |
-
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
190 |
-
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
191 |
-
(relu): ReLU()
|
192 |
-
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
193 |
-
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
194 |
-
(prelu): PReLU(num_parameters=1)
|
195 |
-
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
196 |
-
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
197 |
-
)
|
198 |
-
(1): VisualConv1D(
|
199 |
-
(relu_0): ReLU()
|
200 |
-
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
201 |
-
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
202 |
-
(relu): ReLU()
|
203 |
-
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
204 |
-
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
205 |
-
(prelu): PReLU(num_parameters=1)
|
206 |
-
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
207 |
-
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
208 |
-
)
|
209 |
-
(2): VisualConv1D(
|
210 |
-
(relu_0): ReLU()
|
211 |
-
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
212 |
-
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
213 |
-
(relu): ReLU()
|
214 |
-
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
215 |
-
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
216 |
-
(prelu): PReLU(num_parameters=1)
|
217 |
-
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
218 |
-
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
219 |
-
)
|
220 |
-
(3): VisualConv1D(
|
221 |
-
(relu_0): ReLU()
|
222 |
-
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
223 |
-
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
224 |
-
(relu): ReLU()
|
225 |
-
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
226 |
-
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
227 |
-
(prelu): PReLU(num_parameters=1)
|
228 |
-
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
229 |
-
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
230 |
-
)
|
231 |
-
(4): VisualConv1D(
|
232 |
-
(relu_0): ReLU()
|
233 |
-
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
234 |
-
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
235 |
-
(relu): ReLU()
|
236 |
-
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
237 |
-
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
238 |
-
(prelu): PReLU(num_parameters=1)
|
239 |
-
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
240 |
-
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
241 |
-
)
|
242 |
-
)
|
243 |
-
)
|
244 |
-
)
|
245 |
-
|
246 |
-
Total number of parameters: 20950309
|
247 |
-
|
248 |
-
|
249 |
-
Total number of trainable parameters: 9765221
|
250 |
-
|
251 |
-
dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
252 |
-
dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
|
253 |
-
dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
254 |
-
dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
255 |
-
dlc10xm9l399lwkq-master-0:26:26 [0] NCCL INFO cudaDriverVersion 11040
|
256 |
-
dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO cudaDriverVersion 11040
|
257 |
-
dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO cudaDriverVersion 11040
|
258 |
-
dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO cudaDriverVersion 11040
|
259 |
-
NCCL version 2.20.5+cuda11.8
|
260 |
-
dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
261 |
-
dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
262 |
-
dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
263 |
-
dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
|
264 |
-
dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
|
265 |
-
dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO Bootstrap : Using eth0:22.6.236.79<0>
|
266 |
-
dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
267 |
-
dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
268 |
-
dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
269 |
-
dlc10xm9l399lwkq-master-0:29:29 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
270 |
-
dlc10xm9l399lwkq-master-0:28:28 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
271 |
-
dlc10xm9l399lwkq-master-0:27:27 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
272 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
273 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
274 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
275 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
276 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_IB_HCA set to mlx5
|
277 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_IB_HCA set to mlx5
|
278 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_IB_HCA set to mlx5
|
279 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_IB_HCA set to mlx5
|
280 |
-
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
281 |
-
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
282 |
-
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
283 |
-
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
284 |
-
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
285 |
-
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
286 |
-
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
287 |
-
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
288 |
-
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
289 |
-
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
290 |
-
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
291 |
-
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
292 |
-
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
293 |
-
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
294 |
-
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
295 |
-
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
296 |
-
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
297 |
-
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
298 |
-
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
299 |
-
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
300 |
-
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
301 |
-
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
302 |
-
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
303 |
-
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
304 |
-
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
305 |
-
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
306 |
-
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
307 |
-
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
308 |
-
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
309 |
-
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
310 |
-
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
311 |
-
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
312 |
-
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
313 |
-
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
314 |
-
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
315 |
-
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
316 |
-
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
317 |
-
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
318 |
-
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
319 |
-
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
320 |
-
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
321 |
-
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
322 |
-
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
323 |
-
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
324 |
-
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
325 |
-
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
326 |
-
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
327 |
-
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
328 |
-
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
329 |
-
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
330 |
-
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
331 |
-
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
332 |
-
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
333 |
-
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
334 |
-
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
335 |
-
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
336 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
|
337 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Using non-device net plugin version 0
|
338 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Using network IB
|
339 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
|
340 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
|
341 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Using non-device net plugin version 0
|
342 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Using network IB
|
343 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Using non-device net plugin version 0
|
344 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Using network IB
|
345 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [2]mlx5_2:1/RoCE [RO]; OOB eth0:22.6.236.79<0>
|
346 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Using non-device net plugin version 0
|
347 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Using network IB
|
348 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xf279881ae65b16f2 - Init START
|
349 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xf279881ae65b16f2 - Init START
|
350 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xf279881ae65b16f2 - Init START
|
351 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xf279881ae65b16f2 - Init START
|
352 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
|
353 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
|
354 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
|
355 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
|
356 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
|
357 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
|
358 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
|
359 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
360 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
361 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
362 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
363 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 00/12 : 0 1 2 3
|
364 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0
|
365 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 01/12 : 0 1 2 3
|
366 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO P2P Chunksize set to 524288
|
367 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2
|
368 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 02/12 : 0 1 2 3
|
369 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1
|
370 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO P2P Chunksize set to 524288
|
371 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO P2P Chunksize set to 524288
|
372 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 03/12 : 0 1 2 3
|
373 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 04/12 : 0 1 2 3
|
374 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 05/12 : 0 1 2 3
|
375 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 06/12 : 0 1 2 3
|
376 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 07/12 : 0 1 2 3
|
377 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 08/12 : 0 1 2 3
|
378 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 09/12 : 0 1 2 3
|
379 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 10/12 : 0 1 2 3
|
380 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 11/12 : 0 1 2 3
|
381 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1
|
382 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO P2P Chunksize set to 524288
|
383 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
|
384 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/IPC/read
|
385 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
|
386 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
|
387 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read
|
388 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/IPC/read
|
389 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
|
390 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
|
391 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read
|
392 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/IPC/read
|
393 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
|
394 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
|
395 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read
|
396 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/IPC/read
|
397 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
|
398 |
-
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
|
399 |
-
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read
|
400 |
-
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/IPC/read
|
401 |
-
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Connected all rings
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Connected all rings
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Connected all rings
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Connected all rings
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/IPC/read
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO Connected all trees
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO Connected all trees
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO Connected all trees
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO Connected all trees
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
dlc10xm9l399lwkq-master-0:27:57 [1] NCCL INFO comm 0x8e64ad0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0xf279881ae65b16f2 - Init COMPLETE
dlc10xm9l399lwkq-master-0:26:56 [0] NCCL INFO comm 0x933e770 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0xf279881ae65b16f2 - Init COMPLETE
dlc10xm9l399lwkq-master-0:28:59 [2] NCCL INFO comm 0x754a850 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0xf279881ae65b16f2 - Init COMPLETE
dlc10xm9l399lwkq-master-0:29:58 [3] NCCL INFO comm 0x792e140 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0xf279881ae65b16f2 - Init COMPLETE
[rank0]:[W1231 09:49:44.279868977 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[rank2]:[W1231 09:49:44.279870560 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[rank3]:[W1231 09:49:44.280285689 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
[rank1]:[W1231 09:49:44.280413742 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
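The deprecation warnings above come from exporting NCCL_BLOCKING_WAIT in the environment; recent PyTorch expects TORCH_NCCL_BLOCKING_WAIT instead. The training entry point is not part of this diff, so the following is only a minimal sketch of the switch, assuming a torchrun-style 4-GPU launch like the one logged here.

```python
import os

import torch.distributed as dist

# Export the non-deprecated variable and drop the old name before the NCCL
# process group is created (the value "1" is an assumption, not a repo default).
os.environ.pop("NCCL_BLOCKING_WAIT", None)
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"

if __name__ == "__main__":
    # MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE are expected to be set by
    # the launcher (e.g. torchrun), as in the 4-rank run shown in this log.
    dist.init_process_group(backend="nccl")
```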
Resume training from epoch: 56
[rank1]:[W1231 09:51:26.949995860 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[rank2]:[W1231 09:51:26.954996096 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[rank0]:[W1231 09:51:26.955118250 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[rank3]:[W1231 09:51:26.957005274 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
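The reducer warning repeated by all four ranks indicates the model was wrapped with find_unused_parameters=True even though every parameter receives a gradient each step. The repository's real DDP setup is not shown in this diff; the sketch below, with a hypothetical wrap_for_ddp helper, illustrates the change the warning recommends.

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_for_ddp(model: torch.nn.Module, local_rank: int) -> DDP:
    """Wrap the separation network for multi-GPU training (illustrative helper).

    Since the forward pass leaves no parameters unused, find_unused_parameters
    can be False, skipping the extra autograd-graph traversal per iteration.
    """
    model = model.cuda(local_rank)
    return DDP(model, device_ids=[local_rank], find_unused_parameters=False)
```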
Train Summary | End of Epoch 56 | Time 8839.06s | Train Loss -16.312
Valid Summary | End of Epoch 56 | Time 1303.07s | Valid Loss -14.529
Test Summary | End of Epoch 56 | Time 781.34s | Test Loss -14.061
reload weights and optimizer from last best checkpoint
Learning rate adjusted to: 0.000128
Learning rate is: 0.000128
Train Summary | End of Epoch 57 | Time 8856.34s | Train Loss -16.327
Valid Summary | End of Epoch 57 | Time 1301.12s | Valid Loss -14.620
Test Summary | End of Epoch 57 | Time 781.09s | Test Loss -14.115
Learning rate is: 0.000128
Found new best model, dict saved
Train Summary | End of Epoch 58 | Time 8854.23s | Train Loss -16.346
Valid Summary | End of Epoch 58 | Time 1301.64s | Valid Loss -14.590
Test Summary | End of Epoch 58 | Time 780.97s | Test Loss -14.113
Learning rate is: 0.000128
Train Summary | End of Epoch 59 | Time 8854.80s | Train Loss -16.353
Valid Summary | End of Epoch 59 | Time 1301.66s | Valid Loss -14.589
Test Summary | End of Epoch 59 | Time 781.22s | Test Loss -14.091
Learning rate is: 0.000128
Train Summary | End of Epoch 60 | Time 8863.27s | Train Loss -16.371
Valid Summary | End of Epoch 60 | Time 1301.48s | Valid Loss -14.578
Test Summary | End of Epoch 60 | Time 781.07s | Test Loss -14.179
Learning rate is: 0.000128
Train Summary | End of Epoch 61 | Time 8878.44s | Train Loss -16.381
Valid Summary | End of Epoch 61 | Time 1301.09s | Valid Loss -14.561
Test Summary | End of Epoch 61 | Time 781.19s | Test Loss -14.117
Learning rate is: 0.000128
Train Summary | End of Epoch 62 | Time 8874.16s | Train Loss -16.381
Valid Summary | End of Epoch 62 | Time 1301.83s | Valid Loss -14.627
Test Summary | End of Epoch 62 | Time 781.20s | Test Loss -14.076
Learning rate is: 0.000128
Found new best model, dict saved
Train Summary | End of Epoch 63 | Time 8872.86s | Train Loss -16.386
Valid Summary | End of Epoch 63 | Time 1301.73s | Valid Loss -14.573
Test Summary | End of Epoch 63 | Time 781.22s | Test Loss -14.034
Learning rate is: 0.000128
Train Summary | End of Epoch 64 | Time 8872.70s | Train Loss -16.407
Valid Summary | End of Epoch 64 | Time 1302.35s | Valid Loss -14.552
Test Summary | End of Epoch 64 | Time 781.45s | Test Loss -14.107
Learning rate is: 0.000128
Train Summary | End of Epoch 65 | Time 8871.09s | Train Loss -16.411
Valid Summary | End of Epoch 65 | Time 1301.10s | Valid Loss -14.493
Test Summary | End of Epoch 65 | Time 781.20s | Test Loss -14.088
Learning rate is: 0.000128
Train Summary | End of Epoch 66 | Time 8868.35s | Train Loss -16.424
Valid Summary | End of Epoch 66 | Time 1301.48s | Valid Loss -14.576
Test Summary | End of Epoch 66 | Time 780.88s | Test Loss -14.107
Learning rate is: 0.000128
Train Summary | End of Epoch 67 | Time 8872.21s | Train Loss -16.428
Valid Summary | End of Epoch 67 | Time 1302.63s | Valid Loss -14.600
Test Summary | End of Epoch 67 | Time 780.67s | Test Loss -14.124
reload weights and optimizer from last best checkpoint
Learning rate adjusted to: 0.000064
Learning rate is: 0.000064
Train Summary | End of Epoch 68 | Time 8866.46s | Train Loss -16.420
Valid Summary | End of Epoch 68 | Time 1301.06s | Valid Loss -14.629
Test Summary | End of Epoch 68 | Time 781.03s | Test Loss -14.045
Learning rate is: 0.000064
Found new best model, dict saved
Train Summary | End of Epoch 69 | Time 8869.51s | Train Loss -16.423
Valid Summary | End of Epoch 69 | Time 1301.42s | Valid Loss -14.610
Test Summary | End of Epoch 69 | Time 781.07s | Test Loss -14.122
Learning rate is: 0.000064
Train Summary | End of Epoch 70 | Time 8870.52s | Train Loss -16.439
Valid Summary | End of Epoch 70 | Time 1301.36s | Valid Loss -14.595
Test Summary | End of Epoch 70 | Time 781.50s | Test Loss -14.017
Learning rate is: 0.000064
Train Summary | End of Epoch 71 | Time 8872.92s | Train Loss -16.443
Valid Summary | End of Epoch 71 | Time 1302.03s | Valid Loss -14.576
Test Summary | End of Epoch 71 | Time 781.03s | Test Loss -14.149
Learning rate is: 0.000064
Train Summary | End of Epoch 72 | Time 8868.30s | Train Loss -16.452
Valid Summary | End of Epoch 72 | Time 1300.98s | Valid Loss -14.580
Test Summary | End of Epoch 72 | Time 780.55s | Test Loss -14.096
Learning rate is: 0.000064
Train Summary | End of Epoch 73 | Time 8867.89s | Train Loss -16.449
Valid Summary | End of Epoch 73 | Time 1301.70s | Valid Loss -14.556
Test Summary | End of Epoch 73 | Time 780.97s | Test Loss -14.083
reload weights and optimizer from last best checkpoint
Learning rate adjusted to: 0.000032
Learning rate is: 0.000032
Train Summary | End of Epoch 74 | Time 8867.13s | Train Loss -16.444
Valid Summary | End of Epoch 74 | Time 1301.02s | Valid Loss -14.603
Test Summary | End of Epoch 74 | Time 781.03s | Test Loss -14.174
Learning rate is: 0.000032
Train Summary | End of Epoch 75 | Time 8848.91s | Train Loss -16.446
Valid Summary | End of Epoch 75 | Time 1302.16s | Valid Loss -14.586
Test Summary | End of Epoch 75 | Time 781.19s | Test Loss -14.069
Learning rate is: 0.000032
Train Summary | End of Epoch 76 | Time 8854.80s | Train Loss -16.460
Valid Summary | End of Epoch 76 | Time 1300.81s | Valid Loss -14.595
Test Summary | End of Epoch 76 | Time 781.01s | Test Loss -14.063
Learning rate is: 0.000032
Train Summary | End of Epoch 77 | Time 8863.14s | Train Loss -16.455
Valid Summary | End of Epoch 77 | Time 1300.81s | Valid Loss -14.602
Test Summary | End of Epoch 77 | Time 780.15s | Test Loss -14.056
Learning rate is: 0.000032
Train Summary | End of Epoch 78 | Time 8867.28s | Train Loss -16.456
Valid Summary | End of Epoch 78 | Time 1300.89s | Valid Loss -14.569
Test Summary | End of Epoch 78 | Time 780.83s | Test Loss -14.124
No improvement for 10 epochs, early stopping.
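The epoch summaries above follow a plateau schedule: after a stretch of epochs without a new best validation loss, the solver reloads the last best checkpoint and halves the learning rate (0.000128 to 0.000064 to 0.000032), and training stops after 10 epochs without improvement. The repository's actual solver code is not part of this diff; the sketch below reproduces that behaviour under the assumption, inferred from the log, of halving every 5 non-improving epochs with a patience of 10.

```python
import torch


def end_of_epoch_update(model, optimizer, val_loss, state,
                        halve_every=5, patience=10,
                        ckpt_path="last_best_checkpoint.pt"):
    """Plateau handling consistent with the logged run (illustrative only).

    `state` is a dict carrying "best_val_loss" and "epochs_since_best" across epochs.
    Returns "continue" or "stop".
    """
    if val_loss < state.get("best_val_loss", float("inf")):
        # New best: reset the counter and snapshot model + optimizer.
        state["best_val_loss"] = val_loss
        state["epochs_since_best"] = 0
        torch.save({"model": model.state_dict(),
                    "optimizer": optimizer.state_dict()}, ckpt_path)
        print("Found new best model, dict saved")
        return "continue"

    state["epochs_since_best"] = state.get("epochs_since_best", 0) + 1
    if state["epochs_since_best"] >= patience:
        print(f"No improvement for {patience} epochs, early stopping.")
        return "stop"

    if state["epochs_since_best"] % halve_every == 0:
        # Reload weights and optimizer from the last best checkpoint, then halve the LR.
        ckpt = torch.load(ckpt_path)
        model.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
        for group in optimizer.param_groups:
            group["lr"] *= 0.5
        print(f"Learning rate adjusted to: {optimizer.param_groups[0]['lr']:.6f}")
    return "continue"
```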
Start evaluation
Avg SISNRi: tensor([13.8622], device='cuda:0')
Avg SNRi: 14.180312131890753
Avg PESQi: 1.4813443135023117
Avg STOIi: 0.27852386519253225
# 2spk
Avg SISNRi: tensor([14.4913], device='cuda:0')
Avg SNRi: 14.766350478813006
Avg PESQi: 1.566092278043429
Avg STOIi: 0.2884288031001122


# lrs2
# 1spk
Avg SISNRi: tensor([14.3695], device='cuda:0')
Avg SNRi: 14.712913455144225
Avg PESQi: 1.4528894378741581
Avg STOIi: 0.27694796861127857
# 2spk
Avg SISNRi: tensor([15.0343], device='cuda:0')
Avg SNRi: 15.362836239165572
Avg PESQi: 1.5522074782848359
Avg STOIi: 0.28548176812938764


# lrs3
# 1spk
Avg SISNRi: tensor([16.2440], device='cuda:0')
Avg SNRi: 16.48747834483003
Avg PESQi: 1.7686368883450827
Avg STOIi: 0.2731847089622733
# 2spk
Avg SISNRi: tensor([16.9063], device='cuda:0')
Avg SNRi: 17.114254503063478
Avg PESQi: 1.8624962186813354
Avg STOIi: 0.279682808481817
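For reference, the SI-SNRi figures above are improvements of the separated signal over the unprocessed mixture. The sketch below shows the standard scale-invariant SNR computation; the repository's own metric code is not in this diff and may differ in details such as zero-mean handling.

```python
import torch


def si_snr(est: torch.Tensor, ref: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """Scale-invariant SNR in dB for (..., time) tensors (standard definition)."""
    est = est - est.mean(dim=-1, keepdim=True)
    ref = ref - ref.mean(dim=-1, keepdim=True)
    # Project the estimate onto the reference to obtain the scaled target.
    scale = (est * ref).sum(dim=-1, keepdim=True) / (ref.pow(2).sum(dim=-1, keepdim=True) + eps)
    target = scale * ref
    noise = est - target
    return 10 * torch.log10(target.pow(2).sum(dim=-1) / (noise.pow(2).sum(dim=-1) + eps) + eps)


def si_snr_improvement(est: torch.Tensor, ref: torch.Tensor, mix: torch.Tensor) -> torch.Tensor:
    """SI-SNRi: gain of the separated estimate over the raw mixture."""
    return si_snr(est, ref) - si_snr(mix, ref)
```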
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1734940287.dlcdanw1zq2cucwx-master-0.28.0
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e29766adb4a58b0df338860869285d8dbc93e0fa0c642ade14dd326772d9706
size 8228
checkpoints/log_VoxCeleb2_tfgridnet-isam_2spk/tensorboard/events.out.tfevents.1735609783.dlc10xm9l399lwkq-master-0.26.0
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8e7ae671ffc26ca42f44f0ce092ff6f6fb160e081580b2a680bfbe4e5761751f
size 3344