Upload 6 files
Browse files- checkpoints/.DS_Store +0 -0
- checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /config.yaml +59 -0
- checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /last_best_checkpoint.pt +3 -0
- checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /last_checkpoint.pt +3 -0
- checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /log_2024-12-23(15:50:02).txt +771 -0
- checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /tensorboard/events.out.tfevents.1734940287.dlchqhe6f3ef1ed0-master-0.28.0 +3 -0
checkpoints/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /config.yaml
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Config file
|
2 |
+
|
3 |
+
# Log
|
4 |
+
seed: 777
|
5 |
+
use_cuda: 1 # 1 for True, 0 for False
|
6 |
+
|
7 |
+
# dataset
|
8 |
+
speaker_no: 3
|
9 |
+
mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_3mix.csv
|
10 |
+
audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
|
11 |
+
reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
|
12 |
+
# mix_lst_path: ./data/LRS2/mixture_data_list_3mix.csv
|
13 |
+
# audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/audio_clean/
|
14 |
+
# reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS2/mvlrs_v1/
|
15 |
+
# mix_lst_path: ./data/LRS3/mixture_data_list_3mix.csv
|
16 |
+
# audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/audio_clean/
|
17 |
+
# reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/LRS3/orig/
|
18 |
+
audio_sr: 16000
|
19 |
+
ref_sr: 25
|
20 |
+
|
21 |
+
# dataloader
|
22 |
+
num_workers: 2
|
23 |
+
batch_size: 1 # 4-GPU training with a total effective batch size of 8
|
24 |
+
accu_grad: 1
|
25 |
+
effec_batch_size: 2 # per GPU; only used if accu_grad is set to 1; must be an integer multiple of batch_size
|
26 |
+
max_length: 3 # truncate the utterances in dataloader, in seconds
|
27 |
+
|
28 |
+
# network settings
|
29 |
+
init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_3spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
|
30 |
+
causal: 0 # 1 for True, 0 for False
|
31 |
+
network_reference:
|
32 |
+
cue: lip # lip or speech or gesture or EEG
|
33 |
+
backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
|
34 |
+
emb_size: 256 # resnet18:256
|
35 |
+
network_audio:
|
36 |
+
backbone: av_tfgridnet_isam
|
37 |
+
n_fft: 256
|
38 |
+
stride: 128
|
39 |
+
window: "hann"
|
40 |
+
use_builtin_complex: False
|
41 |
+
n_srcs: 1
|
42 |
+
n_imics: 1
|
43 |
+
n_layers: 6
|
44 |
+
lstm_hidden_units: 192
|
45 |
+
attn_n_head: 4
|
46 |
+
attn_qk_output_channel: 4
|
47 |
+
emb_dim: 48
|
48 |
+
emb_ks: 4
|
49 |
+
emb_hs: 1
|
50 |
+
activation: "prelu"
|
51 |
+
isam: 1
|
52 |
+
|
53 |
+
# optimizer
|
54 |
+
spk_att_dropout: 1 # set to 0 to always use speaker attention
|
55 |
+
loss_type: ss_sisdr # e.g. "snr", "sisdr", "hybrid", "ss_sisdr"
|
56 |
+
lr_warmup: 1
|
57 |
+
init_learning_rate: 0.0005
|
58 |
+
max_epoch: 150
|
59 |
+
clip_grad_norm: 5
|
checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /last_best_checkpoint.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4351584fa30020fbda626f9f63c859a35bfc1c663509e721fb8ff46d65cf6c79
|
3 |
+
size 162585486
|
checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /last_checkpoint.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90f3c8209585d03d8ef148cbff650530144f5e4b844dd557fb7c8684ed520646
|
3 |
+
size 162576686
|
checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /log_2024-12-23(15:50:02).txt
ADDED
@@ -0,0 +1,771 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Config file
|
2 |
+
|
3 |
+
# Log
|
4 |
+
seed: 777
|
5 |
+
use_cuda: 1 # 1 for True, 0 for False
|
6 |
+
|
7 |
+
# dataset
|
8 |
+
speaker_no: 3
|
9 |
+
mix_lst_path: ./data/VoxCeleb2_non_repeat/mixture_data_list_3mix.csv
|
10 |
+
audio_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/
|
11 |
+
reference_direc: /mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/
|
12 |
+
audio_sr: 16000
|
13 |
+
ref_sr: 25
|
14 |
+
|
15 |
+
# dataloader
|
16 |
+
num_workers: 2
|
17 |
+
batch_size: 1 # 4-GPU training with a total effective batch size of 8
|
18 |
+
accu_grad: 1
|
19 |
+
effec_batch_size: 2 # per GPU, only used if accu_grad is set to 1, must be multiple times of batch size
|
20 |
+
max_length: 3 # truncate the utterances in dataloader, in seconds
|
21 |
+
|
22 |
+
# network settings
|
23 |
+
init_from: 'checkpoints/log_VoxCeleb2_lip_tfgridnet_3spk' # 'None' or a log name 'log_2024-07-22(18:12:13)'
|
24 |
+
causal: 0 # 1 for True, 0 for False
|
25 |
+
network_reference:
|
26 |
+
cue: lip # lip or speech or gesture or EEG
|
27 |
+
backbone: resnet18 # resnet18 or shufflenetV2 or blazenet64
|
28 |
+
emb_size: 256 # resnet18:256
|
29 |
+
network_audio:
|
30 |
+
backbone: av_tfgridnet_att_ss
|
31 |
+
n_fft: 256
|
32 |
+
stride: 128
|
33 |
+
window: "hann"
|
34 |
+
use_builtin_complex: False
|
35 |
+
n_srcs: 1
|
36 |
+
n_imics: 1
|
37 |
+
n_layers: 6
|
38 |
+
lstm_hidden_units: 192
|
39 |
+
attn_n_head: 4
|
40 |
+
attn_qk_output_channel: 4
|
41 |
+
emb_dim: 48
|
42 |
+
emb_ks: 4
|
43 |
+
emb_hs: 1
|
44 |
+
activation: "prelu"
|
45 |
+
|
46 |
+
# optimizer
|
47 |
+
spk_att_dropout: 1 # 0 for always use speaker attention
|
48 |
+
loss_type: ss_sisdr # "snr", "sisdr", "hybrid"
|
49 |
+
lr_warmup: 1
|
50 |
+
init_learning_rate: 0.0005
|
51 |
+
max_epoch: 150
|
52 |
+
clip_grad_norm: 5
|
53 |
+
W1223 15:50:36.042814 140017896986432 torch/distributed/run.py:779]
|
54 |
+
W1223 15:50:36.042814 140017896986432 torch/distributed/run.py:779] *****************************************
|
55 |
+
W1223 15:50:36.042814 140017896986432 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
56 |
+
W1223 15:50:36.042814 140017896986432 torch/distributed/run.py:779] *****************************************
|
57 |
+
[W1223 15:50:57.603307732 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
58 |
+
[W1223 15:50:57.603314732 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
59 |
+
[W1223 15:50:57.604096578 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
60 |
+
[W1223 15:50:57.603320695 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
61 |
+
[W1223 15:50:57.604115033 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
62 |
+
[W1223 15:50:57.604131007 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
63 |
+
[W1223 15:50:57.603696860 Utils.hpp:164] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
64 |
+
[W1223 15:50:57.604146436 Utils.hpp:135] Warning: Environment variable NCCL_ASYNC_ERROR_HANDLING is deprecated; use TORCH_NCCL_ASYNC_ERROR_HANDLING instead (function operator())
|
65 |
+
started on checkpoints/log_2024-12-23(15:50:02)
|
66 |
+
|
67 |
+
namespace(accu_grad=1, audio_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/audio_clean/', audio_sr=16000, batch_size=1, causal=0, checkpoint_dir='checkpoints/log_2024-12-23(15:50:02)', clip_grad_norm=5.0, config=[<yamlargparse.Path object at 0x7fefcdc30e80>], device=device(type='cuda'), distributed=True, effec_batch_size=2, evaluate_only=0, init_from='checkpoints/log_VoxCeleb2_lip_tfgridnet_3spk', init_learning_rate=0.0005, local_rank=0, loss_type='ss_sisdr', lr_warmup=1, max_epoch=150, max_length=3, mix_lst_path='./data/VoxCeleb2_non_repeat/mixture_data_list_3mix.csv', network_audio=namespace(activation='prelu', attn_n_head=4, attn_qk_output_channel=4, backbone='av_tfgridnet_att_ss', emb_dim=48, emb_hs=1, emb_ks=4, lstm_hidden_units=192, n_fft=256, n_imics=1, n_layers=6, n_srcs=1, stride=128, use_builtin_complex=False, window='hann'), network_reference=namespace(backbone='resnet18', cue='lip', emb_size=256), num_workers=2, ref_sr=25, reference_direc='/mnt/nas_sg/wulanchabu/zexu.pan/datasets/VoxCeleb2/orig/', seed=777, speaker_no=3, spk_att_dropout=1, train_from_last_checkpoint=0, use_cuda=1, world_size=4)
|
68 |
+
network_wrapper(
|
69 |
+
(sep_network): av_TFGridNetV3_att_ss(
|
70 |
+
(enc): STFTEncoder(
|
71 |
+
(stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
|
72 |
+
)
|
73 |
+
(dec): STFTDecoder(
|
74 |
+
(stft): Stft(n_fft=256, win_length=256, hop_length=128, center=True, normalized=False, onesided=True)
|
75 |
+
)
|
76 |
+
(conv): Sequential(
|
77 |
+
(0): Conv2d(2, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
78 |
+
(1): GroupNorm(1, 48, eps=1e-05, affine=True)
|
79 |
+
)
|
80 |
+
(blocks): ModuleList(
|
81 |
+
(0-5): 6 x GridNetV3Block(
|
82 |
+
(intra_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
83 |
+
(intra_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
|
84 |
+
(intra_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
|
85 |
+
(inter_norm): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
86 |
+
(inter_rnn): LSTM(192, 192, batch_first=True, bidirectional=True)
|
87 |
+
(inter_linear): ConvTranspose1d(384, 48, kernel_size=(4,), stride=(1,))
|
88 |
+
(attn_conv_Q): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
|
89 |
+
(attn_norm_Q): AllHeadPReLULayerNormalization4DC(
|
90 |
+
(act): PReLU(num_parameters=4)
|
91 |
+
)
|
92 |
+
(attn_conv_K): Conv2d(48, 16, kernel_size=(1, 1), stride=(1, 1))
|
93 |
+
(attn_norm_K): AllHeadPReLULayerNormalization4DC(
|
94 |
+
(act): PReLU(num_parameters=4)
|
95 |
+
)
|
96 |
+
(attn_conv_V): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
|
97 |
+
(attn_norm_V): AllHeadPReLULayerNormalization4DC(
|
98 |
+
(act): PReLU(num_parameters=4)
|
99 |
+
)
|
100 |
+
(attn_concat_proj): Sequential(
|
101 |
+
(0): Conv2d(48, 48, kernel_size=(1, 1), stride=(1, 1))
|
102 |
+
(1): PReLU(num_parameters=1)
|
103 |
+
(2): LayerNormalization()
|
104 |
+
)
|
105 |
+
(spk_att): TransformerEncoder(
|
106 |
+
(layers): ModuleList(
|
107 |
+
(0): TransformerEncoderLayer(
|
108 |
+
(self_attn): MultiheadAttention(
|
109 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
|
110 |
+
)
|
111 |
+
(linear1): Linear(in_features=48, out_features=192, bias=True)
|
112 |
+
(dropout): Dropout(p=0.1, inplace=False)
|
113 |
+
(linear2): Linear(in_features=192, out_features=48, bias=True)
|
114 |
+
(norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
115 |
+
(norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
|
116 |
+
(dropout1): Dropout(p=0.1, inplace=False)
|
117 |
+
(dropout2): Dropout(p=0.1, inplace=False)
|
118 |
+
)
|
119 |
+
)
|
120 |
+
)
|
121 |
+
(spk_norm): GroupNorm(1, 48, eps=1e-08, affine=True)
|
122 |
+
)
|
123 |
+
)
|
124 |
+
(deconv): ConvTranspose2d(48, 2, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
|
125 |
+
(av_conv): ModuleList(
|
126 |
+
(0-5): 6 x Linear(in_features=304, out_features=48, bias=True)
|
127 |
+
)
|
128 |
+
)
|
129 |
+
(ref_encoder): Visual_encoder(
|
130 |
+
(v_frontend): VisualFrontend(
|
131 |
+
(frontend3D): Sequential(
|
132 |
+
(0): Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), bias=False)
|
133 |
+
(1): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
134 |
+
(2): ReLU()
|
135 |
+
(3): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
|
136 |
+
)
|
137 |
+
(resnet): ResNet(
|
138 |
+
(layer1): ResNetLayer(
|
139 |
+
(conv1a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
140 |
+
(bn1a): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
141 |
+
(conv2a): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
142 |
+
(downsample): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
|
143 |
+
(outbna): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
144 |
+
(conv1b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
145 |
+
(bn1b): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
146 |
+
(conv2b): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
147 |
+
(outbnb): SyncBatchNorm(64, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
148 |
+
)
|
149 |
+
(layer2): ResNetLayer(
|
150 |
+
(conv1a): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
151 |
+
(bn1a): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
152 |
+
(conv2a): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
153 |
+
(downsample): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
154 |
+
(outbna): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
155 |
+
(conv1b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
156 |
+
(bn1b): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
157 |
+
(conv2b): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
158 |
+
(outbnb): SyncBatchNorm(128, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
159 |
+
)
|
160 |
+
(layer3): ResNetLayer(
|
161 |
+
(conv1a): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
162 |
+
(bn1a): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
163 |
+
(conv2a): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
164 |
+
(downsample): Conv2d(128, 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
165 |
+
(outbna): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
166 |
+
(conv1b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
167 |
+
(bn1b): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
168 |
+
(conv2b): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
169 |
+
(outbnb): SyncBatchNorm(256, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
170 |
+
)
|
171 |
+
(layer4): ResNetLayer(
|
172 |
+
(conv1a): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
|
173 |
+
(bn1a): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
174 |
+
(conv2a): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
175 |
+
(downsample): Conv2d(256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
|
176 |
+
(outbna): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
177 |
+
(conv1b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
178 |
+
(bn1b): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
179 |
+
(conv2b): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
|
180 |
+
(outbnb): SyncBatchNorm(512, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
|
181 |
+
)
|
182 |
+
(avgpool): AvgPool2d(kernel_size=(4, 4), stride=(1, 1), padding=0)
|
183 |
+
)
|
184 |
+
)
|
185 |
+
(v_ds): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
186 |
+
(visual_conv): Sequential(
|
187 |
+
(0): VisualConv1D(
|
188 |
+
(relu_0): ReLU()
|
189 |
+
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
190 |
+
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
191 |
+
(relu): ReLU()
|
192 |
+
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
193 |
+
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
194 |
+
(prelu): PReLU(num_parameters=1)
|
195 |
+
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
196 |
+
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
197 |
+
)
|
198 |
+
(1): VisualConv1D(
|
199 |
+
(relu_0): ReLU()
|
200 |
+
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
201 |
+
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
202 |
+
(relu): ReLU()
|
203 |
+
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
204 |
+
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
205 |
+
(prelu): PReLU(num_parameters=1)
|
206 |
+
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
207 |
+
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
208 |
+
)
|
209 |
+
(2): VisualConv1D(
|
210 |
+
(relu_0): ReLU()
|
211 |
+
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
212 |
+
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
213 |
+
(relu): ReLU()
|
214 |
+
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
215 |
+
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
216 |
+
(prelu): PReLU(num_parameters=1)
|
217 |
+
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
218 |
+
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
219 |
+
)
|
220 |
+
(3): VisualConv1D(
|
221 |
+
(relu_0): ReLU()
|
222 |
+
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
223 |
+
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
224 |
+
(relu): ReLU()
|
225 |
+
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
226 |
+
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
227 |
+
(prelu): PReLU(num_parameters=1)
|
228 |
+
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
229 |
+
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
230 |
+
)
|
231 |
+
(4): VisualConv1D(
|
232 |
+
(relu_0): ReLU()
|
233 |
+
(norm_0): SyncBatchNorm(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
234 |
+
(conv1x1): Conv1d(256, 512, kernel_size=(1,), stride=(1,), bias=False)
|
235 |
+
(relu): ReLU()
|
236 |
+
(norm_1): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
237 |
+
(dsconv): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,), groups=512)
|
238 |
+
(prelu): PReLU(num_parameters=1)
|
239 |
+
(norm_2): SyncBatchNorm(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
|
240 |
+
(pw_conv): Conv1d(512, 256, kernel_size=(1,), stride=(1,), bias=False)
|
241 |
+
)
|
242 |
+
)
|
243 |
+
)
|
244 |
+
)
|
245 |
+
|
246 |
+
Total number of parameters: 20950309
|
247 |
+
|
248 |
+
|
249 |
+
Total number of trainable parameters: 9765221
|
250 |
+
|
251 |
+
dlchqhe6f3ef1ed0-master-0:28:28 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
252 |
+
dlchqhe6f3ef1ed0-master-0:28:28 [0] NCCL INFO Bootstrap : Using eth0:22.5.232.161<0>
|
253 |
+
dlchqhe6f3ef1ed0-master-0:28:28 [0] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
254 |
+
dlchqhe6f3ef1ed0-master-0:28:28 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
255 |
+
dlchqhe6f3ef1ed0-master-0:28:28 [0] NCCL INFO cudaDriverVersion 11040
|
256 |
+
dlchqhe6f3ef1ed0-master-0:31:31 [3] NCCL INFO cudaDriverVersion 11040
|
257 |
+
dlchqhe6f3ef1ed0-master-0:30:30 [2] NCCL INFO cudaDriverVersion 11040
|
258 |
+
dlchqhe6f3ef1ed0-master-0:29:29 [1] NCCL INFO cudaDriverVersion 11040
|
259 |
+
NCCL version 2.20.5+cuda11.8
|
260 |
+
dlchqhe6f3ef1ed0-master-0:29:29 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
261 |
+
dlchqhe6f3ef1ed0-master-0:31:31 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
262 |
+
dlchqhe6f3ef1ed0-master-0:30:30 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
263 |
+
dlchqhe6f3ef1ed0-master-0:29:29 [1] NCCL INFO Bootstrap : Using eth0:22.5.232.161<0>
|
264 |
+
dlchqhe6f3ef1ed0-master-0:30:30 [2] NCCL INFO Bootstrap : Using eth0:22.5.232.161<0>
|
265 |
+
dlchqhe6f3ef1ed0-master-0:31:31 [3] NCCL INFO Bootstrap : Using eth0:22.5.232.161<0>
|
266 |
+
dlchqhe6f3ef1ed0-master-0:29:29 [1] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
267 |
+
dlchqhe6f3ef1ed0-master-0:30:30 [2] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
268 |
+
dlchqhe6f3ef1ed0-master-0:31:31 [3] NCCL INFO Plugin name set by env to libnccl-net-none.so
|
269 |
+
dlchqhe6f3ef1ed0-master-0:30:30 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
270 |
+
dlchqhe6f3ef1ed0-master-0:31:31 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
271 |
+
dlchqhe6f3ef1ed0-master-0:29:29 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net-none.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net-none.so), using internal implementation
|
272 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
273 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
274 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
275 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth
|
276 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO NCCL_IB_HCA set to mlx5
|
277 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO NCCL_IB_HCA set to mlx5
|
278 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO NCCL_IB_HCA set to mlx5
|
279 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO NCCL_IB_HCA set to mlx5
|
280 |
+
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
281 |
+
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
282 |
+
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
283 |
+
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav25.so': libhfi1verbs-rdmav25.so: cannot open shared object file: No such file or directory
|
284 |
+
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
285 |
+
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
286 |
+
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
287 |
+
libibverbs: Warning: couldn't load driver 'librxe-rdmav25.so': librxe-rdmav25.so: cannot open shared object file: No such file or directory
|
288 |
+
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
289 |
+
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
290 |
+
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
291 |
+
libibverbs: Warning: couldn't load driver 'libmthca-rdmav25.so': libmthca-rdmav25.so: cannot open shared object file: No such file or directory
|
292 |
+
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
293 |
+
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
294 |
+
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
295 |
+
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav25.so': libvmw_pvrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
296 |
+
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
297 |
+
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
298 |
+
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
299 |
+
libibverbs: Warning: couldn't load driver 'libhns-rdmav25.so': libhns-rdmav25.so: cannot open shared object file: No such file or directory
|
300 |
+
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
301 |
+
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
302 |
+
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
303 |
+
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav25.so': libipathverbs-rdmav25.so: cannot open shared object file: No such file or directory
|
304 |
+
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
305 |
+
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
306 |
+
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
307 |
+
libibverbs: Warning: couldn't load driver 'libsiw-rdmav25.so': libsiw-rdmav25.so: cannot open shared object file: No such file or directory
|
308 |
+
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
309 |
+
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
310 |
+
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
311 |
+
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav25.so': libbnxt_re-rdmav25.so: cannot open shared object file: No such file or directory
|
312 |
+
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
313 |
+
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
314 |
+
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
315 |
+
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav25.so': libocrdma-rdmav25.so: cannot open shared object file: No such file or directory
|
316 |
+
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
317 |
+
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
318 |
+
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
319 |
+
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav25.so': libmlx4-rdmav25.so: cannot open shared object file: No such file or directory
|
320 |
+
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
321 |
+
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
322 |
+
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
323 |
+
libibverbs: Warning: couldn't load driver 'libqedr-rdmav25.so': libqedr-rdmav25.so: cannot open shared object file: No such file or directory
|
324 |
+
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
325 |
+
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
326 |
+
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
327 |
+
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav25.so': libcxgb4-rdmav25.so: cannot open shared object file: No such file or directory
|
328 |
+
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
329 |
+
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
330 |
+
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
331 |
+
libibverbs: Warning: couldn't load driver 'libi40iw-rdmav25.so': libi40iw-rdmav25.so: cannot open shared object file: No such file or directory
|
332 |
+
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
333 |
+
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
334 |
+
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
335 |
+
libibverbs: Warning: couldn't load driver 'libefa-rdmav25.so': libefa-rdmav25.so: cannot open shared object file: No such file or directory
|
336 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [RO]; OOB eth0:22.5.232.161<0>
|
337 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Using non-device net plugin version 0
|
338 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Using network IB
|
339 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [RO]; OOB eth0:22.5.232.161<0>
|
340 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [RO]; OOB eth0:22.5.232.161<0>
|
341 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Using non-device net plugin version 0
|
342 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Using network IB
|
343 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Using non-device net plugin version 0
|
344 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Using network IB
|
345 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [1]mlx5_1:1/RoCE [RO]; OOB eth0:22.5.232.161<0>
|
346 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Using non-device net plugin version 0
|
347 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Using network IB
|
348 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO comm 0x8620bf0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0x7a7db4e36deb53ea - Init START
|
349 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO comm 0x86c7e10 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0x7a7db4e36deb53ea - Init START
|
350 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO comm 0x9352850 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0x7a7db4e36deb53ea - Init START
|
351 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO comm 0x81df340 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0x7a7db4e36deb53ea - Init START
|
352 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Setting affinity for GPU 3 to ffffff
|
353 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
|
354 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
|
355 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
|
356 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO comm 0x81df340 rank 0 nRanks 4 nNodes 1 localRanks 4 localRank 0 MNNVL 0
|
357 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO comm 0x8620bf0 rank 3 nRanks 4 nNodes 1 localRanks 4 localRank 3 MNNVL 0
|
358 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
359 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO comm 0x9352850 rank 1 nRanks 4 nNodes 1 localRanks 4 localRank 1 MNNVL 0
|
360 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 00/12 : 0 1 2 3
|
361 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO comm 0x86c7e10 rank 2 nRanks 4 nNodes 1 localRanks 4 localRank 2 MNNVL 0
|
362 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
363 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 01/12 : 0 1 2 3
|
364 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
365 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 02/12 : 0 1 2 3
|
366 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 03/12 : 0 1 2 3
|
367 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO NCCL_MIN_NCHANNELS set by environment to 4.
|
368 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 04/12 : 0 1 2 3
|
369 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 05/12 : 0 1 2 3
|
370 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 06/12 : 0 1 2 3
|
371 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 07/12 : 0 1 2 3
|
372 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 08/12 : 0 1 2 3
|
373 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 [2] -1/-1/-1->3->2 [3] -1/-1/-1->3->2 [4] -1/-1/-1->3->2 [5] -1/-1/-1->3->2 [6] -1/-1/-1->3->2 [7] -1/-1/-1->3->2 [8] -1/-1/-1->3->2 [9] -1/-1/-1->3->2 [10] -1/-1/-1->3->2 [11] -1/-1/-1->3->2
|
374 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 09/12 : 0 1 2 3
|
375 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 10/12 : 0 1 2 3
|
376 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO P2P Chunksize set to 524288
|
377 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 11/12 : 0 1 2 3
|
378 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0
|
379 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO P2P Chunksize set to 524288
|
380 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1
|
381 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1
|
382 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO P2P Chunksize set to 524288
|
383 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO P2P Chunksize set to 524288
|
384 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
|
385 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 00/0 : 3[3] -> 0[0] via P2P/IPC/read
|
386 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
|
387 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/IPC/read
|
388 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 01/0 : 3[3] -> 0[0] via P2P/IPC/read
|
389 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
|
390 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/IPC/read
|
391 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 02/0 : 3[3] -> 0[0] via P2P/IPC/read
|
392 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
|
393 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/IPC/read
|
394 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 03/0 : 3[3] -> 0[0] via P2P/IPC/read
|
395 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
|
396 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
|
397 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/IPC/read
|
398 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 04/0 : 3[3] -> 0[0] via P2P/IPC/read
|
399 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
|
400 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
|
401 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/IPC/read
|
402 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 05/0 : 3[3] -> 0[0] via P2P/IPC/read
|
403 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
|
404 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
|
405 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/IPC/read
|
406 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 06/0 : 3[3] -> 0[0] via P2P/IPC/read
|
407 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
|
408 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
|
409 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/IPC/read
|
410 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 07/0 : 3[3] -> 0[0] via P2P/IPC/read
|
411 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
|
412 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
|
413 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/IPC/read
|
414 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 08/0 : 3[3] -> 0[0] via P2P/IPC/read
|
415 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
|
416 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
|
417 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/IPC/read
|
418 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 09/0 : 3[3] -> 0[0] via P2P/IPC/read
|
419 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
|
420 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
|
421 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/IPC/read
|
422 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 10/0 : 3[3] -> 0[0] via P2P/IPC/read
|
423 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
|
424 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
|
425 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/IPC/read
|
426 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 11/0 : 3[3] -> 0[0] via P2P/IPC/read
|
427 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
|
428 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/IPC/read
|
429 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
|
430 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
|
431 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
|
432 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Connected all rings
|
433 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/IPC/read
|
434 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Connected all rings
|
435 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Connected all rings
|
436 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Connected all rings
|
437 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/IPC/read
|
438 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/IPC/read
|
439 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/IPC/read
|
440 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/IPC/read
|
441 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/IPC/read
|
442 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
|
443 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/IPC/read
|
444 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/IPC/read
|
445 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/IPC/read
|
446 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/IPC/read
|
447 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/IPC/read
|
448 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC/read
|
449 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC/read
|
450 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC/read
|
451 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC/read
|
452 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/IPC/read
|
453 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/IPC/read
|
454 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/IPC/read
|
455 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/IPC/read
|
456 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/IPC/read
|
457 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/IPC/read
|
458 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/IPC/read
|
459 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/IPC/read
|
460 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
|
461 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/IPC/read
|
462 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/IPC/read
|
463 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/IPC/read
|
464 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/IPC/read
|
465 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/IPC/read
|
466 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/IPC/read
|
467 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/IPC/read
|
468 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/IPC/read
|
469 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/IPC/read
|
470 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/IPC/read
|
471 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/IPC/read
|
472 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO Connected all trees
|
473 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
474 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
|
475 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO Connected all trees
|
476 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO Connected all trees
|
477 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
478 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
|
479 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO Connected all trees
|
480 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
481 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
|
482 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
|
483 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO 12 coll channels, 0 collnet channels, 0 nvls channels, 16 p2p channels, 16 p2p channels per peer
|
484 |
+
dlchqhe6f3ef1ed0-master-0:31:60 [3] NCCL INFO comm 0x8620bf0 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 40 commId 0x7a7db4e36deb53ea - Init COMPLETE
|
485 |
+
dlchqhe6f3ef1ed0-master-0:29:57 [1] NCCL INFO comm 0x9352850 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 20 commId 0x7a7db4e36deb53ea - Init COMPLETE
|
486 |
+
dlchqhe6f3ef1ed0-master-0:30:59 [2] NCCL INFO comm 0x86c7e10 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 30 commId 0x7a7db4e36deb53ea - Init COMPLETE
|
487 |
+
dlchqhe6f3ef1ed0-master-0:28:58 [0] NCCL INFO comm 0x81df340 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 10 commId 0x7a7db4e36deb53ea - Init COMPLETE
|
488 |
+
[rank2]:[W1223 15:51:28.859981038 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
489 |
+
[rank3]:[W1223 15:51:28.859981116 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
490 |
+
[rank1]:[W1223 15:51:28.860002924 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
491 |
+
[rank0]:[W1223 15:51:28.860009929 Utils.hpp:110] Warning: Environment variable NCCL_BLOCKING_WAIT is deprecated; use TORCH_NCCL_BLOCKING_WAIT instead (function operator())
|
492 |
+
module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
493 |
+
module.sep_network.blocks.0.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
494 |
+
module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
495 |
+
module.sep_network.blocks.0.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
496 |
+
module.sep_network.blocks.0.spk_att.layers.0.linear1.weight not loaded
|
497 |
+
module.sep_network.blocks.0.spk_att.layers.0.linear1.bias not loaded
|
498 |
+
module.sep_network.blocks.0.spk_att.layers.0.linear2.weight not loaded
|
499 |
+
module.sep_network.blocks.0.spk_att.layers.0.linear2.bias not loaded
|
500 |
+
module.sep_network.blocks.0.spk_att.layers.0.norm1.weight not loaded
|
501 |
+
module.sep_network.blocks.0.spk_att.layers.0.norm1.bias not loaded
|
502 |
+
module.sep_network.blocks.0.spk_att.layers.0.norm2.weight not loaded
|
503 |
+
module.sep_network.blocks.0.spk_att.layers.0.norm2.bias not loaded
|
504 |
+
module.sep_network.blocks.0.spk_norm.weight not loaded
|
505 |
+
module.sep_network.blocks.0.spk_norm.bias not loaded
|
506 |
+
module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
507 |
+
module.sep_network.blocks.1.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
508 |
+
module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
509 |
+
module.sep_network.blocks.1.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
510 |
+
module.sep_network.blocks.1.spk_att.layers.0.linear1.weight not loaded
|
511 |
+
module.sep_network.blocks.1.spk_att.layers.0.linear1.bias not loaded
|
512 |
+
module.sep_network.blocks.1.spk_att.layers.0.linear2.weight not loaded
|
513 |
+
module.sep_network.blocks.1.spk_att.layers.0.linear2.bias not loaded
|
514 |
+
module.sep_network.blocks.1.spk_att.layers.0.norm1.weight not loaded
|
515 |
+
module.sep_network.blocks.1.spk_att.layers.0.norm1.bias not loaded
|
516 |
+
module.sep_network.blocks.1.spk_att.layers.0.norm2.weight not loaded
|
517 |
+
module.sep_network.blocks.1.spk_att.layers.0.norm2.bias not loaded
|
518 |
+
module.sep_network.blocks.1.spk_norm.weight not loaded
|
519 |
+
module.sep_network.blocks.1.spk_norm.bias not loaded
|
520 |
+
module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
521 |
+
module.sep_network.blocks.2.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
522 |
+
module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
523 |
+
module.sep_network.blocks.2.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
524 |
+
module.sep_network.blocks.2.spk_att.layers.0.linear1.weight not loaded
|
525 |
+
module.sep_network.blocks.2.spk_att.layers.0.linear1.bias not loaded
|
526 |
+
module.sep_network.blocks.2.spk_att.layers.0.linear2.weight not loaded
|
527 |
+
module.sep_network.blocks.2.spk_att.layers.0.linear2.bias not loaded
|
528 |
+
module.sep_network.blocks.2.spk_att.layers.0.norm1.weight not loaded
|
529 |
+
module.sep_network.blocks.2.spk_att.layers.0.norm1.bias not loaded
|
530 |
+
module.sep_network.blocks.2.spk_att.layers.0.norm2.weight not loaded
|
531 |
+
module.sep_network.blocks.2.spk_att.layers.0.norm2.bias not loaded
|
532 |
+
module.sep_network.blocks.2.spk_norm.weight not loaded
|
533 |
+
module.sep_network.blocks.2.spk_norm.bias not loaded
|
534 |
+
module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
535 |
+
module.sep_network.blocks.3.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
536 |
+
module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
537 |
+
module.sep_network.blocks.3.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
538 |
+
module.sep_network.blocks.3.spk_att.layers.0.linear1.weight not loaded
|
539 |
+
module.sep_network.blocks.3.spk_att.layers.0.linear1.bias not loaded
|
540 |
+
module.sep_network.blocks.3.spk_att.layers.0.linear2.weight not loaded
|
541 |
+
module.sep_network.blocks.3.spk_att.layers.0.linear2.bias not loaded
|
542 |
+
module.sep_network.blocks.3.spk_att.layers.0.norm1.weight not loaded
|
543 |
+
module.sep_network.blocks.3.spk_att.layers.0.norm1.bias not loaded
|
544 |
+
module.sep_network.blocks.3.spk_att.layers.0.norm2.weight not loaded
|
545 |
+
module.sep_network.blocks.3.spk_att.layers.0.norm2.bias not loaded
|
546 |
+
module.sep_network.blocks.3.spk_norm.weight not loaded
|
547 |
+
module.sep_network.blocks.3.spk_norm.bias not loaded
|
548 |
+
module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
549 |
+
module.sep_network.blocks.4.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
550 |
+
module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
551 |
+
module.sep_network.blocks.4.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
552 |
+
module.sep_network.blocks.4.spk_att.layers.0.linear1.weight not loaded
|
553 |
+
module.sep_network.blocks.4.spk_att.layers.0.linear1.bias not loaded
|
554 |
+
module.sep_network.blocks.4.spk_att.layers.0.linear2.weight not loaded
|
555 |
+
module.sep_network.blocks.4.spk_att.layers.0.linear2.bias not loaded
|
556 |
+
module.sep_network.blocks.4.spk_att.layers.0.norm1.weight not loaded
|
557 |
+
module.sep_network.blocks.4.spk_att.layers.0.norm1.bias not loaded
|
558 |
+
module.sep_network.blocks.4.spk_att.layers.0.norm2.weight not loaded
|
559 |
+
module.sep_network.blocks.4.spk_att.layers.0.norm2.bias not loaded
|
560 |
+
module.sep_network.blocks.4.spk_norm.weight not loaded
|
561 |
+
module.sep_network.blocks.4.spk_norm.bias not loaded
|
562 |
+
module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_weight not loaded
|
563 |
+
module.sep_network.blocks.5.spk_att.layers.0.self_attn.in_proj_bias not loaded
|
564 |
+
module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.weight not loaded
|
565 |
+
module.sep_network.blocks.5.spk_att.layers.0.self_attn.out_proj.bias not loaded
|
566 |
+
module.sep_network.blocks.5.spk_att.layers.0.linear1.weight not loaded
|
567 |
+
module.sep_network.blocks.5.spk_att.layers.0.linear1.bias not loaded
|
568 |
+
module.sep_network.blocks.5.spk_att.layers.0.linear2.weight not loaded
|
569 |
+
module.sep_network.blocks.5.spk_att.layers.0.linear2.bias not loaded
|
570 |
+
module.sep_network.blocks.5.spk_att.layers.0.norm1.weight not loaded
|
571 |
+
module.sep_network.blocks.5.spk_att.layers.0.norm1.bias not loaded
|
572 |
+
module.sep_network.blocks.5.spk_att.layers.0.norm2.weight not loaded
|
573 |
+
module.sep_network.blocks.5.spk_att.layers.0.norm2.bias not loaded
|
574 |
+
module.sep_network.blocks.5.spk_norm.weight not loaded
|
575 |
+
module.sep_network.blocks.5.spk_norm.bias not loaded
|
576 |
+
Init model from checkpoints/log_VoxCeleb2_lip_tfgridnet_3spk, and start new training
|
577 |
+
[rank1]:[W1223 15:51:50.762137069 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
578 |
+
[rank0]:[W1223 15:51:50.762144793 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
579 |
+
[rank3]:[W1223 15:51:50.762153452 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
580 |
+
[rank2]:[W1223 15:51:50.762154877 reducer.cpp:1400] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
|
581 |
+
Train Summary | End of Epoch 1 | Time 12225.14s | Train Loss -10.991
|
582 |
+
Valid Summary | End of Epoch 1 | Time 1985.94s | Valid Loss -9.535
|
583 |
+
Test Summary | End of Epoch 1 | Time 1191.17s | Test Loss -9.332
|
584 |
+
Fund new best model, dict saved
|
585 |
+
Train Summary | End of Epoch 2 | Time 12245.66s | Train Loss -11.196
|
586 |
+
Valid Summary | End of Epoch 2 | Time 1984.22s | Valid Loss -9.752
|
587 |
+
Test Summary | End of Epoch 2 | Time 1191.02s | Test Loss -9.545
|
588 |
+
Fund new best model, dict saved
|
589 |
+
Train Summary | End of Epoch 3 | Time 12255.68s | Train Loss -11.276
|
590 |
+
Valid Summary | End of Epoch 3 | Time 1984.68s | Valid Loss -9.986
|
591 |
+
Test Summary | End of Epoch 3 | Time 1190.82s | Test Loss -9.660
|
592 |
+
Fund new best model, dict saved
|
593 |
+
Train Summary | End of Epoch 4 | Time 12232.89s | Train Loss -11.290
|
594 |
+
Valid Summary | End of Epoch 4 | Time 1984.51s | Valid Loss -9.968
|
595 |
+
Test Summary | End of Epoch 4 | Time 1190.85s | Test Loss -9.738
|
596 |
+
Train Summary | End of Epoch 5 | Time 12251.83s | Train Loss -11.241
|
597 |
+
Valid Summary | End of Epoch 5 | Time 1984.37s | Valid Loss -10.024
|
598 |
+
Test Summary | End of Epoch 5 | Time 1190.96s | Test Loss -9.794
|
599 |
+
Fund new best model, dict saved
|
600 |
+
Train Summary | End of Epoch 6 | Time 12265.31s | Train Loss -11.113
|
601 |
+
Valid Summary | End of Epoch 6 | Time 1985.26s | Valid Loss -9.982
|
602 |
+
Test Summary | End of Epoch 6 | Time 1190.49s | Test Loss -9.781
|
603 |
+
Train Summary | End of Epoch 7 | Time 12263.74s | Train Loss -10.994
|
604 |
+
Valid Summary | End of Epoch 7 | Time 1984.31s | Valid Loss -9.952
|
605 |
+
Test Summary | End of Epoch 7 | Time 1190.49s | Test Loss -9.796
|
606 |
+
Train Summary | End of Epoch 8 | Time 12242.87s | Train Loss -10.854
|
607 |
+
Valid Summary | End of Epoch 8 | Time 1984.14s | Valid Loss -10.036
|
608 |
+
Test Summary | End of Epoch 8 | Time 1191.18s | Test Loss -9.791
|
609 |
+
Fund new best model, dict saved
|
610 |
+
Train Summary | End of Epoch 9 | Time 12236.27s | Train Loss -10.703
|
611 |
+
Valid Summary | End of Epoch 9 | Time 1984.02s | Valid Loss -9.937
|
612 |
+
Test Summary | End of Epoch 9 | Time 1190.37s | Test Loss -9.650
|
613 |
+
Train Summary | End of Epoch 10 | Time 12223.76s | Train Loss -10.582
|
614 |
+
Valid Summary | End of Epoch 10 | Time 1983.95s | Valid Loss -9.851
|
615 |
+
Test Summary | End of Epoch 10 | Time 1190.40s | Test Loss -9.663
|
616 |
+
Train Summary | End of Epoch 11 | Time 12222.04s | Train Loss -10.333
|
617 |
+
Valid Summary | End of Epoch 11 | Time 1984.19s | Valid Loss -9.653
|
618 |
+
Test Summary | End of Epoch 11 | Time 1191.05s | Test Loss -9.510
|
619 |
+
Train Summary | End of Epoch 12 | Time 12240.55s | Train Loss -10.279
|
620 |
+
Valid Summary | End of Epoch 12 | Time 1983.65s | Valid Loss -9.600
|
621 |
+
Test Summary | End of Epoch 12 | Time 1190.74s | Test Loss -9.431
|
622 |
+
Train Summary | End of Epoch 13 | Time 12235.76s | Train Loss -10.031
|
623 |
+
Valid Summary | End of Epoch 13 | Time 1983.34s | Valid Loss -9.060
|
624 |
+
Test Summary | End of Epoch 13 | Time 1190.27s | Test Loss -8.872
|
625 |
+
reload weights and optimizer from last best checkpoint
|
626 |
+
Learning rate adjusted to: 0.000255
|
627 |
+
Train Summary | End of Epoch 14 | Time 12226.64s | Train Loss -11.080
|
628 |
+
Valid Summary | End of Epoch 14 | Time 1983.85s | Valid Loss -10.234
|
629 |
+
Test Summary | End of Epoch 14 | Time 1190.61s | Test Loss -9.979
|
630 |
+
Fund new best model, dict saved
|
631 |
+
Train Summary | End of Epoch 15 | Time 12229.66s | Train Loss -11.199
|
632 |
+
Valid Summary | End of Epoch 15 | Time 1984.06s | Valid Loss -10.375
|
633 |
+
Test Summary | End of Epoch 15 | Time 1190.50s | Test Loss -10.135
|
634 |
+
Fund new best model, dict saved
|
635 |
+
Train Summary | End of Epoch 16 | Time 12232.49s | Train Loss -11.242
|
636 |
+
Valid Summary | End of Epoch 16 | Time 1983.78s | Valid Loss -10.281
|
637 |
+
Test Summary | End of Epoch 16 | Time 1190.14s | Test Loss -10.068
|
638 |
+
Train Summary | End of Epoch 17 | Time 12238.05s | Train Loss -11.275
|
639 |
+
Valid Summary | End of Epoch 17 | Time 1983.20s | Valid Loss -10.382
|
640 |
+
Test Summary | End of Epoch 17 | Time 1190.36s | Test Loss -10.152
|
641 |
+
Fund new best model, dict saved
|
642 |
+
Train Summary | End of Epoch 18 | Time 12245.93s | Train Loss -11.297
|
643 |
+
Valid Summary | End of Epoch 18 | Time 1983.33s | Valid Loss -10.459
|
644 |
+
Test Summary | End of Epoch 18 | Time 1190.37s | Test Loss -10.193
|
645 |
+
Fund new best model, dict saved
|
646 |
+
Train Summary | End of Epoch 19 | Time 12239.15s | Train Loss -11.333
|
647 |
+
Valid Summary | End of Epoch 19 | Time 1983.50s | Valid Loss -10.302
|
648 |
+
Test Summary | End of Epoch 19 | Time 1190.54s | Test Loss -10.117
|
649 |
+
Train Summary | End of Epoch 20 | Time 12237.59s | Train Loss -11.366
|
650 |
+
Valid Summary | End of Epoch 20 | Time 1983.43s | Valid Loss -10.481
|
651 |
+
Test Summary | End of Epoch 20 | Time 1190.16s | Test Loss -10.247
|
652 |
+
Fund new best model, dict saved
|
653 |
+
Train Summary | End of Epoch 21 | Time 12237.17s | Train Loss -11.364
|
654 |
+
Valid Summary | End of Epoch 21 | Time 1983.54s | Valid Loss -10.396
|
655 |
+
Test Summary | End of Epoch 21 | Time 1189.99s | Test Loss -10.263
|
656 |
+
Train Summary | End of Epoch 22 | Time 12247.11s | Train Loss -11.387
|
657 |
+
Valid Summary | End of Epoch 22 | Time 1983.62s | Valid Loss -10.483
|
658 |
+
Test Summary | End of Epoch 22 | Time 1190.28s | Test Loss -10.264
|
659 |
+
Fund new best model, dict saved
|
660 |
+
Train Summary | End of Epoch 23 | Time 12239.50s | Train Loss -11.386
|
661 |
+
Valid Summary | End of Epoch 23 | Time 1984.03s | Valid Loss -10.445
|
662 |
+
Test Summary | End of Epoch 23 | Time 1190.35s | Test Loss -10.239
|
663 |
+
Train Summary | End of Epoch 24 | Time 12244.29s | Train Loss -11.395
|
664 |
+
Valid Summary | End of Epoch 24 | Time 1983.04s | Valid Loss -10.464
|
665 |
+
Test Summary | End of Epoch 24 | Time 1190.10s | Test Loss -10.245
|
666 |
+
Train Summary | End of Epoch 25 | Time 12242.35s | Train Loss -11.402
|
667 |
+
Valid Summary | End of Epoch 25 | Time 1983.03s | Valid Loss -10.567
|
668 |
+
Test Summary | End of Epoch 25 | Time 1189.98s | Test Loss -10.172
|
669 |
+
Fund new best model, dict saved
|
670 |
+
Train Summary | End of Epoch 26 | Time 12242.38s | Train Loss -11.430
|
671 |
+
Valid Summary | End of Epoch 26 | Time 1983.15s | Valid Loss -10.517
|
672 |
+
Test Summary | End of Epoch 26 | Time 1190.06s | Test Loss -10.234
|
673 |
+
Train Summary | End of Epoch 27 | Time 12241.62s | Train Loss -11.407
|
674 |
+
Valid Summary | End of Epoch 27 | Time 1982.98s | Valid Loss -10.536
|
675 |
+
Test Summary | End of Epoch 27 | Time 1190.38s | Test Loss -10.245
|
676 |
+
Train Summary | End of Epoch 28 | Time 12243.78s | Train Loss -11.443
|
677 |
+
Valid Summary | End of Epoch 28 | Time 1982.87s | Valid Loss -10.544
|
678 |
+
Test Summary | End of Epoch 28 | Time 1189.98s | Test Loss -10.239
|
679 |
+
Train Summary | End of Epoch 29 | Time 12245.88s | Train Loss -11.440
|
680 |
+
Valid Summary | End of Epoch 29 | Time 1982.87s | Valid Loss -10.550
|
681 |
+
Test Summary | End of Epoch 29 | Time 1190.15s | Test Loss -10.319
|
682 |
+
Train Summary | End of Epoch 30 | Time 12239.32s | Train Loss -11.457
|
683 |
+
Valid Summary | End of Epoch 30 | Time 1982.92s | Valid Loss -10.515
|
684 |
+
Test Summary | End of Epoch 30 | Time 1189.95s | Test Loss -10.311
|
685 |
+
reload weights and optimizer from last best checkpoint
|
686 |
+
Learning rate adjusted to: 0.000128
|
687 |
+
Train Summary | End of Epoch 31 | Time 12240.18s | Train Loss -11.540
|
688 |
+
Valid Summary | End of Epoch 31 | Time 1983.06s | Valid Loss -10.636
|
689 |
+
Test Summary | End of Epoch 31 | Time 1189.96s | Test Loss -10.376
|
690 |
+
Fund new best model, dict saved
|
691 |
+
Train Summary | End of Epoch 32 | Time 12236.81s | Train Loss -11.609
|
692 |
+
Valid Summary | End of Epoch 32 | Time 1983.10s | Valid Loss -10.657
|
693 |
+
Test Summary | End of Epoch 32 | Time 1190.04s | Test Loss -10.379
|
694 |
+
Fund new best model, dict saved
|
695 |
+
Train Summary | End of Epoch 33 | Time 12239.32s | Train Loss -11.645
|
696 |
+
Valid Summary | End of Epoch 33 | Time 1983.07s | Valid Loss -10.643
|
697 |
+
Test Summary | End of Epoch 33 | Time 1190.25s | Test Loss -10.411
|
698 |
+
Train Summary | End of Epoch 34 | Time 12236.30s | Train Loss -11.641
|
699 |
+
Valid Summary | End of Epoch 34 | Time 1983.17s | Valid Loss -10.679
|
700 |
+
Test Summary | End of Epoch 34 | Time 1190.24s | Test Loss -10.401
|
701 |
+
Fund new best model, dict saved
|
702 |
+
Train Summary | End of Epoch 35 | Time 12245.94s | Train Loss -11.678
|
703 |
+
Valid Summary | End of Epoch 35 | Time 1983.96s | Valid Loss -10.720
|
704 |
+
Test Summary | End of Epoch 35 | Time 1190.20s | Test Loss -10.445
|
705 |
+
Fund new best model, dict saved
|
706 |
+
Train Summary | End of Epoch 36 | Time 12254.78s | Train Loss -11.681
|
707 |
+
Valid Summary | End of Epoch 36 | Time 1983.80s | Valid Loss -10.655
|
708 |
+
Test Summary | End of Epoch 36 | Time 1190.39s | Test Loss -10.401
|
709 |
+
Train Summary | End of Epoch 37 | Time 12244.39s | Train Loss -11.702
|
710 |
+
Valid Summary | End of Epoch 37 | Time 1982.89s | Valid Loss -10.654
|
711 |
+
Test Summary | End of Epoch 37 | Time 1189.95s | Test Loss -10.361
|
712 |
+
Train Summary | End of Epoch 38 | Time 12244.42s | Train Loss -11.710
|
713 |
+
Valid Summary | End of Epoch 38 | Time 1983.02s | Valid Loss -10.683
|
714 |
+
Test Summary | End of Epoch 38 | Time 1190.16s | Test Loss -10.420
|
715 |
+
Train Summary | End of Epoch 39 | Time 12245.65s | Train Loss -11.723
|
716 |
+
Valid Summary | End of Epoch 39 | Time 1982.96s | Valid Loss -10.716
|
717 |
+
Test Summary | End of Epoch 39 | Time 1190.09s | Test Loss -10.468
|
718 |
+
# 1spk
|
719 |
+
Avg SISNR:i tensor([14.2587], device='cuda:0')
|
720 |
+
Avg SNRi: 14.708940179199223
|
721 |
+
Avg PESQi: 0.9282668574651083
|
722 |
+
Avg STOIi: 0.3682121250644893
|
723 |
+
# 2spk
|
724 |
+
Avg SISNR:i tensor([14.7764], device='cuda:0')
|
725 |
+
Avg SNRi: 15.152641806291069
|
726 |
+
Avg PESQi: 0.9619782056808471
|
727 |
+
Avg STOIi: 0.3763378789653155
|
728 |
+
# 3spk
|
729 |
+
Avg SISNR:i tensor([15.5649], device='cuda:0')
|
730 |
+
Avg SNRi: 15.93704020578726
|
731 |
+
Avg PESQi: 1.0807508533398311
|
732 |
+
Avg STOIi: 0.3897622864471111
|
733 |
+
|
734 |
+
|
735 |
+
# on LRS2
|
736 |
+
# 1spk
|
737 |
+
Avg SISNR:i tensor([13.8892], device='cuda:0')
|
738 |
+
Avg SNRi: 14.385947986215816
|
739 |
+
Avg PESQi: 0.8105871801376343
|
740 |
+
Avg STOIi: 0.35947340789408305
|
741 |
+
# 2spk
|
742 |
+
Avg SISNR:i tensor([14.4489], device='cuda:0')
|
743 |
+
Avg SNRi: 14.8332929594773
|
744 |
+
Avg PESQi: 0.8476730751593907
|
745 |
+
Avg STOIi: 0.36752675748255176
|
746 |
+
|
747 |
+
# 3spk
|
748 |
+
Avg SISNR:i tensor([15.7151], device='cuda:0')
|
749 |
+
Avg SNRi: 16.10634403873235
|
750 |
+
Avg PESQi: 1.0045358400742213
|
751 |
+
Avg STOIi: 0.3877202131964405
|
752 |
+
|
753 |
+
|
754 |
+
# on LRS3
|
755 |
+
# 1spk
|
756 |
+
Avg SISNR:i tensor([15.1143], device='cuda:0')
|
757 |
+
Avg SNRi: 15.476579012213097
|
758 |
+
Avg PESQi: 1.0222730774879456
|
759 |
+
Avg STOIi: 0.3547696576788025
|
760 |
+
# 2spk
|
761 |
+
Avg SISNR:i tensor([15.8752], device='cuda:0')
|
762 |
+
Avg SNRi: 16.128778737789872
|
763 |
+
Avg PESQi: 1.0782776288191478
|
764 |
+
Avg STOIi: 0.36668242221201147
|
765 |
+
# 3spk
|
766 |
+
Total 3000 samples evaluated.
|
767 |
+
Total 1 samples not evaluated for pesq.
|
768 |
+
Avg SISNR:i tensor([17.2220], device='cuda:0')
|
769 |
+
Avg SNRi: 17.448678353901542
|
770 |
+
Avg PESQi: 1.2627000254789988
|
771 |
+
Avg STOIi: 0.38352159958506526
|
checkpoints/log_VoxCeleb2_lip_tfgridnet-isam_3spk /tensorboard/events.out.tfevents.1734940287.dlchqhe6f3ef1ed0-master-0.28.0
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3376efcc409b958e793ea56a2df50a80676d187d8a1149eacd6156546581fce8
|
3 |
+
size 5860
|