harryjulian committed
Commit: 2dfcd66
Parent(s): 6a9bb58

removed code
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,189 +0,0 @@
- # Emacs
- *~
-
- # Byte-compiled / optimized / DLL files
- __pycache__/
- *.py[cod]
- *$py.class
-
- # C extensions
- *.so
-
- # Distribution / packaging
- .Python
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- share/python-wheels/
- *.egg-info/
- .installed.cfg
- *.egg
- MANIFEST
- /runs
- /checkpoints
- /base
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .nox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- *.py,cover
- .hypothesis/
- .pytest_cache/
- cover/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
- db.sqlite3
- db.sqlite3-journal
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- .pybuilder/
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # IPython
- profile_default/
- ipython_config.py
-
- # pyenv
- # For a library or package, you might want to ignore these files since the code is
- # intended to run in multiple environments; otherwise, check them in:
- # .python-version
-
- # pipenv
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
- # install all needed dependencies.
- #Pipfile.lock
-
- # poetry
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
- # This is especially recommended for binary packages to ensure reproducibility, and is more
- # commonly ignored for libraries.
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
- #poetry.lock
-
- # pdm
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
- #pdm.lock
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
- # in version control.
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
- .pdm.toml
- .pdm-python
- .pdm-build/
-
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
- __pypackages__/
-
- # Celery stuff
- celerybeat-schedule
- celerybeat.pid
-
- # SageMath parsed files
- *.sage.py
-
- # Environments
- .env
- .venv
- env/
- venv/
- ENV/
- env.bak/
- venv.bak/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
- .dmypy.json
- dmypy.json
-
- # Pyre type checker
- .pyre/
-
- # pytype static type analyzer
- .pytype/
-
- # Cython debug symbols
- cython_debug/
-
- # PyCharm
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
- # and can be added to the global gitignore or merged into this file. For a more nuclear
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
- #.idea/
-
- /runs
- /.cache
- /__pycache__
-
- *.wav
- *.pth
- *.pt
- *.pt.gz
- wandb/
- sven_latest_checkpoint/
- sven_qwen/
- pretrained_models/
- xcodec/
- small_speaker_shards_all/
- sven_all_shards/
- qwen_380k/
- evals/
- *.safetensors
- *.pt
- .ruff_cache
neucodec/__init__.py DELETED
@@ -1,3 +0,0 @@
- from .codec_encoder import CodecEncoder
- from .codec_decoder_vocos import CodecDecoderVocos
- from .model import NeuCodec
neucodec/activations.py DELETED
@@ -1,120 +0,0 @@
- # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
- # LICENSE is in incl_licenses directory.
-
- import torch
- from torch import nn, sin, pow
- from torch.nn import Parameter
-
-
- class Snake(nn.Module):
-     '''
-     Implementation of a sine-based periodic activation function
-     Shape:
-         - Input: (B, C, T)
-         - Output: (B, C, T), same shape as the input
-     Parameters:
-         - alpha - trainable parameter
-     References:
-         - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-           https://arxiv.org/abs/2006.08195
-     Examples:
-         >>> a1 = snake(256)
-         >>> x = torch.randn(256)
-         >>> x = a1(x)
-     '''
-     def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-         '''
-         Initialization.
-         INPUT:
-             - in_features: shape of the input
-             - alpha: trainable parameter
-             alpha is initialized to 1 by default, higher values = higher-frequency.
-             alpha will be trained along with the rest of your model.
-         '''
-         super(Snake, self).__init__()
-         self.in_features = in_features
-
-         # initialize alpha
-         self.alpha_logscale = alpha_logscale
-         if self.alpha_logscale:  # log scale alphas initialized to zeros
-             self.alpha = Parameter(torch.zeros(in_features) * alpha)
-         else:  # linear scale alphas initialized to ones
-             self.alpha = Parameter(torch.ones(in_features) * alpha)
-
-         self.alpha.requires_grad = alpha_trainable
-
-         self.no_div_by_zero = 0.000000001
-
-     def forward(self, x):
-         '''
-         Forward pass of the function.
-         Applies the function to the input elementwise.
-         Snake ∶= x + 1/a * sin^2 (xa)
-         '''
-         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
-         if self.alpha_logscale:
-             alpha = torch.exp(alpha)
-         x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-
-         return x
-
-
- class SnakeBeta(nn.Module):
-     '''
-     A modified Snake function which uses separate parameters for the magnitude of the periodic components
-     Shape:
-         - Input: (B, C, T)
-         - Output: (B, C, T), same shape as the input
-     Parameters:
-         - alpha - trainable parameter that controls frequency
-         - beta - trainable parameter that controls magnitude
-     References:
-         - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
-           https://arxiv.org/abs/2006.08195
-     Examples:
-         >>> a1 = snakebeta(256)
-         >>> x = torch.randn(256)
-         >>> x = a1(x)
-     '''
-     def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
-         '''
-         Initialization.
-         INPUT:
-             - in_features: shape of the input
-             - alpha - trainable parameter that controls frequency
-             - beta - trainable parameter that controls magnitude
-             alpha is initialized to 1 by default, higher values = higher-frequency.
-             beta is initialized to 1 by default, higher values = higher-magnitude.
-             alpha will be trained along with the rest of your model.
-         '''
-         super(SnakeBeta, self).__init__()
-         self.in_features = in_features
-
-         # initialize alpha
-         self.alpha_logscale = alpha_logscale
-         if self.alpha_logscale:  # log scale alphas initialized to zeros
-             self.alpha = Parameter(torch.zeros(in_features) * alpha)
-             self.beta = Parameter(torch.zeros(in_features) * alpha)
-         else:  # linear scale alphas initialized to ones
-             self.alpha = Parameter(torch.ones(in_features) * alpha)
-             self.beta = Parameter(torch.ones(in_features) * alpha)
-
-         self.alpha.requires_grad = alpha_trainable
-         self.beta.requires_grad = alpha_trainable
-
-         self.no_div_by_zero = 0.000000001
-
-     def forward(self, x):
-         '''
-         Forward pass of the function.
-         Applies the function to the input elementwise.
-         SnakeBeta ∶= x + 1/b * sin^2 (xa)
-         '''
-         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # line up with x to [B, C, T]
-         beta = self.beta.unsqueeze(0).unsqueeze(-1)
-         if self.alpha_logscale:
-             alpha = torch.exp(alpha)
-             beta = torch.exp(beta)
-         x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
-
-         return x
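
The removed Snake activation computes x + (1/alpha) * sin^2(alpha * x) per channel. A minimal usage sketch on a (B, C, T) tensor, assuming the pre-removal `neucodec.activations` module is importable:

    import torch
    from neucodec.activations import Snake, SnakeBeta  # import path before this commit

    x = torch.randn(2, 256, 100)   # (batch, channels, time)
    act = Snake(in_features=256)   # one learnable alpha per channel
    y = act(x)                     # y = x + (1/alpha) * sin(alpha * x) ** 2
    assert y.shape == x.shape      # elementwise, shape-preserving

    # SnakeBeta decouples frequency (alpha) from magnitude (1/beta)
    act_b = SnakeBeta(256, alpha_logscale=True)
    assert act_b(x).shape == x.shape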
neucodec/alias_free_torch/__init__.py DELETED
@@ -1,6 +0,0 @@
- # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
- # LICENSE is in incl_licenses directory.
-
- from .filter import *
- from .resample import *
- from .act import *
neucodec/alias_free_torch/act.py DELETED
@@ -1,28 +0,0 @@
- # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
- # LICENSE is in incl_licenses directory.
-
- import torch.nn as nn
- from .resample import UpSample1d, DownSample1d
-
-
- class Activation1d(nn.Module):
-     def __init__(self,
-                  activation,
-                  up_ratio: int = 2,
-                  down_ratio: int = 2,
-                  up_kernel_size: int = 12,
-                  down_kernel_size: int = 12):
-         super().__init__()
-         self.up_ratio = up_ratio
-         self.down_ratio = down_ratio
-         self.act = activation
-         self.upsample = UpSample1d(up_ratio, up_kernel_size)
-         self.downsample = DownSample1d(down_ratio, down_kernel_size)
-
-     # x: [B, C, T]
-     def forward(self, x):
-         x = self.upsample(x)
-         x = self.act(x)
-         x = self.downsample(x)
-
-         return x
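
Activation1d wraps a pointwise nonlinearity between a 2x upsample and a 2x downsample, so the nonlinearity is applied at a higher sample rate and the aliasing it would otherwise introduce is suppressed. A short sketch, assuming the pre-removal modules:

    import torch
    from neucodec.alias_free_torch import Activation1d  # import paths before this commit
    from neucodec.activations import SnakeBeta

    x = torch.randn(1, 48, 320)  # [B, C, T]
    act = Activation1d(activation=SnakeBeta(48, alpha_logscale=True))
    y = act(x)                   # upsample 2x -> SnakeBeta -> downsample 2x
    assert y.shape == x.shape    # temporal length is preserved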
neucodec/alias_free_torch/filter.py DELETED
@@ -1,95 +0,0 @@
- # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
- # LICENSE is in incl_licenses directory.
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import math
-
- if 'sinc' in dir(torch):
-     sinc = torch.sinc
- else:
-     # This code is adopted from adefossez's julius.core.sinc under the MIT License
-     # https://adefossez.github.io/julius/julius/core.html
-     # LICENSE is in incl_licenses directory.
-     def sinc(x: torch.Tensor):
-         """
-         Implementation of sinc, i.e. sin(pi * x) / (pi * x)
-         __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
-         """
-         return torch.where(x == 0,
-                            torch.tensor(1., device=x.device, dtype=x.dtype),
-                            torch.sin(math.pi * x) / math.pi / x)
-
-
- # This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
- # https://adefossez.github.io/julius/julius/lowpass.html
- # LICENSE is in incl_licenses directory.
- def kaiser_sinc_filter1d(cutoff, half_width, kernel_size):  # return filter [1,1,kernel_size]
-     even = (kernel_size % 2 == 0)
-     half_size = kernel_size // 2
-
-     # For kaiser window
-     delta_f = 4 * half_width
-     A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
-     if A > 50.:
-         beta = 0.1102 * (A - 8.7)
-     elif A >= 21.:
-         beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
-     else:
-         beta = 0.
-     window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
-
-     # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
-     if even:
-         time = (torch.arange(-half_size, half_size) + 0.5)
-     else:
-         time = torch.arange(kernel_size) - half_size
-     if cutoff == 0:
-         filter_ = torch.zeros_like(time)
-     else:
-         filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
-         # Normalize filter to have sum = 1, otherwise we will have a small leakage
-         # of the constant component in the input signal.
-         filter_ /= filter_.sum()
-     filter = filter_.view(1, 1, kernel_size)
-
-     return filter
-
-
- class LowPassFilter1d(nn.Module):
-     def __init__(self,
-                  cutoff=0.5,
-                  half_width=0.6,
-                  stride: int = 1,
-                  padding: bool = True,
-                  padding_mode: str = 'replicate',
-                  kernel_size: int = 12):
-         # kernel_size should be even number for stylegan3 setup,
-         # in this implementation, odd number is also possible.
-         super().__init__()
-         if cutoff < -0.:
-             raise ValueError("Minimum cutoff must be larger than zero.")
-         if cutoff > 0.5:
-             raise ValueError("A cutoff above 0.5 does not make sense.")
-         self.kernel_size = kernel_size
-         self.even = (kernel_size % 2 == 0)
-         self.pad_left = kernel_size // 2 - int(self.even)
-         self.pad_right = kernel_size // 2
-         self.stride = stride
-         self.padding = padding
-         self.padding_mode = padding_mode
-         filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
-         self.register_buffer("filter", filter)
-
-     # input [B, C, T]
-     def forward(self, x):
-         _, C, _ = x.shape
-
-         if self.padding:
-             x = F.pad(x, (self.pad_left, self.pad_right),
-                       mode=self.padding_mode)
-         out = F.conv1d(x, self.filter.expand(C, -1, -1),
-                        stride=self.stride, groups=C)
-
-         return out
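
kaiser_sinc_filter1d is the standard windowed-ideal-lowpass design: a sinc truncated by a Kaiser window whose beta is chosen from the target attenuation A. A quick sketch of building and applying the filter, assuming the pre-removal module:

    import torch
    from neucodec.alias_free_torch.filter import LowPassFilter1d, kaiser_sinc_filter1d

    kernel = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
    print(kernel.shape)         # torch.Size([1, 1, 12]); taps sum to ~1.0

    lpf = LowPassFilter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
    x = torch.randn(1, 4, 160)  # [B, C, T]
    y = lpf(x)                  # depthwise conv, one shared kernel per channel
    assert y.shape == x.shape   # "same" padding at stride 1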
neucodec/alias_free_torch/resample.py DELETED
@@ -1,49 +0,0 @@
- # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
- # LICENSE is in incl_licenses directory.
-
- import torch.nn as nn
- from torch.nn import functional as F
- from .filter import LowPassFilter1d
- from .filter import kaiser_sinc_filter1d
-
-
- class UpSample1d(nn.Module):
-     def __init__(self, ratio=2, kernel_size=None):
-         super().__init__()
-         self.ratio = ratio
-         self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-         self.stride = ratio
-         self.pad = self.kernel_size // ratio - 1
-         self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
-         self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
-         filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
-                                       half_width=0.6 / ratio,
-                                       kernel_size=self.kernel_size)
-         self.register_buffer("filter", filter)
-
-     # x: [B, C, T]
-     def forward(self, x):
-         _, C, _ = x.shape
-
-         x = F.pad(x, (self.pad, self.pad), mode='replicate')
-         x = self.ratio * F.conv_transpose1d(
-             x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
-         x = x[..., self.pad_left:-self.pad_right]
-
-         return x
-
-
- class DownSample1d(nn.Module):
-     def __init__(self, ratio=2, kernel_size=None):
-         super().__init__()
-         self.ratio = ratio
-         self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
-         self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
-                                        half_width=0.6 / ratio,
-                                        stride=ratio,
-                                        kernel_size=self.kernel_size)
-
-     def forward(self, x):
-         xx = self.lowpass(x)
-
-         return xx
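
UpSample1d inserts samples with a transposed depthwise convolution against the sinc kernel; DownSample1d lowpasses and then decimates by the same ratio. A round-trip shape sketch, assuming the pre-removal module:

    import torch
    from neucodec.alias_free_torch import UpSample1d, DownSample1d

    x = torch.randn(1, 8, 100)  # [B, C, T]
    up = UpSample1d(ratio=2)
    dn = DownSample1d(ratio=2)

    y = up(x)
    assert y.shape[-1] == 2 * x.shape[-1]  # twice the samples, same band content
    z = dn(y)
    assert z.shape[-1] == x.shape[-1]      # back to the original length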
neucodec/bs_roformer5.py DELETED
@@ -1,120 +0,0 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torchaudio
- import numpy as np
-
- from torch.nn import Module, ModuleList
- from einops import rearrange
- from torchtune.modules import RotaryPositionalEmbeddings
-
-
- class RMSNorm(torch.nn.Module):
-     def __init__(self, dim: int, eps: float = 1e-6):
-         r"""https://github.com/meta-llama/llama/blob/main/llama/model.py"""
-         super().__init__()
-         self.eps = eps
-         self.weight = nn.Parameter(torch.ones(dim))
-
-     def forward(self, x):
-         norm_x = torch.mean(x ** 2, dim=-1, keepdim=True)
-         output = x * torch.rsqrt(norm_x + self.eps) * self.weight
-         return output
-
-
- class MLP(nn.Module):
-     def __init__(self, dim: int) -> None:
-         super().__init__()
-
-         self.fc1 = nn.Linear(dim, 4 * dim, bias=False)
-         self.silu = nn.SiLU()
-         self.fc2 = nn.Linear(4 * dim, dim, bias=False)
-
-     def forward(self, x):
-         x = self.fc1(x)
-         x = self.silu(x)
-         x = self.fc2(x)
-         return x
-
-
- class Attention(nn.Module):
-
-     def __init__(self, dim: int, n_heads: int, rotary_embed: RotaryPositionalEmbeddings):
-         super().__init__()
-
-         assert dim % n_heads == 0
-
-         self.n_heads = n_heads
-         self.dim = dim
-         self.rotary_embed = rotary_embed
-
-         self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
-         assert self.flash, "Must have flash attention."
-
-         self.c_attn = nn.Linear(dim, 3 * dim, bias=False)
-         self.c_proj = nn.Linear(dim, dim, bias=False)
-
-     def forward(self, x):
-         r"""
-         Args:
-             x: (b, t, h*d)
-
-         Constants:
-             b: batch_size
-             t: time steps
-             r: 3
-             h: heads_num
-             d: heads_dim
-         """
-         B, T, C = x.size()
-
-         q, k, v = rearrange(self.c_attn(x), 'b t (r h d) -> r b h t d', r=3, h=self.n_heads)
-         # q, k, v: (b, h, t, d)
-
-         q = self.rotary_embed(q)
-         k = self.rotary_embed(k)
-
-         if self.flash:
-             y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0, is_causal=False)
-
-         y = rearrange(y, 'b h t d -> b t (h d)')
-
-         y = self.c_proj(y)
-         # shape: (b, t, h*d)
-
-         return y
-
-
- class TransformerBlock(nn.Module):
-     def __init__(self, dim: int, n_heads: int, rotary_embed: RotaryPositionalEmbeddings):
-         super().__init__()
-         self.dim = dim
-         self.n_heads = n_heads
-
-         self.att_norm = RMSNorm(dim)
-         self.ffn_norm = RMSNorm(dim)
-         self.att = Attention(dim=dim, n_heads=n_heads, rotary_embed=rotary_embed)
-         self.mlp = MLP(dim=dim)
-
-     def forward(
-         self,
-         x: torch.Tensor,
-     ):
-         x = x + self.att(self.att_norm(x))
-         x = x + self.mlp(self.ffn_norm(x))
-         return x
-
-
- if __name__ == '__main__':
-     rotary_embed_128 = RotaryPositionalEmbeddings(dim=128)
-     transformer_block = TransformerBlock(
-         dim=1024,
-         n_heads=8,
-         rotary_embed=rotary_embed_128
-     )
-     x = torch.randn(2, 128, 1024)
-     y = transformer_block(x)
-     print(y.shape)
-     c = 1
neucodec/codec_decoder_vocos.py DELETED
@@ -1,431 +0,0 @@
- import torch
- import torch.nn as nn
-
- from typing import List
- from torchtune.modules import RotaryPositionalEmbeddings
- from vector_quantize_pytorch import ResidualFSQ
-
- from .bs_roformer5 import TransformerBlock
-
-
- class ISTFT(nn.Module):
-     """
-     Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
-     windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
-     See issue: https://github.com/pytorch/pytorch/issues/62323
-     Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
-     The NOLA constraint is met as we trim padded samples anyway.
-
-     Args:
-         n_fft (int): Size of Fourier transform.
-         hop_length (int): The distance between neighboring sliding window frames.
-         win_length (int): The size of window frame and STFT filter.
-         padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
-     """
-
-     def __init__(
-         self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
-     ):
-         super().__init__()
-         if padding not in ["center", "same"]:
-             raise ValueError("Padding must be 'center' or 'same'.")
-         self.padding = padding
-         self.n_fft = n_fft
-         self.hop_length = hop_length
-         self.win_length = win_length
-         window = torch.hann_window(win_length)
-         self.register_buffer("window", window)
-
-     def forward(self, spec: torch.Tensor) -> torch.Tensor:
-         """
-         Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
-
-         Args:
-             spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
-                 N is the number of frequency bins, and T is the number of time frames.
-
-         Returns:
-             Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
-         """
-         if self.padding == "center":
-             # Fallback to pytorch native implementation
-             return torch.istft(
-                 spec,
-                 self.n_fft,
-                 self.hop_length,
-                 self.win_length,
-                 self.window,
-                 center=True,
-             )
-         elif self.padding == "same":
-             pad = (self.win_length - self.hop_length) // 2
-         else:
-             raise ValueError("Padding must be 'center' or 'same'.")
-
-         assert spec.dim() == 3, "Expected a 3D tensor as input"
-         B, N, T = spec.shape
-
-         # Inverse FFT
-         ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
-         ifft = ifft * self.window[None, :, None]
-
-         # Overlap and Add
-         output_size = (T - 1) * self.hop_length + self.win_length
-         y = torch.nn.functional.fold(
-             ifft,
-             output_size=(1, output_size),
-             kernel_size=(1, self.win_length),
-             stride=(1, self.hop_length),
-         )[:, 0, 0, pad:-pad]
-
-         # Window envelope
-         window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
-         window_envelope = torch.nn.functional.fold(
-             window_sq,
-             output_size=(1, output_size),
-             kernel_size=(1, self.win_length),
-             stride=(1, self.hop_length),
-         ).squeeze()[pad:-pad]
-
-         # Normalize
-         assert (window_envelope > 1e-11).all()
-         y = y / window_envelope
-
-         return y
-
-
- class FourierHead(nn.Module):
-     """Base class for inverse fourier modules."""
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         Args:
-             x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
-                 L is the sequence length, and H denotes the model dimension.
-
-         Returns:
-             Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
-         """
-         raise NotImplementedError("Subclasses must implement the forward method.")
-
-
- class ISTFTHead(FourierHead):
-     """
-     ISTFT Head module for predicting STFT complex coefficients.
-
-     Args:
-         dim (int): Hidden dimension of the model.
-         n_fft (int): Size of Fourier transform.
-         hop_length (int): The distance between neighboring sliding window frames, which should align with
-             the resolution of the input features.
-         padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
-     """
-
-     def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
-         super().__init__()
-         out_dim = n_fft + 2
-         self.out = torch.nn.Linear(dim, out_dim)
-         self.istft = ISTFT(
-             n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
-         )
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         """
-         Forward pass of the ISTFTHead module.
-
-         Args:
-             x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
-                 L is the sequence length, and H denotes the model dimension.
-
-         Returns:
-             Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
-         """
-         x_pred = self.out(x)
-         # x_pred = x
-         x_pred = x_pred.transpose(1, 2)
-         mag, p = x_pred.chunk(2, dim=1)
-         mag = torch.exp(mag)
-         mag = torch.clip(
-             mag, max=1e2
-         )  # safeguard to prevent excessively large magnitudes
-         # wrapping happens here. These two lines produce real and imaginary value
-         x = torch.cos(p)
-         y = torch.sin(p)
-         # recalculating phase here does not produce anything new
-         # only costs time
-         # phase = torch.atan2(y, x)
-         # S = mag * torch.exp(phase * 1j)
-         # better directly produce the complex value
-         S = mag * (x + 1j * y)
-         audio = self.istft(S)
-         return audio.unsqueeze(1), x_pred
-
-
- def nonlinearity(x):
-     # swish
-     return x * torch.sigmoid(x)
-
-
- def Normalize(in_channels, num_groups=32):
-     return torch.nn.GroupNorm(
-         num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
-     )
-
-
- class ResnetBlock(nn.Module):
-     def __init__(
-         self,
-         *,
-         in_channels,
-         out_channels=None,
-         conv_shortcut=False,
-         dropout,
-         temb_channels=512,
-     ):
-         super().__init__()
-         self.in_channels = in_channels
-         out_channels = in_channels if out_channels is None else out_channels
-         self.out_channels = out_channels
-         self.use_conv_shortcut = conv_shortcut
-
-         self.norm1 = Normalize(in_channels)
-         self.conv1 = torch.nn.Conv1d(
-             in_channels, out_channels, kernel_size=3, stride=1, padding=1
-         )
-         if temb_channels > 0:
-             self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
-         self.norm2 = Normalize(out_channels)
-         self.dropout = torch.nn.Dropout(dropout)
-         self.conv2 = torch.nn.Conv1d(
-             out_channels, out_channels, kernel_size=3, stride=1, padding=1
-         )
-         if self.in_channels != self.out_channels:
-             if self.use_conv_shortcut:
-                 self.conv_shortcut = torch.nn.Conv1d(
-                     in_channels, out_channels, kernel_size=3, stride=1, padding=1
-                 )
-             else:
-                 self.nin_shortcut = torch.nn.Conv1d(
-                     in_channels, out_channels, kernel_size=1, stride=1, padding=0
-                 )
-
-     def forward(self, x, temb=None):
-         h = x
-         h = self.norm1(h)
-         h = nonlinearity(h)
-         h = self.conv1(h)
-
-         if temb is not None:
-             h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
-
-         h = self.norm2(h)
-         h = nonlinearity(h)
-         h = self.dropout(h)
-         h = self.conv2(h)
-
-         if self.in_channels != self.out_channels:
-             if self.use_conv_shortcut:
-                 x = self.conv_shortcut(x)
-             else:
-                 x = self.nin_shortcut(x)
-
-         return x + h
-
-
- class Backbone(nn.Module):
-     """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
-
-     def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
-         """
-         Args:
-             x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
-                 C denotes output features, and L is the sequence length.
-
-         Returns:
-             Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
-                 and H denotes the model dimension.
-         """
-         raise NotImplementedError("Subclasses must implement the forward method.")
-
-
- class VocosBackbone(Backbone):
-     """
-     Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
-
-     Args:
-         input_channels (int): Number of input features channels.
-         dim (int): Hidden dimension of the model.
-         intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
-         num_layers (int): Number of ConvNeXtBlock layers.
-         layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
-         adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
-             None means non-conditional model. Defaults to None.
-     """
-
-     def __init__(self, hidden_dim=1024, depth=12, heads=16, pos_meb_dim=64):
-         super().__init__()
-
-         self.embed = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=7, padding=3)
-
-         self.temb_ch = 0
-         block_in = hidden_dim
-         dropout = 0.1
-
-         prior_net: List[nn.Module] = [
-             ResnetBlock(
-                 in_channels=block_in,
-                 out_channels=block_in,
-                 temb_channels=self.temb_ch,
-                 dropout=dropout,
-             ),
-             ResnetBlock(
-                 in_channels=block_in,
-                 out_channels=block_in,
-                 temb_channels=self.temb_ch,
-                 dropout=dropout,
-             ),
-         ]
-         self.prior_net = nn.Sequential(*prior_net)
-
-         depth = depth
-         time_rotary_embed = RotaryPositionalEmbeddings(dim=pos_meb_dim)
-
-         transformer_blocks = [
-             TransformerBlock(
-                 dim=hidden_dim, n_heads=heads, rotary_embed=time_rotary_embed
-             )
-             for _ in range(depth)
-         ]
-
-         self.transformers = nn.Sequential(*transformer_blocks)
-         self.final_layer_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
-         post_net: List[nn.Module] = [
-             ResnetBlock(
-                 in_channels=block_in,
-                 out_channels=block_in,
-                 temb_channels=self.temb_ch,
-                 dropout=dropout,
-             ),
-             ResnetBlock(
-                 in_channels=block_in,
-                 out_channels=block_in,
-                 temb_channels=self.temb_ch,
-                 dropout=dropout,
-             ),
-         ]
-         self.post_net = nn.Sequential(*post_net)
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         x = x.transpose(1, 2)
-         x = self.embed(x)
-         x = self.prior_net(x)
-         x = x.transpose(1, 2)
-         x = self.transformers(x)
-         x = x.transpose(1, 2)
-         x = self.post_net(x)
-         x = x.transpose(1, 2)
-         x = self.final_layer_norm(x)
-         return x
-
-
- def init_weights(m):
-     if isinstance(m, nn.Conv1d):
-         nn.init.trunc_normal_(m.weight, std=0.02)
-         nn.init.constant_(m.bias, 0)
-
-
- class CodecDecoderVocos(nn.Module):
-     def __init__(
-         self,
-         hidden_dim=1024,
-         depth=12,
-         heads=16,
-         pos_meb_dim=64,
-         hop_length=320,
-         vq_num_quantizers=1,
-         vq_dim=2048,  # 1024 2048
-         vq_commit_weight=0.25,
-         vq_weight_init=False,
-         vq_full_commit_loss=False,
-         codebook_size=16384,
-         codebook_dim=16,
-     ):
-         super().__init__()
-         self.hop_length = hop_length
-
-         self.quantizer = ResidualFSQ(
-             dim=vq_dim, levels=[4, 4, 4, 4, 4, 4, 4, 4], num_quantizers=1
-         )
-
-         self.backbone = VocosBackbone(
-             hidden_dim=hidden_dim, depth=depth, heads=heads, pos_meb_dim=pos_meb_dim
-         )
-
-         self.head = ISTFTHead(
-             dim=hidden_dim,
-             n_fft=self.hop_length * 4,
-             hop_length=self.hop_length,
-             padding="same",
-         )
-
-         self.reset_parameters()
-
-     def forward(self, x, vq=True):
-         if vq is True:
-             # x, q, commit_loss = self.quantizer(x)
-             x = x.permute(0, 2, 1)
-             x, q = self.quantizer(x)
-             x = x.permute(0, 2, 1)
-             q = q.permute(0, 2, 1)
-             return x, q, None
-         x = self.backbone(x)
-         x, _ = self.head(x)
-
-         return x, _
-
-     def vq2emb(self, vq):
-         self.quantizer = self.quantizer.eval()
-         x = self.quantizer.vq2emb(vq)
-         return x
-
-     def get_emb(self):
-         self.quantizer = self.quantizer.eval()
-         embs = self.quantizer.get_emb()
-         return embs
-
-     def inference_vq(self, vq):
-         x = vq[None, :, :]
-         x = self.model(x)
-         return x
-
-     def inference_0(self, x):
-         x, q, loss, perp = self.quantizer(x)
-         x = self.model(x)
-         return x, None
-
-     def inference(self, x):
-         x = self.model(x)
-         return x, None
-
-     def remove_weight_norm(self):
-         """Remove weight normalization module from all of the layers."""
-
-         def _remove_weight_norm(m):
-             try:
-                 torch.nn.utils.remove_weight_norm(m)
-             except ValueError:  # this module didn't have weight norm
-                 return
-
-         self.apply(_remove_weight_norm)
-
-     def apply_weight_norm(self):
-         """Apply weight normalization module from all of the layers."""
-
-         def _apply_weight_norm(m):
-             if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
-                 torch.nn.utils.weight_norm(m)
-
-         self.apply(_apply_weight_norm)
-
-     def reset_parameters(self):
-         self.apply(init_weights)
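
The custom ISTFT exists because torch.istft only supports center padding under its NOLA check, whereas a vocoder wants "same" padding; with win_length = n_fft = 4 * hop the trimmed output works out to exactly hop_length samples per frame. A shape sketch, assuming the pre-removal module:

    import torch
    from neucodec.codec_decoder_vocos import ISTFT, CodecDecoderVocos

    n_fft, hop = 1280, 320
    istft = ISTFT(n_fft=n_fft, hop_length=hop, win_length=n_fft, padding="same")
    spec = torch.randn(1, n_fft // 2 + 1, 50, dtype=torch.complex64)  # (B, N, T)
    wav = istft(spec)
    assert wav.shape[-1] == 50 * hop  # 16_000 samples for 50 frames

    # the decoder's two modes: vq=True quantizes [B, vq_dim, T] features to
    # FSQ indices; vq=False runs the backbone + ISTFT head to produce audio
    dec = CodecDecoderVocos(hop_length=hop)
    feats = torch.randn(1, 2048, 50)
    _, codes, _ = dec(feats, vq=True)  # residual-FSQ token indices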
neucodec/codec_encoder.py DELETED
@@ -1,84 +0,0 @@
- import torch
- import numpy as np
-
- from torch import nn
-
- from .module import WNConv1d, EncoderBlock
- from .alias_free_torch import Activation1d
- from . import activations
-
-
- def init_weights(m):
-     if isinstance(m, nn.Conv1d):
-         nn.init.trunc_normal_(m.weight, std=0.02)
-         nn.init.constant_(m.bias, 0)
-
-
- class CodecEncoder(nn.Module):
-     def __init__(
-         self,
-         ngf=48,
-         up_ratios=[2, 2, 4, 4, 5],
-         dilations=(1, 3, 9),
-         hidden_dim=1024,
-         depth=12,
-         heads=12,
-         pos_meb_dim=64,
-     ):
-         super().__init__()
-         self.hop_length = np.prod(up_ratios)
-         self.ngf = ngf
-         self.up_ratios = up_ratios
-
-         d_model = ngf
-         self.conv_blocks = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
-
-         for i, stride in enumerate(up_ratios):
-             d_model *= 2
-             self.conv_blocks += [
-                 EncoderBlock(d_model, stride=stride, dilations=dilations)
-             ]
-
-         self.conv_blocks = nn.Sequential(*self.conv_blocks)
-
-         self.conv_final_block = [
-             Activation1d(
-                 activation=activations.SnakeBeta(d_model, alpha_logscale=True)
-             ),
-             WNConv1d(d_model, hidden_dim, kernel_size=3, padding=1),
-         ]
-         self.conv_final_block = nn.Sequential(*self.conv_final_block)
-
-         self.reset_parameters()
-
-     def forward(self, x):
-         x = self.conv_blocks(x)
-         x = self.conv_final_block(x)
-         x = x.permute(0, 2, 1)
-         return x
-
-     def inference(self, x):
-         return self.block(x)
-
-     def remove_weight_norm(self):
-         """Remove weight normalization module from all of the layers."""
-
-         def _remove_weight_norm(m):
-             try:
-                 torch.nn.utils.remove_weight_norm(m)
-             except ValueError:  # this module didn't have weight norm
-                 return
-
-         self.apply(_remove_weight_norm)
-
-     def apply_weight_norm(self):
-         """Apply weight normalization module from all of the layers."""
-
-         def _apply_weight_norm(m):
-             if isinstance(m, nn.Conv1d):
-                 torch.nn.utils.weight_norm(m)
-
-         self.apply(_apply_weight_norm)
-
-     def reset_parameters(self):
-         self.apply(init_weights)
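
CodecEncoder downsamples mono audio by prod(up_ratios) = 2*2*4*4*5 = 320 samples per frame, doubling channels at each stage (48 -> 1536) before the final projection to hidden_dim. A shape check, assuming the pre-removal module:

    import torch
    from neucodec.codec_encoder import CodecEncoder

    enc = CodecEncoder()             # hop_length = 320
    wav = torch.randn(1, 1, 16_000)  # 1 s of 16 kHz mono audio, [B, 1, T]
    feats = enc(wav)                 # permuted to [B, T // 320, hidden_dim]
    assert feats.shape == (1, 50, 1024)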
neucodec/model.py DELETED
@@ -1,269 +0,0 @@
- import soundfile as sf
- import os
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torchaudio
-
- from typing import Optional
- from torchaudio import transforms as T
- from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
-
- from .codec_encoder import CodecEncoder
- from .codec_decoder_vocos import CodecDecoderVocos
- from .module import SemanticEncoder
-
-
- class NeuCodec(nn.Module):
-     def __init__(self, ckpt_path: str, sample_rate: int, hop_length: int):
-         super().__init__()
-
-         # load ckpt
-         ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
-         self.sample_rate = sample_rate
-         self.hop_length = hop_length
-
-         # load modules
-         self.semantic_model = Wav2Vec2BertModel.from_pretrained(
-             "facebook/w2v-bert-2.0", output_hidden_states=True
-         )
-         self.semantic_model.eval()
-         self.feature_extractor = AutoFeatureExtractor.from_pretrained(
-             "facebook/w2v-bert-2.0"
-         )
-         self.SemanticEncoder_module = SemanticEncoder(1024, 1024, 1024)
-         self.CodecEnc = CodecEncoder()
-         self.generator = CodecDecoderVocos(hop_length=hop_length)
-         self.fc_prior = nn.Linear(2048, 2048)
-         self.fc_post_a = nn.Linear(2048, 1024)
-
-         # load checkpoint
-         self._load_ckpt(ckpt)
-
-     def _load_ckpt(self, ckpt):
-         # differentiate between `.ckpt` and `.bin`
-         if ckpt.get("state_dict"):
-             state_dicts = ckpt.get("state_dict")
-         else:
-             state_dicts = ckpt
-
-         # assign keys to correct model components
-         filtered_enc = {}
-         filtered_gen = {}
-         filtered_post = {}
-         filtered_prior = {}
-         filtered_semantic = {}
-         for key, value in state_dicts.items():
-             if key.startswith("CodecEnc."):
-                 new_key = key[len("CodecEnc."):]
-                 filtered_enc[new_key] = value
-             elif key.startswith("generator."):
-                 new_key = key[len("generator."):]
-                 filtered_gen[new_key] = value
-             elif key.startswith("fc_post_a."):
-                 new_key = key[len("fc_post_a."):]
-                 filtered_post[new_key] = value
-             elif key.startswith("SemanticEncoder_module."):
-                 new_key = key[len("SemanticEncoder_module."):]
-                 filtered_semantic[new_key] = value
-             elif key.startswith("fc_prior."):
-                 new_key = key[len("fc_prior."):]
-                 filtered_prior[new_key] = value
-
-         # load
-         self.CodecEnc.load_state_dict(filtered_enc)
-         self.CodecEnc.eval()
-         self.generator.load_state_dict(filtered_gen, strict=False)
-         self.generator.eval()
-         self.fc_post_a.load_state_dict(filtered_post)
-         self.fc_post_a.eval()
-         self.fc_prior.load_state_dict(filtered_prior)
-         self.SemanticEncoder_module.load_state_dict(filtered_semantic)
-         self.SemanticEncoder_module.eval()
-
-     @torch.inference_mode()
-     def encode_code(
-         self,
-         input_waveform: torch.Tensor,
-         semantic_features: torch.Tensor = None,
-         sample_rate: int = 16_000,
-     ) -> torch.Tensor:
-         pad_for_wav = 320 - (input_waveform.shape[1] % 320)
-         input_waveform = torch.nn.functional.pad(input_waveform, (0, pad_for_wav))
-
-         if semantic_features is None:
-             semantic_features = self.feature_extractor(
-                 input_waveform, sampling_rate=sample_rate, return_tensors="pt"
-             ).input_features.to(self.device)  # [batch, frames, feat_dim]
-         else:
-             semantic_features = semantic_features[:, 0, :, :]
-
-         semantic_output = self.semantic_model(semantic_features)
-         semantic_hidden_16 = semantic_output.hidden_states[16]
-         semantic_hidden_16 = semantic_hidden_16.transpose(
-             1, 2
-         )  # [batch, hidden_dim, frames]
-         semantic_encoded = self.SemanticEncoder_module(semantic_hidden_16)
-         if len(input_waveform.shape) == 2:
-             wav = input_waveform.unsqueeze(1).to(self.device)  # shape: [batch, 1, time]
-         else:
-             wav = input_waveform.to(self.device)
-
-         vq_emb = self.CodecEnc(wav)  # [batch, time//down, 1024]
-         vq_emb = vq_emb.transpose(1, 2)  # -> [batch, 1024, frames]
-
-         if vq_emb.shape[-1] != semantic_encoded.shape[-1]:
-             min_len = min(vq_emb.shape[-1], semantic_encoded.shape[-1])
-             vq_emb = vq_emb[:, :, :min_len]
-             semantic_encoded = semantic_encoded[:, :, :min_len]
-         concat_emb = torch.cat(
-             [semantic_encoded, vq_emb], dim=1
-         )  # [batch, 2048, frames]
-         concat_emb = self.fc_prior(concat_emb.transpose(1, 2)).transpose(1, 2)
-         _, vq_code, _ = self.generator(concat_emb, vq=True)
-         return vq_code
-
-     @torch.inference_mode()
-     def decode_code(self, vq_code: torch.Tensor) -> torch.Tensor:
-         vq_post_emb = self.generator.quantizer.get_output_from_indices(
-             vq_code.transpose(1, 2)
-         )
-         vq_post_emb = vq_post_emb.transpose(1, 2)  # [batch, 1024, frames]
-         vq_post_emb = self.fc_post_a(vq_post_emb.transpose(1, 2)).transpose(
-             1, 2
-         )  # [batch, 1024, frames]
-         recon_audio = self.generator(vq_post_emb.transpose(1, 2), vq=False)[
-             0
-         ]  # [batch, time]
-         return recon_audio
-
-     @torch.inference_mode()
-     def autoencode(self, fpath: str, output_fpath: Optional[str] = None):
-         y, sr = torchaudio.load(fpath)
-         if sr != 16_000:
-             y = T.Resample(sr, 16_000)(y)
-         vq_codes = self.encode_code(y)
-         recon = self.decode_code(vq_codes)
-
-         if output_fpath is None:
-             name, fext = os.path.splitext(fpath)
-             output_fpath = f"{name}_recon{fext}"
-
-         sf.write(output_fpath, recon[0, 0, :].cpu(), self.sample_rate)
-
-     @torch.inference_mode()
-     def batch_encode(
-         self, fpaths: list[str], return_tensor: bool = False
-     ) -> tuple[list[torch.Tensor], list[int]] | tuple[torch.Tensor, list[int]]:
-         # prepare batch
-         wavs_batch, semantic_batch, token_durations = self._pad_batch(
-             [self._preprocess_file(fpath) for fpath in fpaths]
-         )
-         vq_codes = self.encode_code(wavs_batch, semantic_batch)
-
-         # return, unpad if we want to
-         if return_tensor:
-             return vq_codes, list(token_durations)
-
-         unpadded_vq_codes = []
-         for idx, token_dur in enumerate(token_durations):
-             curr_codes = vq_codes[idx, :, :token_dur]
-             unpadded_vq_codes.append(curr_codes)
-
-         return unpadded_vq_codes, None
-
-     @torch.inference_mode()
-     def batch_decode(
-         self,
-         vq_codes: list[torch.Tensor] | torch.Tensor,
-         token_durations: Optional[list[int]] = None,
-     ):
-         # pad tensor if need be
-         if isinstance(vq_codes, list):
-             vq_codes, token_durations = self._pad_codes(vq_codes)
-         else:
-             assert token_durations is not None
-
-         # decode
-         recons = self.decode_code(vq_codes)
-
-         # unpad
-         cut_recons = []
-         for idx, token_dur in enumerate(token_durations):
-             curr_recon = recons[idx, :, : int(token_dur * self.hop_length)]
-             cut_recons.append(curr_recon)
-
-         return cut_recons
-
-     @torch.inference_mode()
-     def batch_autoencode(
-         self, fpaths: list[str], output_fpaths: Optional[list[str]] = None
-     ) -> list[torch.Tensor]:
-         vq_codes, token_durations = self.batch_encode(fpaths, return_tensor=True)
-         cut_recons = self.batch_decode(vq_codes, token_durations)
-
-         if output_fpaths:
-             for recon, output_fpath in zip(cut_recons, output_fpaths):
-                 sf.write(output_fpath, recon.cpu().numpy()[0, :], self.sample_rate)
-
-         return cut_recons
-
-     def _preprocess_file(self, fpath: str):
-         # load and resample
-         y, sr = torchaudio.load(fpath)
-         if sr != 16_000:
-             y = T.Resample(sr, 16_000)(y)
-
-         # compute duration for any cutting we might need to do, in terms of n_tokens
-         token_duration = int((y.shape[-1] / 16_000) * 50)
-
-         # get semantic model features: [harry] note i don't think this can be batched
-         semantic_model_input = self.feature_extractor(
-             y, sampling_rate=16_000, return_tensors="pt"
-         ).input_features
-
-         return y.to(self.device), semantic_model_input.to(self.device), token_duration
-
-     def _pad_batch(self, batch: list[tuple[torch.Tensor, torch.Tensor, int]]):
-         # unpack batch
-         wavs, semantic_features, token_durations = zip(*batch)
-         max_length_semantic = max([f.shape[1] for f in semantic_features])
-         max_length = max_length_semantic * 320
-
-         # pad wavs
-         wavs_padded = []
-         for audio in wavs:
-             padding = max_length - audio.shape[1]
-             if padding > 0:
-                 padded_audio = F.pad(audio, (0, padding), mode="constant", value=0)
-             else:
-                 padded_audio = audio[:, :max_length]
-             wavs_padded.append(padded_audio)
-         wavs_tensor = torch.stack(wavs_padded)
-
-         # pad semantic features
-         semantic_features_padded = []
-         for feat in semantic_features:
-             padding = max_length_semantic - feat.shape[1]
-             padded_feat = F.pad(feat, (0, 0, 0, padding), mode="constant", value=0)
-             semantic_features_padded.append(padded_feat)
-         semantic_feature_tensor = torch.stack(semantic_features_padded)
-
-         return wavs_tensor, semantic_feature_tensor, token_durations
-
-     def _pad_codes(self, vq_codes: list[torch.Tensor]):
-         max_len = max([i.shape[-1] for i in vq_codes])
-         token_durations = []
-         padded_codes = []
-         for curr_codes in vq_codes:
-             curr_len = curr_codes.shape[-1]
-             token_durations.append(curr_len)
-             padding = max_len - curr_len
-             curr_codes = F.pad(curr_codes, (0, padding), mode="constant", value=0)
-             padded_codes.append(curr_codes)
-         return torch.stack(padded_codes), token_durations
-
-     @property
-     def device(self):
-         return next(self.parameters()).device
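
End to end, the removed NeuCodec wrapper fuses W2v-BERT semantic features with the acoustic encoder output, quantizes to FSQ indices at roughly 50 tokens per second of 16 kHz audio, and vocodes indices back to a waveform. A usage sketch; the checkpoint path and input file below are illustrative, and a compatible checkpoint is assumed to exist:

    import torch
    from neucodec import NeuCodec

    codec = NeuCodec(ckpt_path="neucodec.ckpt", sample_rate=16_000, hop_length=320)

    wav = torch.randn(1, 16_000)      # [batch, samples], 1 s at 16 kHz
    codes = codec.encode_code(wav)    # [batch, 1, ~50] FSQ token indices
    recon = codec.decode_code(codes)  # [batch, 1, ~16_000] reconstructed audio

    # or round-trip a file; writes `<name>_recon.<ext>` next to the input
    codec.autoencode("speech.wav")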
neucodec/module.py DELETED
@@ -1,114 +0,0 @@
- import torch.nn as nn
-
- from torch.nn.utils import weight_norm
-
- from .activations import SnakeBeta
- from .alias_free_torch import Activation1d
-
-
- def WNConv1d(*args, **kwargs):
-     return weight_norm(nn.Conv1d(*args, **kwargs))
-
-
- class ResidualUnit(nn.Module):
-     def __init__(self, dim: int = 16, dilation: int = 1):
-         super().__init__()
-         pad = ((7 - 1) * dilation) // 2
-         self.block = nn.Sequential(
-             Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
-             WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
-             Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
-             WNConv1d(dim, dim, kernel_size=1),
-         )
-
-     def forward(self, x):
-         return x + self.block(x)
-
-
- class EncoderBlock(nn.Module):
-     def __init__(self, dim: int = 16, stride: int = 1, dilations=(1, 3, 9)):
-         super().__init__()
-         runits = [ResidualUnit(dim // 2, dilation=d) for d in dilations]
-         self.block = nn.Sequential(
-             *runits,
-             Activation1d(activation=SnakeBeta(dim // 2, alpha_logscale=True)),
-             WNConv1d(
-                 dim // 2,
-                 dim,
-                 kernel_size=2 * stride,
-                 stride=stride,
-                 padding=stride // 2 + stride % 2,
-             ),
-         )
-
-     def forward(self, x):
-         return self.block(x)
-
-
- class SemanticEncoder(nn.Module):
-     def __init__(
-         self,
-         input_channels: int,
-         code_dim: int,
-         encode_channels: int,
-         kernel_size: int = 3,
-         bias: bool = True,
-     ):
-         super(SemanticEncoder, self).__init__()
-
-         # initial convolution: map input_channels to encode_channels
-         self.initial_conv = nn.Conv1d(
-             in_channels=input_channels,
-             out_channels=encode_channels,
-             kernel_size=kernel_size,
-             stride=1,
-             padding=(kernel_size - 1) // 2,
-             bias=False,
-         )
-
-         # residual blocks
-         self.residual_blocks = nn.Sequential(
-             nn.ReLU(inplace=True),
-             nn.Conv1d(
-                 encode_channels,
-                 encode_channels,
-                 kernel_size=kernel_size,
-                 stride=1,
-                 padding=(kernel_size - 1) // 2,
-                 bias=bias,
-             ),
-             nn.ReLU(inplace=True),
-             nn.Conv1d(
-                 encode_channels,
-                 encode_channels,
-                 kernel_size=kernel_size,
-                 stride=1,
-                 padding=(kernel_size - 1) // 2,
-                 bias=bias,
-             ),
-         )
-
-         # final convolution: map encode_channels to code_dim
-         self.final_conv = nn.Conv1d(
-             in_channels=encode_channels,
-             out_channels=code_dim,
-             kernel_size=kernel_size,
-             stride=1,
-             padding=(kernel_size - 1) // 2,
-             bias=False,
-         )
-
-     def forward(self, x):
-         """
-         Forward pass.
-
-         Args:
-             x (Tensor): Input tensor of shape (Batch, Input_channels, Length)
-
-         Returns:
-             Tensor: Encoded tensor of shape (Batch, Code_dim, Length)
-         """
-         x = self.initial_conv(x)         # (Batch, Encode_channels, Length)
-         x = self.residual_blocks(x) + x  # residual connection
-         x = self.final_conv(x)           # (Batch, Code_dim, Length)
-         return x
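
SemanticEncoder is a length-preserving 1-D conv stack: an initial projection, a two-conv residual branch, and a final projection, all with "same" padding. A quick shape sketch, assuming the pre-removal module:

    import torch
    from neucodec.module import SemanticEncoder

    enc = SemanticEncoder(input_channels=1024, code_dim=1024, encode_channels=1024)
    h = torch.randn(1, 1024, 50)  # e.g. one W2v-BERT hidden state, [B, C, T]
    z = enc(h)                    # [B, code_dim, T], length unchanged
    assert z.shape == (1, 1024, 50)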
setup.py DELETED
@@ -1,32 +0,0 @@
- from setuptools import setup, find_packages
-
-
- setup(
-     name='neucodec',
-     version='0.0.1',
-     description='A package for neucodec, based on xcodec2.',
-     long_description_content_type='text/markdown',
-     author='Harry Julian',
-     author_email='[email protected]',
-     packages=find_packages(),
-     install_requires=[
-         'librosa',
-         'soundfile',
-         'numpy>=2.0.2',
-         'omegaconf>=2.3.0',
-         'torch>=2.5.1',
-         'torchaudio>=2.5.1',
-         'torchao>=0.5.0',
-         'torchtune>=0.3.1',
-         'vector-quantize-pytorch>=1.17.8',
-         'rotary-embedding-torch>=0.8.4',
-         'transformers>=4.44.2',
-         'boto3>1.0',
-         'tqdm',
-     ],
-     classifiers=[
-         'Programming Language :: Python',
-         'Programming Language :: Python :: 3',
-         'Programming Language :: Python :: 3.10',
-     ],
- )
tests/__init__.py DELETED
File without changes
tests/test_neucodec.py DELETED
@@ -1,128 +0,0 @@
- import pytest
- import torch
- import torchaudio
- import librosa
- from xcodec2 import XCodec2, MiniXCodec2Encoder
-
-
- @pytest.fixture
- def model_16khz():
-     return XCodec2.from_cache("16khz")
-
-
- @pytest.fixture
- def model_24khz():
-     return XCodec2.from_cache("24khz")
-
-
- @pytest.fixture
- def model_asr_encoder():
-     return MiniXCodec2Encoder.from_cache()
-
-
- @pytest.fixture
- def example_audio():
-     y, sr = torchaudio.load(librosa.ex("libri1"))
-     return y, sr
-
-
- @pytest.fixture
- def example_fpath():
-     return librosa.ex("libri1")
-
-
- @pytest.fixture
- def batch_fpaths():
-     return [librosa.ex("libri1"), librosa.ex("libri2")]
-
-
- def load_and_validate_audio(save_path, sample_rate):
-     _, sr = torchaudio.load(save_path)
-     assert sr == sample_rate
-
-
- def test_16khz_autoencode(example_fpath, tmp_path, model_16khz):
-     save_path = str(tmp_path / "0.wav")
-     model_16khz.autoencode(example_fpath, save_path)
-     load_and_validate_audio(save_path, 16_000)
-
-
- def test_24khz_autoencode(example_fpath, tmp_path, model_24khz):
-     save_path = str(tmp_path / "0.wav")
-     model_24khz.autoencode(example_fpath, save_path)
-     load_and_validate_audio(save_path, 24_000)
-
-
- def test_24khz_encode_decode_single(example_audio, model_24khz):
-     y, sr = example_audio
-     if sr != 16_000:
-         y = torchaudio.transforms.Resample(sr, 16_000)(y)
-         sr = 16_000
-
-     # encode
-     vq_codes = model_24khz.encode_code(y, sample_rate=sr)
-     assert isinstance(vq_codes, torch.Tensor)
-     assert vq_codes.dim() == 3  # [batch, channels, time]
-
-     # decode
-     reconstructed = model_24khz.decode_code(vq_codes)
-     assert isinstance(reconstructed, torch.Tensor)
-     assert reconstructed.dim() == 3  # [batch, channels, time]
-
-
- def test_24khz_batch_encode(batch_fpaths, model_24khz):
-     vq_codes_list, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=False)
-     assert isinstance(vq_codes_list, list)
-     assert token_durations is None
-     assert len(vq_codes_list) == 2
-
-     for codes in vq_codes_list:
-         assert isinstance(codes, torch.Tensor)
-         assert codes.dim() == 2  # [channels, time]
-
-
- def test_24khz_batch_encode_tensor(batch_fpaths, model_24khz):
-     vq_codes_tensor, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=True)
-     assert isinstance(vq_codes_tensor, torch.Tensor)
-     assert isinstance(token_durations, list)
-     assert vq_codes_tensor.dim() == 3  # [batch, channels, time]
-     assert len(token_durations) == 2
-     assert len(set(token_durations)) == 2  # ensure we get two different durations back
-
-
- def test_24khz_batch_decode(batch_fpaths, model_24khz):
-     vq_codes_tensor, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=True)
-     reconstructed_list = model_24khz.batch_decode(vq_codes_tensor, token_durations)
-     assert isinstance(reconstructed_list, list)
-     assert len(reconstructed_list) == 2
-     for recon in reconstructed_list:
-         assert isinstance(recon, torch.Tensor)
-         assert recon.dim() == 2  # [channels, time]
-
-
- def test_24khz_batch_decode_list_input(batch_fpaths, model_24khz):
-     vq_codes_list, _ = model_24khz.batch_encode(batch_fpaths, return_tensor=False)
-     reconstructed_list = model_24khz.batch_decode(vq_codes_list)
-     assert isinstance(reconstructed_list, list)
-     assert len(reconstructed_list) == 2
-     for recon in reconstructed_list:
-         assert isinstance(recon, torch.Tensor)
-         assert recon.dim() == 2  # [channels, time]
-
-
- def test_24khz_batch_autoencode(batch_fpaths, tmp_path, model_24khz):
-     output_paths = [str(tmp_path / f"{i}.wav") for i in range(len(batch_fpaths))]
-     reconstructed_list = model_24khz.batch_autoencode(batch_fpaths, output_paths)
-     assert isinstance(reconstructed_list, list)
-     assert len(reconstructed_list) == 2
-     for i, output_path in enumerate(output_paths):
-         load_and_validate_audio(output_path, 24_000)
-
-
- def test_asr_encoder_encode(example_audio, model_asr_encoder):
-     y, sr = example_audio
-     if sr != model_asr_encoder.sample_rate:
-         y = torchaudio.transforms.Resample(sr, model_asr_encoder.sample_rate)(y)
-     vq_codes = model_asr_encoder.encode_code(y)
-     assert isinstance(vq_codes, torch.Tensor)
-     assert vq_codes.dim() == 3