Audio Processor

AudioProcessor

A class used to process audio signals and convert them into different representations.

Attributes:

Name         Type  Description
hann_window  dict  A dictionary to store the Hann window for different configurations.
mel_basis    dict  A dictionary to store the Mel basis for different configurations.

Methods:

Name            Description
name_mel_basis  Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.
amp_to_db       Convert amplitude to decibels (dB).
db_to_amp       Convert decibels (dB) to amplitude.
wav_to_spec     Convert a waveform to a spectrogram and compute the magnitude.
wav_to_energy   Convert a waveform to a spectrogram and compute the energy.
spec_to_mel     Convert a spectrogram to a Mel spectrogram.
wav_to_mel      Convert a waveform to a Mel spectrogram.

Source code in training/preprocess/audio_processor.py
class AudioProcessor:
    r"""A class used to process audio signals and convert them into different representations.

    Attributes:
        hann_window (dict): A dictionary to store the Hann window for different configurations.
        mel_basis (dict): A dictionary to store the Mel basis for different configurations.

    Methods:
        name_mel_basis(spec, n_fft, fmax): Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.
        amp_to_db(magnitudes, C=1, clip_val=1e-5): Convert amplitude to decibels (dB).
        db_to_amp(magnitudes, C=1): Convert decibels (dB) to amplitude.
        wav_to_spec(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the magnitude.
        wav_to_energy(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the energy.
        spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): Convert a spectrogram to a Mel spectrogram.
        wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): Convert a waveform to a Mel spectrogram.
    """

    def __init__(self):
        self.hann_window = {}
        self.mel_basis = {}

    @staticmethod
    def name_mel_basis(spec: torch.Tensor, n_fft: int, fmax: int) -> str:
        """Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

        Args:
            spec (torch.Tensor): The spectrogram tensor.
            n_fft (int): The FFT size.
            fmax (int): The maximum frequency.

        Returns:
            str: The generated name for the Mel basis.
        """
        n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}"
        return n_fft_len

    @staticmethod
    def amp_to_db(magnitudes: torch.Tensor, C: int = 1, clip_val: float = 1e-5) -> torch.Tensor:
        r"""Convert amplitude to decibels (dB).

        Args:
            magnitudes (Tensor): The amplitude magnitudes to convert.
            C (int, optional): A constant value used in the conversion. Defaults to 1.
            clip_val (float, optional): A value to clamp the magnitudes to avoid taking the log of zero. Defaults to 1e-5.

        Returns:
            Tensor: The converted magnitudes in dB.
        """
        return torch.log(torch.clamp(magnitudes, min=clip_val) * C)

    @staticmethod
    def db_to_amp(magnitudes: torch.Tensor, C: int = 1) -> torch.Tensor:
        r"""Convert decibels (dB) to amplitude.

        Args:
            magnitudes (Tensor): The dB magnitudes to convert.
            C (int, optional): A constant value used in the conversion. Defaults to 1.

        Returns:
            Tensor: The converted magnitudes in amplitude.
        """
        return torch.exp(magnitudes) / C

    def wav_to_spec(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the magnitude.

        Args:
            y (Tensor): The input waveform.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The magnitude of the computed spectrogram.
        """
        y = y.squeeze(1)

        dtype_device = str(y.dtype) + "_" + str(y.device)
        wnsize_dtype_device = str(win_length) + "_" + dtype_device
        if wnsize_dtype_device not in self.hann_window:
            self.hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device)

        y = torch.nn.functional.pad(
            y.unsqueeze(1),
            (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
            mode="reflect",
        )
        y = y.squeeze(1)

        spec = torch.stft(
            y,
            n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.hann_window[wnsize_dtype_device],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )

        spec = torch.view_as_real(spec)

        # Compute the magnitude
        spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

        return spec

    def wav_to_energy(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the energy.

        Args:
            y (Tensor): The input waveform.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The energy of the computed spectrogram.
        """
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)
        spec = torch.norm(spec, dim=1, keepdim=True).squeeze(0)

        # Normalize the energy
        return (spec - spec.mean()) / spec.std()

    def spec_to_mel(
            self,
            spec: torch.Tensor,
            n_fft: int,
            num_mels: int,
            sample_rate: int,
            fmin: int,
            fmax: int,
    ) -> torch.Tensor:
        r"""Convert a spectrogram to a Mel spectrogram.

        Args:
            spec (torch.Tensor): The input spectrogram of shape [B, C, T].
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.

        Returns:
            torch.Tensor: The computed Mel spectrogram of shape [B, C, T].
        """
        mel_basis_key = self.name_mel_basis(spec, n_fft, fmax)

        if mel_basis_key not in self.mel_basis:
            mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
            self.mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)

        mel = torch.matmul(self.mel_basis[mel_basis_key], spec)
        mel = self.amp_to_db(mel)

        return mel

    def wav_to_mel(
        self,
        y: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        hop_length: int,
        win_length: int,
        fmin: int,
        fmax: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a Mel spectrogram.

        Args:
            y (torch.Tensor): The input waveform.
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            torch.Tensor: The computed Mel spectrogram.
        """
        # Convert the waveform to a spectrogram
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)

        # Convert the spectrogram to a Mel spectrogram
        mel = self.spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)

        return mel
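
A minimal usage sketch (not part of the source): it assumes the class is importable from the module path shown above and uses illustrative STFT settings. The waveform shape [batch, channel, samples] matches the squeeze(1) call in wav_to_spec.

import torch

# Assumed import path, based on "Source code in training/preprocess/audio_processor.py".
from training.preprocess.audio_processor import AudioProcessor

processor = AudioProcessor()

# One second of random audio at 22.05 kHz, shaped [batch, channel, samples].
y = torch.randn(1, 1, 22050)

# Illustrative STFT settings; the class does not prescribe any particular values.
mel = processor.wav_to_mel(
    y,
    n_fft=1024,
    num_mels=80,
    sample_rate=22050,
    hop_length=256,
    win_length=1024,
    fmin=0,
    fmax=8000,
)
print(mel.shape)  # torch.Size([1, 80, 86]) -- about 22050 / 256 frames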

amp_to_db(magnitudes, C=1, clip_val=1e-05) staticmethod

Convert amplitude to decibels (dB). Note that the implementation applies the natural logarithm, so the output is log-amplitude rather than calibrated dB (which would use 20 · log10).

Parameters:

Name        Type    Description                                                        Default
magnitudes  Tensor  The amplitude magnitudes to convert.                               required
C           int     A constant value used in the conversion.                           1
clip_val    float   A value to clamp the magnitudes to avoid taking the log of zero.   1e-05

Returns:

Name    Type    Description
Tensor  Tensor  The converted magnitudes in dB.

Source code in training/preprocess/audio_processor.py
@staticmethod
def amp_to_db(magnitudes: torch.Tensor, C: int = 1, clip_val: float = 1e-5) -> torch.Tensor:
    r"""Convert amplitude to decibels (dB).

    Args:
        magnitudes (Tensor): The amplitude magnitudes to convert.
        C (int, optional): A constant value used in the conversion. Defaults to 1.
        clip_val (float, optional): A value to clamp the magnitudes to avoid taking the log of zero. Defaults to 1e-5.

    Returns:
        Tensor: The converted magnitudes in dB.
    """
    return torch.log(torch.clamp(magnitudes, min=clip_val) * C)
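
A small worked example (illustrative values): zero magnitudes are clamped to clip_val before the natural logarithm is taken, so the result stays finite.

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

mags = torch.tensor([0.0, 1e-5, 1.0])
print(AudioProcessor.amp_to_db(mags))
# tensor([-11.5129, -11.5129, 0.0000]) -- log(1e-5) ~ -11.51; the zero was clamped to clip_val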

db_to_amp(magnitudes, C=1) staticmethod

Convert decibels (dB) to amplitude.

Parameters:

Name        Type    Description                                Default
magnitudes  Tensor  The dB magnitudes to convert.              required
C           int     A constant value used in the conversion.   1

Returns:

Name    Type    Description
Tensor  Tensor  The converted magnitudes in amplitude.

Source code in training/preprocess/audio_processor.py
@staticmethod
def db_to_amp(magnitudes: torch.Tensor, C: int = 1) -> torch.Tensor:
    r"""Convert decibels (dB) to amplitude.

    Args:
        magnitudes (Tensor): The dB magnitudes to convert.
        C (int, optional): A constant value used in the conversion. Defaults to 1.

    Returns:
        Tensor: The converted magnitudes in amplitude.
    """
    return torch.exp(magnitudes) / C
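
With the same C, db_to_amp inverts amp_to_db exactly, up to the clamping applied there. A sketch with illustrative values:

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

x = torch.tensor([0.0, 0.5, 2.0])
round_trip = AudioProcessor.db_to_amp(AudioProcessor.amp_to_db(x))
print(round_trip)  # tensor([1.0000e-05, 5.0000e-01, 2.0000e+00]) -- 0.0 comes back as clip_val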

name_mel_basis(spec, n_fft, fmax) staticmethod

Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

Parameters:

Name   Type    Description               Default
spec   Tensor  The spectrogram tensor.   required
n_fft  int     The FFT size.             required
fmax   int     The maximum frequency.    required

Returns:

Name  Type  Description
str   str   The generated name for the Mel basis.

Source code in training/preprocess/audio_processor.py
@staticmethod
def name_mel_basis(spec: torch.Tensor, n_fft: int, fmax: int) -> str:
    """Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

    Args:
        spec (torch.Tensor): The spectrogram tensor.
        n_fft (int): The FFT size.
        fmax (int): The maximum frequency.

    Returns:
        str: The generated name for the Mel basis.
    """
    n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}"
    return n_fft_len
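
For example, a float32 spectrogram on the CPU with n_fft=1024 and fmax=8000 produces the key below; spec_to_mel uses it to cache one Mel basis per configuration.

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

spec = torch.zeros(1, 513, 10)  # only dtype and device matter for the key
print(AudioProcessor.name_mel_basis(spec, n_fft=1024, fmax=8000))
# 1024_8000_torch.float32_cpu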

spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)

Convert a spectrogram to a Mel spectrogram.

Parameters:

Name Type Description Default
spec Tensor

The input spectrogram of shape [B, C, T].

required
n_fft int

The FFT size.

required
num_mels int

The number of Mel bands.

required
sample_rate int

The sample rate of the audio.

required
fmin int

The minimum frequency.

required
fmax int

The maximum frequency.

required

Returns:

Type Description
Tensor

torch.Tensor: The computed Mel spectrogram of shape [B, C, T].

Source code in training/preprocess/audio_processor.py
def spec_to_mel(
        self,
        spec: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        fmin: int,
        fmax: int,
) -> torch.Tensor:
    r"""Convert a spectrogram to a Mel spectrogram.

    Args:
        spec (torch.Tensor): The input spectrogram of shape [B, C, T].
        n_fft (int): The FFT size.
        num_mels (int): The number of Mel bands.
        sample_rate (int): The sample rate of the audio.
        fmin (int): The minimum frequency.
        fmax (int): The maximum frequency.

    Returns:
        torch.Tensor: The computed Mel spectrogram of shape [B, C, T].
    """
    mel_basis_key = self.name_mel_basis(spec, n_fft, fmax)

    if mel_basis_key not in self.mel_basis:
        mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        self.mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)

    mel = torch.matmul(self.mel_basis[mel_basis_key], spec)
    mel = self.amp_to_db(mel)

    return mel
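
Shape-wise, this is a matrix product over the frequency axis followed by amp_to_db: a [B, n_fft // 2 + 1, T] spectrogram becomes a [B, num_mels, T] log-Mel spectrogram. A sketch with illustrative settings:

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

processor = AudioProcessor()
y = torch.randn(1, 1, 22050)

spec = processor.wav_to_spec(y, n_fft=1024, hop_length=256, win_length=1024)
mel = processor.spec_to_mel(spec, n_fft=1024, num_mels=80, sample_rate=22050, fmin=0, fmax=8000)
print(spec.shape, mel.shape)  # torch.Size([1, 513, 86]) torch.Size([1, 80, 86])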

wav_to_energy(y, n_fft, hop_length, win_length, center=False)

Convert a waveform to a spectrogram and compute the energy.

Parameters:

Name        Type    Description                                       Default
y           Tensor  The input waveform.                               required
n_fft       int     The FFT size.                                     required
hop_length  int     The hop (stride) size.                            required
win_length  int     The window size.                                  required
center      bool    Whether to pad y such that frames are centered.   False

Returns:

Name    Type    Description
Tensor  Tensor  The energy of the computed spectrogram.

Source code in training/preprocess/audio_processor.py
def wav_to_energy(
    self,
    y: torch.Tensor,
    n_fft: int,
    hop_length: int,
    win_length: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Convert a waveform to a spectrogram and compute the energy.

    Args:
        y (Tensor): The input waveform.
        n_fft (int): The FFT size.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

    Returns:
        Tensor: The energy of the computed spectrogram.
    """
    spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)
    spec = torch.norm(spec, dim=1, keepdim=True).squeeze(0)

    # Normalize the energy
    return (spec - spec.mean()) / spec.std()
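
The energy here is the L2 norm over the frequency bins of each frame, then standardized to zero mean and unit variance. A sketch with illustrative settings:

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

processor = AudioProcessor()
y = torch.randn(1, 1, 22050)

energy = processor.wav_to_energy(y, n_fft=1024, hop_length=256, win_length=1024)
print(energy.shape)  # torch.Size([1, 86]) -- one value per frame
print(round(energy.mean().item(), 4), round(energy.std().item(), 4))  # ~0.0 and ~1.0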

wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False)

Convert a waveform to a Mel spectrogram.

Parameters:

Name         Type    Description                                       Default
y            Tensor  The input waveform.                               required
n_fft        int     The FFT size.                                     required
num_mels     int     The number of Mel bands.                          required
sample_rate  int     The sample rate of the audio.                     required
hop_length   int     The hop (stride) size.                            required
win_length   int     The window size.                                  required
fmin         int     The minimum frequency.                            required
fmax         int     The maximum frequency.                            required
center       bool    Whether to pad y such that frames are centered.   False

Returns:

Type    Description
Tensor  The computed Mel spectrogram.

Source code in training/preprocess/audio_processor.py
def wav_to_mel(
    self,
    y: torch.Tensor,
    n_fft: int,
    num_mels: int,
    sample_rate: int,
    hop_length: int,
    win_length: int,
    fmin: int,
    fmax: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Convert a waveform to a Mel spectrogram.

    Args:
        y (torch.Tensor): The input waveform.
        n_fft (int): The FFT size.
        num_mels (int): The number of Mel bands.
        sample_rate (int): The sample rate of the audio.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        fmin (int): The minimum frequency.
        fmax (int): The maximum frequency.
        center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

    Returns:
        torch.Tensor: The computed Mel spectrogram.
    """
    # Convert the waveform to a spectrogram
    spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)

    # Convert the spectrogram to a Mel spectrogram
    mel = self.spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)

    return mel
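
The returned values are log-scale, since spec_to_mel applies amp_to_db internally; db_to_amp recovers linear Mel magnitudes. A sketch with illustrative settings:

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

processor = AudioProcessor()
y = torch.randn(1, 1, 22050)

log_mel = processor.wav_to_mel(
    y, n_fft=1024, num_mels=80, sample_rate=22050,
    hop_length=256, win_length=1024, fmin=0, fmax=8000,
)
linear_mel = AudioProcessor.db_to_amp(log_mel)  # undo the log taken inside spec_to_mel
print(log_mel.shape, bool((linear_mel > 0).all()))  # torch.Size([1, 80, 86]) True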

wav_to_spec(y, n_fft, hop_length, win_length, center=False)

Convert a waveform to a spectrogram and compute the magnitude.

Parameters:

Name        Type    Description                                       Default
y           Tensor  The input waveform.                               required
n_fft       int     The FFT size.                                     required
hop_length  int     The hop (stride) size.                            required
win_length  int     The window size.                                  required
center      bool    Whether to pad y such that frames are centered.   False

Returns:

Name    Type    Description
Tensor  Tensor  The magnitude of the computed spectrogram.

Source code in training/preprocess/audio_processor.py
def wav_to_spec(
    self,
    y: torch.Tensor,
    n_fft: int,
    hop_length: int,
    win_length: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Convert a waveform to a spectrogram and compute the magnitude.

    Args:
        y (Tensor): The input waveform.
        n_fft (int): The FFT size.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

    Returns:
        Tensor: The magnitude of the computed spectrogram.
    """
    y = y.squeeze(1)

    dtype_device = str(y.dtype) + "_" + str(y.device)
    wnsize_dtype_device = str(win_length) + "_" + dtype_device
    if wnsize_dtype_device not in self.hann_window:
        self.hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(
        y.unsqueeze(1),
        (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)),
        mode="reflect",
    )
    y = y.squeeze(1)

    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=self.hann_window[wnsize_dtype_device],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )

    spec = torch.view_as_real(spec)

    # Compute the magnitude
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    return spec
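
With center=False the method pads by (n_fft - hop_length) / 2 on each side itself, so the output has n_fft // 2 + 1 frequency bins and roughly len(y) / hop_length frames. A sketch with illustrative settings:

import torch
from training.preprocess.audio_processor import AudioProcessor  # assumed import path

processor = AudioProcessor()
y = torch.randn(1, 1, 22050)  # [batch, channel, samples]

spec = processor.wav_to_spec(y, n_fft=1024, hop_length=256, win_length=1024)
print(spec.shape)  # torch.Size([1, 513, 86]) -- 513 = 1024 // 2 + 1 bins, ~22050 / 256 frames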