Skip to content

Length Adaptor

LengthAdaptor

Bases: Module

DEPRECATED: The LengthAdaptor module is used to adjust the duration of phonemes. It contains a dedicated duration predictor and methods to upsample the input features to match predicted durations.

Parameters:

Name Type Description Default
model_config AcousticModelConfigType

The model configuration object containing model parameters.

required
Source code in models/tts/delightful_tts/acoustic_model/length_adaptor.py
class LengthAdaptor(Module):
    r"""DEPRECATED: The LengthAdaptor module is used to adjust the duration of phonemes.
    It contains a dedicated duration predictor and methods to upsample the input features to match predicted durations.

    Args:
        model_config (AcousticModelConfigType): The model configuration object containing model parameters.
    """

    def __init__(
        self,
        model_config: AcousticModelConfigType,
    ):
        super().__init__()
        # Initialize the duration predictor
        self.duration_predictor = VariancePredictor(
            channels_in=model_config.encoder.n_hidden,
            channels=model_config.variance_adaptor.n_hidden,
            channels_out=1,
            kernel_size=model_config.variance_adaptor.kernel_size,
            p_dropout=model_config.variance_adaptor.p_dropout,
        )

    def length_regulate(
        self,
        x: torch.Tensor,
        duration: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        r"""Regulates the length of the input tensor using the duration tensor.

        Args:
            x (torch.Tensor): The input tensor.
            duration (torch.Tensor): The tensor containing duration for each time step in x.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: The regulated output tensor and the tensor containing the length of each sequence in the batch.
        """
        output = torch.jit.annotate(List[torch.Tensor], [])
        mel_len = torch.jit.annotate(List[int], [])
        max_len = 0
        for batch, expand_target in zip(x, duration):
            expanded = self.expand(batch, expand_target)
            if expanded.shape[0] > max_len:
                max_len = expanded.shape[0]
            output.append(expanded)
            mel_len.append(expanded.shape[0])
        output = tools.pad(output, max_len)
        return output, torch.tensor(mel_len, dtype=torch.int64)

    def expand(self, batch: torch.Tensor, predicted: torch.Tensor) -> torch.Tensor:
        r"""Expands the input tensor based on the predicted values.

        Args:
            batch (torch.Tensor): The input tensor.
            predicted (torch.Tensor): The tensor containing predicted expansion factors.

        Returns:
            torch.Tensor: The expanded tensor.
        """
        out = torch.jit.annotate(List[torch.Tensor], [])
        for i, vec in enumerate(batch):
            expand_size = predicted[i].item()
            out.append(vec.expand(max(int(expand_size), 0), -1))
        return torch.cat(out, 0)

    def upsample_train(
        self,
        x: torch.Tensor,
        x_res: torch.Tensor,
        duration_target: torch.Tensor,
        embeddings: torch.Tensor,
        src_mask: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Upsamples the input tensor during training using ground truth durations.

        Args:
            x (torch.Tensor): The input tensor.
            x_res (torch.Tensor): Another input tensor for duration prediction.
            duration_target (torch.Tensor): The ground truth durations tensor.
            embeddings (torch.Tensor): The tensor containing phoneme embeddings.
            src_mask (torch.Tensor): The mask tensor indicating valid entries in x and x_res.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, log duration prediction, and upsampled embeddings.
        """
        x_res = x_res.detach()
        log_duration_prediction = self.duration_predictor(
            x_res,
            src_mask,
        )  # type: torch.Tensor
        x, _ = self.length_regulate(x, duration_target)
        embeddings, _ = self.length_regulate(embeddings, duration_target)
        return x, log_duration_prediction, embeddings

    def upsample(
        self,
        x: torch.Tensor,
        x_res: torch.Tensor,
        src_mask: torch.Tensor,
        embeddings: torch.Tensor,
        control: float,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Upsamples the input tensor during inference.

        Args:
            x (torch.Tensor): The input tensor.
            x_res (torch.Tensor): Another input tensor for duration prediction.
            src_mask (torch.Tensor): The mask tensor indicating valid entries in x and x_res.
            embeddings (torch.Tensor): The tensor containing phoneme embeddings.
            control (float): A control parameter for pitch regulation.

        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, approximated duration, and upsampled embeddings.
        """
        log_duration_prediction = self.duration_predictor(
            x_res,
            src_mask,
        )
        duration_rounded = torch.clamp(
            (torch.round(torch.exp(log_duration_prediction) - 1) * control),
            min=0,
        )
        x, _ = self.length_regulate(x, duration_rounded)
        embeddings, _ = self.length_regulate(embeddings, duration_rounded)
        return x, duration_rounded, embeddings

expand(batch, predicted)

Expands the input tensor based on the predicted values.

Parameters:

Name Type Description Default
batch Tensor

The input tensor.

required
predicted Tensor

The tensor containing predicted expansion factors.

required

Returns:

Type Description
Tensor

torch.Tensor: The expanded tensor.

Source code in models/tts/delightful_tts/acoustic_model/length_adaptor.py
def expand(self, batch: torch.Tensor, predicted: torch.Tensor) -> torch.Tensor:
    r"""Expands the input tensor based on the predicted values.

    Each time step ``batch[i]`` is repeated ``predicted[i]`` times along the
    time axis; the repeated rows are concatenated into one tensor.

    Args:
        batch (torch.Tensor): The input tensor.
        predicted (torch.Tensor): The tensor containing predicted expansion factors.

    Returns:
        torch.Tensor: The expanded tensor.
    """
    out = torch.jit.annotate(List[torch.Tensor], [])
    for i, vec in enumerate(batch):
        expand_size = predicted[i].item()
        # Negative predictions yield zero rows; -1 preserves the feature dimension.
        out.append(vec.expand(max(int(expand_size), 0), -1))
    return torch.cat(out, 0)

length_regulate(x, duration)

Regulates the length of the input tensor using the duration tensor.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
duration Tensor

The tensor containing duration for each time step in x.

required

Returns:

Type Description
Tuple[Tensor, Tensor]

Tuple[torch.Tensor, torch.Tensor]: The regulated output tensor and the tensor containing the length of each sequence in the batch.

Source code in models/tts/delightful_tts/acoustic_model/length_adaptor.py
def length_regulate(
    self,
    x: torch.Tensor,
    duration: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    r"""Regulates the length of the input tensor using the duration tensor.

    Args:
        x (torch.Tensor): The input tensor.
        duration (torch.Tensor): The tensor containing duration for each time step in x.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The regulated output tensor and the tensor containing the length of each sequence in the batch.
    """
    # torch.jit.annotate keeps list element types explicit for TorchScript.
    output = torch.jit.annotate(List[torch.Tensor], [])
    mel_len = torch.jit.annotate(List[int], [])
    max_len = 0
    # Expand each sequence independently, tracking the longest result so the
    # whole batch can be padded to a common length afterwards.
    for batch, expand_target in zip(x, duration):
        expanded = self.expand(batch, expand_target)
        if expanded.shape[0] > max_len:
            max_len = expanded.shape[0]
        output.append(expanded)
        mel_len.append(expanded.shape[0])
    output = tools.pad(output, max_len)
    return output, torch.tensor(mel_len, dtype=torch.int64)

upsample(x, x_res, src_mask, embeddings, control)

Upsamples the input tensor during inference.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
x_res Tensor

Another input tensor for duration prediction.

required
src_mask Tensor

The mask tensor indicating valid entries in x and x_res.

required
embeddings Tensor

The tensor containing phoneme embeddings.

required
control float

A scaling factor applied to the predicted durations (speaking-rate control).

required

Returns:

Type Description
Tuple[Tensor, Tensor, Tensor]

Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, approximated duration, and upsampled embeddings.

Source code in models/tts/delightful_tts/acoustic_model/length_adaptor.py
def upsample(
    self,
    x: torch.Tensor,
    x_res: torch.Tensor,
    src_mask: torch.Tensor,
    embeddings: torch.Tensor,
    control: float,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    r"""Upsamples the input tensor during inference.

    Args:
        x (torch.Tensor): The input tensor.
        x_res (torch.Tensor): Another input tensor for duration prediction.
        src_mask (torch.Tensor): The mask tensor indicating valid entries in x and x_res.
        embeddings (torch.Tensor): The tensor containing phoneme embeddings.
        control (float): A scaling factor applied to the predicted durations
            (speaking-rate control).

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, approximated duration, and upsampled embeddings.
    """
    log_duration_prediction = self.duration_predictor(
        x_res,
        src_mask,
    )
    # Undo the log (exp(pred) - 1), round to whole frame counts, scale by the
    # control factor, and clamp at zero so no step gets a negative length.
    duration_rounded = torch.clamp(
        (torch.round(torch.exp(log_duration_prediction) - 1) * control),
        min=0,
    )
    x, _ = self.length_regulate(x, duration_rounded)
    embeddings, _ = self.length_regulate(embeddings, duration_rounded)
    return x, duration_rounded, embeddings

upsample_train(x, x_res, duration_target, embeddings, src_mask)

Upsamples the input tensor during training using ground truth durations.

Parameters:

Name Type Description Default
x Tensor

The input tensor.

required
x_res Tensor

Another input tensor for duration prediction.

required
duration_target Tensor

The ground truth durations tensor.

required
embeddings Tensor

The tensor containing phoneme embeddings.

required
src_mask Tensor

The mask tensor indicating valid entries in x and x_res.

required

Returns:

Type Description
Tuple[Tensor, Tensor, Tensor]

Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, log duration prediction, and upsampled embeddings.

Source code in models/tts/delightful_tts/acoustic_model/length_adaptor.py
def upsample_train(
    self,
    x: torch.Tensor,
    x_res: torch.Tensor,
    duration_target: torch.Tensor,
    embeddings: torch.Tensor,
    src_mask: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    r"""Upsamples the input tensor during training using ground truth durations.

    Args:
        x (torch.Tensor): The input tensor.
        x_res (torch.Tensor): Another input tensor for duration prediction.
        duration_target (torch.Tensor): The ground truth durations tensor.
        embeddings (torch.Tensor): The tensor containing phoneme embeddings.
        src_mask (torch.Tensor): The mask tensor indicating valid entries in x and x_res.

    Returns:
        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The upsampled x, log duration prediction, and upsampled embeddings.
    """
    # detach() stops gradients from flowing back into x_res through the
    # duration predictor.
    x_res = x_res.detach()
    log_duration_prediction = self.duration_predictor(
        x_res,
        src_mask,
    )  # type: torch.Tensor
    x, _ = self.length_regulate(x, duration_target)
    embeddings, _ = self.length_regulate(embeddings, duration_target)
    return x, log_duration_prediction, embeddings