| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- # Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
- # 2024 Alibaba Inc (Xiang Lyu)
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # Modified from ESPnet(https://github.com/espnet/espnet)
- """ConvolutionModule definition."""
- from typing import Tuple
- import torch
- from torch import nn
- import torch.nn.functional as F
class ConvolutionModule(nn.Module):
    """ConvolutionModule in Conformer model.

    Processing order in ``forward``: pointwise conv (channels ->
    2 * channels), GLU over the channel axis, depthwise conv,
    normalization, activation, pointwise conv (channels -> channels).
    """

    def __init__(self,
                 channels: int,
                 kernel_size: int = 15,
                 activation: nn.Module = nn.ReLU(),
                 norm: str = "batch_norm",
                 causal: bool = False,
                 bias: bool = True):
        """Construct a ConvolutionModule object.

        Args:
            channels (int): The number of channels of conv layers.
            kernel_size (int): Kernel size of the depthwise conv layer.
                Must be odd when ``causal`` is False.
            activation (nn.Module): Activation applied after normalization.
            norm (str): Normalization type, "batch_norm" or "layer_norm".
            causal (bool): Whether to use causal convolution or not.
            bias (bool): Whether the conv layers use a bias term.
        """
        super().__init__()

        self.pointwise_conv1 = nn.Conv1d(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # self.lorder is used to distinguish if it's a causal convolution,
        # if self.lorder > 0: it's a causal convolution, the input will be
        #    padded with self.lorder frames on the left in forward.
        # else: it's a symmetrical convolution
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for non-causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias=bias,
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1d(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation

    def forward(
        self,
        x: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        cache: torch.Tensor = torch.zeros((0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute convolution module.

        The input tensor ``x`` is not modified.

        Args:
            x (torch.Tensor): Input tensor (#batch, time, channels).
            mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
                (0, 0, 0) means fake mask.
            cache (torch.Tensor): left context cache, it is only
                used in causal convolution (#batch, channels, cache_t),
                (0, 0, 0) means fake cache.

        Returns:
            torch.Tensor: Output tensor (#batch, time, channels).
            torch.Tensor: New left-context cache holding the last
                ``self.lorder`` frames of the (padded) input for causal
                convolution, or an empty (0, 0, 0) tensor otherwise.
        """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose(1, 2)  # (#batch, channels, time)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            # `x` is still a view of the caller's tensor at this point, so
            # the fill must be out-of-place to avoid mutating the argument.
            x = x.masked_fill(~mask_pad, 0.0)

        if self.lorder > 0:
            if cache.size(2) == 0:  # cache_t == 0
                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
            else:
                assert cache.size(0) == x.size(0)  # equal batch
                assert cache.size(1) == x.size(1)  # equal channel
                x = torch.cat((cache, x), dim=2)
            assert (x.size(2) > self.lorder)
            new_cache = x[:, :, -self.lorder:]
        else:
            # It's better we just return None if no cache is required,
            # However, for JIT export, here we just fake one tensor instead of
            # None.
            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        # LayerNorm normalizes the last axis, so channels must be last
        # while it is applied.
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.pointwise_conv2(x)
        # mask batch padding (in-place is safe: `x` is a fresh conv output)
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        return x.transpose(1, 2), new_cache
- # NOTE(Xiang Lyu) causal conv module used in convolution-based vocoder
class CausalConv1d(torch.nn.Conv1d):
    """Stride-1 ``Conv1d`` that pads on one side only and keeps the length.

    The input is extended by ``causal_padding`` frames on the left
    (``causal_type='left'``) or on the right (``causal_type='right'``)
    before the convolution, so the output has as many frames as the input.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',
        causal_type: str = 'left',
        device=None,
        dtype=None
    ) -> None:
        """Construct a CausalConv1d object.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the convolution.
            stride (int): Must be 1; any other value fails an assertion.
            dilation (int): Dilation of the convolution.
            groups (int): Number of channel groups.
            bias (bool): Whether the convolution has a bias term.
            padding_mode (str): Forwarded to ``torch.nn.Conv1d``.
            causal_type (str): 'left' to put the context before the input,
                'right' to put it after the input.
            device: Forwarded to ``torch.nn.Conv1d``.
            dtype: Forwarded to ``torch.nn.Conv1d``.
        """
        super(CausalConv1d, self).__init__(in_channels, out_channels,
                                           kernel_size, stride=1,
                                           padding=0, dilation=dilation,
                                           groups=groups, bias=bias,
                                           padding_mode=padding_mode,
                                           device=device, dtype=dtype)
        assert stride == 1
        # An unpadded stride-1 convolution shortens its input by exactly
        # dilation * (kernel_size - 1) frames, so that is the context needed
        # to preserve the length for every kernel_size/dilation combination.
        self.causal_padding = (kernel_size - 1) * dilation
        assert causal_type in ['left', 'right']
        self.causal_type = causal_type

    def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
        """Apply the convolution while preserving the time length.

        Args:
            x (torch.Tensor): Input tensor (#batch, in_channels, time).
            cache (torch.Tensor): Context frames
                (#batch, in_channels, causal_padding) concatenated before
                (``'left'``) or after (``'right'``) ``x``. A tensor whose
                last dimension is 0 means "no cache": zeros are used.

        Returns:
            torch.Tensor: Output tensor (#batch, out_channels, time).
        """
        input_timestep = x.shape[2]
        if cache.size(2) == 0:
            # No cache supplied: use zeros created directly on x's
            # device and dtype.
            cache = torch.zeros(x.shape[0], x.shape[1], self.causal_padding,
                                dtype=x.dtype, device=x.device)
        assert cache.size(2) == self.causal_padding
        if self.causal_type == 'left':
            x = torch.cat([cache, x], dim=2)
        else:
            x = torch.cat([x, cache], dim=2)
        x = super(CausalConv1d, self).forward(x)
        assert x.shape[2] == input_timestep
        return x
class CausalConv1dDownSample(torch.nn.Conv1d):
    """Strided ``Conv1d`` whose input is extended on the left only.

    ``stride - 1`` context frames (zeros, or a caller-supplied cache) are
    placed in front of the input before an unpadded strided convolution.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',
        device=None,
        dtype=None
    ) -> None:
        """Construct a CausalConv1dDownSample object.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size; must be a multiple of ``stride``.
            stride (int): Down-sampling factor; must not be 1 (the default
                value therefore fails the assertion and has to be
                overridden).
            dilation (int): Must be 1.
            groups (int): Number of channel groups.
            bias (bool): Whether the convolution has a bias term.
            padding_mode (str): Forwarded to ``torch.nn.Conv1d``.
            device: Forwarded to ``torch.nn.Conv1d``.
            dtype: Forwarded to ``torch.nn.Conv1d``.
        """
        super(CausalConv1dDownSample, self).__init__(in_channels, out_channels,
                                                     kernel_size, stride,
                                                     padding=0, dilation=dilation,
                                                     groups=groups, bias=bias,
                                                     padding_mode=padding_mode,
                                                     device=device, dtype=dtype)
        assert stride != 1 and dilation == 1
        assert kernel_size % stride == 0
        self.causal_padding = stride - 1

    def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
        """Down-sample ``x`` along time with left-only context.

        Args:
            x (torch.Tensor): Input tensor (#batch, in_channels, time).
            cache (torch.Tensor): Left context
                (#batch, in_channels, causal_padding). A tensor whose last
                dimension is 0 means "no cache": zero padding is used.

        Returns:
            torch.Tensor: Output tensor (#batch, out_channels, time_out),
                where ``time_out`` is the length of an unpadded strided
                convolution over the ``time + causal_padding`` frames.
        """
        if cache.size(2) == 0:
            x = F.pad(x, (self.causal_padding, 0), value=0.0)
        else:
            assert cache.size(2) == self.causal_padding
            x = torch.cat([cache, x], dim=2)
        x = super(CausalConv1dDownSample, self).forward(x)
        return x
class CausalConv1dUpsample(torch.nn.Conv1d):
    """Nearest-neighbour up-sampling followed by a left-padded ``Conv1d``.

    The input is first up-sampled along time by ``stride``; the result is
    extended with ``kernel_size - 1`` context frames on the left and passed
    through a stride-1 convolution, so the output length equals the
    up-sampled length.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        dilation: int = 1,
        groups: int = 1,
        bias: bool = True,
        padding_mode: str = 'zeros',
        device=None,
        dtype=None
    ) -> None:
        """Construct a CausalConv1dUpsample object.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the convolution.
            stride (int): Up-sampling scale factor. The convolution itself
                always runs with stride 1.
            dilation (int): Must be 1.
            groups (int): Number of channel groups.
            bias (bool): Whether the convolution has a bias term.
            padding_mode (str): Forwarded to ``torch.nn.Conv1d``.
            device: Forwarded to ``torch.nn.Conv1d``.
            dtype: Forwarded to ``torch.nn.Conv1d``.
        """
        super(CausalConv1dUpsample, self).__init__(in_channels, out_channels,
                                                   kernel_size, 1,
                                                   padding=0, dilation=dilation,
                                                   groups=groups, bias=bias,
                                                   padding_mode=padding_mode,
                                                   device=device, dtype=dtype)
        assert dilation == 1
        self.causal_padding = kernel_size - 1
        self.upsample = torch.nn.Upsample(scale_factor=stride, mode='nearest')

    def forward(self, x: torch.Tensor, cache: torch.Tensor = torch.zeros(0, 0, 0)) -> torch.Tensor:
        """Up-sample ``x`` along time, then convolve with left-only context.

        Args:
            x (torch.Tensor): Input tensor (#batch, in_channels, time).
            cache (torch.Tensor): Left context for the up-sampled signal
                (#batch, in_channels, causal_padding). A tensor whose last
                dimension is 0 means "no cache": zero padding is used.

        Returns:
            torch.Tensor: Output tensor with as many frames as the
                up-sampled input (#batch, out_channels, up-sampled time).
        """
        x = self.upsample(x)
        # Length after up-sampling; the convolution below must preserve it.
        input_timestep = x.shape[2]
        if cache.size(2) == 0:
            x = F.pad(x, (self.causal_padding, 0), value=0.0)
        else:
            assert cache.size(2) == self.causal_padding
            x = torch.cat([cache, x], dim=2)
        x = super(CausalConv1dUpsample, self).forward(x)
        assert input_timestep == x.shape[2]
        return x
|