# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu)
#               2024 Alibaba Inc (Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
- """ConvolutionModule definition."""
- from typing import Tuple
- import torch
- from torch import nn
- class ConvolutionModule(nn.Module):
- """ConvolutionModule in Conformer model."""
    def __init__(self,
                 channels: int,
                 kernel_size: int = 15,
                 activation: nn.Module = nn.ReLU(),
                 norm: str = "batch_norm",
                 causal: bool = False,
                 bias: bool = True):
- """Construct an ConvolutionModule object.
- Args:
- channels (int): The number of channels of conv layers.
- kernel_size (int): Kernel size of conv layers.
- causal (int): Whether use causal convolution or not
- """
        super().__init__()
        self.pointwise_conv1 = nn.Conv1d(
            channels,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        # self.lorder is used to distinguish whether this is a causal
        # convolution: if self.lorder > 0, it is causal and the input is
        # padded with self.lorder frames on the left in forward();
        # otherwise it is a symmetric convolution.
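        # For example (illustrative comment, not in the original file): with
        # the default kernel_size=15, the causal branch sets lorder=14 (pad 14
        # frames on the left in forward()), while the non-causal branch pads
        # 7 frames symmetrically on both sides inside the depthwise conv.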
        if causal:
            padding = 0
            self.lorder = kernel_size - 1
        else:
            # kernel_size should be an odd number for non-causal convolution
            assert (kernel_size - 1) % 2 == 0
            padding = (kernel_size - 1) // 2
            self.lorder = 0
        self.depthwise_conv = nn.Conv1d(
            channels,
            channels,
            kernel_size,
            stride=1,
            padding=padding,
            groups=channels,
            bias=bias,
        )

        assert norm in ['batch_norm', 'layer_norm']
        if norm == "batch_norm":
            self.use_layer_norm = False
            self.norm = nn.BatchNorm1d(channels)
        else:
            self.use_layer_norm = True
            self.norm = nn.LayerNorm(channels)

        self.pointwise_conv2 = nn.Conv1d(
            channels,
            channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.activation = activation
    def forward(
        self,
        x: torch.Tensor,
        mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
        cache: torch.Tensor = torch.zeros((0, 0, 0)),
    ) -> Tuple[torch.Tensor, torch.Tensor]:
- """Compute convolution module.
- Args:
- x (torch.Tensor): Input tensor (#batch, time, channels).
- mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
- (0, 0, 0) means fake mask.
- cache (torch.Tensor): left context cache, it is only
- used in causal convolution (#batch, channels, cache_t),
- (0, 0, 0) meas fake cache.
- Returns:
- torch.Tensor: Output tensor (#batch, time, channels).
- """
        # exchange the temporal dimension and the feature dimension
        x = x.transpose(1, 2)  # (#batch, channels, time)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        if self.lorder > 0:
            if cache.size(2) == 0:  # cache_t == 0
                x = nn.functional.pad(x, (self.lorder, 0), 'constant', 0.0)
            else:
                assert cache.size(0) == x.size(0)  # equal batch
                assert cache.size(1) == x.size(1)  # equal channel
                x = torch.cat((cache, x), dim=2)
            assert (x.size(2) > self.lorder)
            new_cache = x[:, :, -self.lorder:]
        else:
            # It would be better to return None when no cache is required;
            # however, for JIT export we return a fake tensor instead of
            # None.
            new_cache = torch.zeros((0, 0, 0), dtype=x.dtype, device=x.device)

        # GLU mechanism
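        # Explanatory comment (added, not in the original file): the pointwise
        # conv doubles the channel dim; GLU then splits it in half along
        # dim=1 and gates, glu(x) = a * sigmoid(b), restoring `channels`.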
        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
        x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.activation(self.norm(x))
        if self.use_layer_norm:
            x = x.transpose(1, 2)
        x = self.pointwise_conv2(x)

        # mask batch padding
        if mask_pad.size(2) > 0:  # time > 0
            x.masked_fill_(~mask_pad, 0.0)

        return x.transpose(1, 2), new_cache
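

# A minimal usage sketch (added for illustration; not part of the original
# file). It assumes only the class above and shows both the first-chunk call
# with the default fake cache and the streaming pattern where the returned
# cache is fed back in as left context. Shapes follow the docstrings; the
# concrete sizes below are arbitrary.
if __name__ == '__main__':
    batch, time, channels = 2, 8, 16
    module = ConvolutionModule(channels, kernel_size=15, causal=True)
    module.eval()  # put BatchNorm in eval mode for a deterministic demo

    x = torch.randn(batch, time, channels)
    mask_pad = torch.ones(batch, 1, time, dtype=torch.bool)

    # First chunk: no left context yet, so forward() zero-pads
    # lorder = kernel_size - 1 = 14 frames on the left.
    y, cache = module(x, mask_pad)
    print(y.shape, cache.shape)  # torch.Size([2, 8, 16]) torch.Size([2, 16, 14])

    # Next chunk: prepend the returned cache as real left context.
    x2 = torch.randn(batch, time, channels)
    y2, cache2 = module(x2, mask_pad, cache)
    print(y2.shape, cache2.shape)  # torch.Size([2, 8, 16]) torch.Size([2, 16, 14])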