Add MVDR module to example (#1708)

nateanl · web-flow · commit 4915524fa189 · 2021-08-26T18:56:59.000+01:00
- Support three solutions for MVDR beamforming ("ref_channel", "stv_evd", "stv_power").
- Support single-channel and multi-channel time-frequency masks
- Add unit tests
diff --git a/examples/beamforming/mvdr.py b/examples/beamforming/mvdr.py
diff --git a/test/torchaudio_unittest/example/beamforming/__init__.py b/test/torchaudio_unittest/example/beamforming/__init__.py
diff --git a/test/torchaudio_unittest/example/beamforming/autograd_cpu_test.py b/test/torchaudio_unittest/example/beamforming/autograd_cpu_test.py
@@ -0,0 +1,10 @@
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .autograd_test_impl import AutogradTestMixin
+
+
+class AutogradCPUTest(AutogradTestMixin, PytorchTestCase):
+    device = 'cpu'  # read by AutogradTestMixin.assert_grad to place the module and inputs
+
+
+class AutogradRNNTCPUTest(PytorchTestCase):
+    device = 'cpu'  # NOTE(review): no test mixin is inherited here, so this class defines no tests
diff --git a/test/torchaudio_unittest/example/beamforming/autograd_cuda_test.py b/test/torchaudio_unittest/example/beamforming/autograd_cuda_test.py
@@ -0,0 +1,15 @@
+from torchaudio_unittest.common_utils import (
+    PytorchTestCase,
+    skipIfNoCuda,
+)
+from .autograd_test_impl import AutogradTestMixin
+
+
+@skipIfNoCuda
+class AutogradCUDATest(AutogradTestMixin, PytorchTestCase):
+    device = 'cuda'  # read by AutogradTestMixin.assert_grad to place the module and inputs
+
+
+@skipIfNoCuda
+class AutogradRNNTCUDATest(PytorchTestCase):
+    device = 'cuda'  # NOTE(review): no test mixin is inherited here, so this class defines no tests
diff --git a/test/torchaudio_unittest/example/beamforming/autograd_test_impl.py b/test/torchaudio_unittest/example/beamforming/autograd_test_impl.py
@@ -0,0 +1,87 @@
+from typing import List
+
+import torch
+from beamforming.mvdr import PSD, MVDR
+from parameterized import parameterized, param
+from torch.autograd import gradcheck, gradgradcheck
+
+from torchaudio_unittest.common_utils import (
+    TestBaseMixin,
+    get_whitenoise,
+    get_spectrogram,
+)
+
+
+class AutogradTestMixin(TestBaseMixin):
+    """Gradient checks for the ``PSD`` and ``MVDR`` example modules.
+
+    Concrete subclasses provide the ``device`` attribute used by ``assert_grad``.
+    """
+
+    def assert_grad(
+            self,
+            transform: torch.nn.Module,
+            inputs: List[torch.Tensor],
+            *,
+            nondet_tol: float = 0.0,
+    ):
+        """Run ``gradcheck`` and ``gradgradcheck`` on ``transform``.
+
+        Args:
+            transform: module under test; moved to ``self.device`` as float64.
+            inputs: positional arguments for ``transform``. Tensors are cast to
+                double (or complex double) and flagged as requiring grad;
+                non-tensor entries are forwarded untouched.
+            nondet_tol: tolerance passed to ``gradgradcheck`` only.
+        """
+        transform = transform.to(dtype=torch.float64, device=self.device)
+
+        # With the default eps and tolerances, gradcheck and gradgradcheck only pass
+        # for `torch.double` / `torch.cdouble` inputs, hence the cast.
+        def _as_checked_input(item):
+            if not torch.is_tensor(item):
+                return item
+            target_dtype = torch.cdouble if item.is_complex() else torch.double
+            item = item.to(dtype=target_dtype, device=self.device)
+            item.requires_grad = True
+            return item
+
+        checked_inputs = [_as_checked_input(item) for item in inputs]
+        assert gradcheck(transform, checked_inputs)
+        assert gradgradcheck(transform, checked_inputs, nondet_tol=nondet_tol)
+
+    def test_psd(self):
+        """Gradient-check ``PSD`` called with the spectrogram only."""
+        transform = PSD()
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spec = get_spectrogram(waveform, n_fft=400)
+        self.assert_grad(transform, [spec])
+
+    @parameterized.expand([
+        [True],
+        [False],
+    ])
+    def test_psd_with_mask(self, multi_mask):
+        """Gradient-check ``PSD`` with a multi-channel or a single mask."""
+        transform = PSD(multi_mask=multi_mask)
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spec = get_spectrogram(waveform, n_fft=400)
+        # A multi-channel mask keeps the last three dims, a single one the last two.
+        n_mask_dims = 3 if multi_mask else 2
+        mask = torch.rand(spec.shape[-n_mask_dims:])
+
+        self.assert_grad(transform, [spec, mask])
+
+    @parameterized.expand([
+        param(solution="ref_channel"),
+        param(solution="stv_power"),
+        # evd will fail since the eigenvalues are not distinct
+        # param(solution="stv_evd"),
+    ])
+    def test_mvdr(self, solution):
+        """Gradient-check ``MVDR`` for the solutions enabled above."""
+        transform = MVDR(solution=solution)
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spec = get_spectrogram(waveform, n_fft=400)
+        mask = torch.rand(spec.shape[-2:])
+        self.assert_grad(transform, [spec, mask])
diff --git a/test/torchaudio_unittest/example/beamforming/batch_consistency_test.py b/test/torchaudio_unittest/example/beamforming/batch_consistency_test.py
@@ -0,0 +1,58 @@
+"""Test numerical consistency among single input and batched input."""
+import torch
+from beamforming.mvdr import PSD, MVDR
+from parameterized import parameterized
+
+from torchaudio_unittest import common_utils
+
+
+class TestTransforms(common_utils.TorchaudioTestCase):
+    """A batched call must equal the stacked per-item results."""
+
+    def test_batch_PSD(self):
+        """Compare batched ``PSD`` with per-item ``PSD`` on an unmasked input."""
+        spec = torch.rand((4, 6, 201, 100), dtype=torch.cdouble)
+
+        # Transform every batch entry on its own, then stack the results.
+        per_item = [PSD()(spec[idx]) for idx in range(4)]
+        expected = torch.stack(per_item)
+
+        # Transform the whole batch in a single call.
+        computed = PSD()(spec)
+
+        self.assertEqual(computed, expected)
+
+    def test_batch_PSD_with_mask(self):
+        """Compare batched ``PSD`` with per-item ``PSD`` when a mask is given."""
+        spec = torch.rand((4, 6, 201, 100), dtype=torch.cdouble)
+        mask = torch.rand((4, 201, 100))
+
+        # Transform every (spec, mask) pair on its own, then stack the results.
+        per_item = [PSD()(spec[idx], mask[idx]) for idx in range(4)]
+        expected = torch.stack(per_item)
+
+        # Transform the whole batch in a single call.
+        computed = PSD()(spec, mask)
+
+        self.assertEqual(computed, expected)
+
+    @parameterized.expand([
+        [True],
+        [False],
+    ])
+    def test_MVDR(self, multi_mask):
+        """Compare batched ``MVDR`` with per-item ``MVDR`` for both mask layouts."""
+        spec = torch.rand((4, 6, 201, 100), dtype=torch.cdouble)
+        # A multi-channel mask mirrors the spectrogram shape; otherwise the
+        # channel dimension is dropped.
+        mask_shape = (4, 6, 201, 100) if multi_mask else (4, 201, 100)
+        mask = torch.rand(mask_shape)
+
+        # A fresh module for every item, then stack the results.
+        per_item = [MVDR(multi_mask=multi_mask)(spec[idx], mask[idx]) for idx in range(4)]
+        expected = torch.stack(per_item)
+
+        # Transform the whole batch in a single call.
+        computed = MVDR(multi_mask=multi_mask)(spec, mask)
+
+        self.assertEqual(computed, expected)
diff --git a/test/torchaudio_unittest/example/beamforming/torchscript_consistency_cpu_test.py b/test/torchaudio_unittest/example/beamforming/torchscript_consistency_cpu_test.py
@@ -0,0 +1,14 @@
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .torchscript_consistency_impl import Transforms, TransformsFloat64Only
+
+
+class TestTransformsFloat32(Transforms, PytorchTestCase):
+    dtype = torch.float32  # dtype the transform is cast to in _assert_consistency_complex
+    device = torch.device('cpu')
+
+
+class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
+    dtype = torch.float64  # float64 classes also pick up test_MVDR from TransformsFloat64Only
+    device = torch.device('cpu')
diff --git a/test/torchaudio_unittest/example/beamforming/torchscript_consistency_cuda_test.py b/test/torchaudio_unittest/example/beamforming/torchscript_consistency_cuda_test.py
@@ -0,0 +1,16 @@
+import torch
+
+from torchaudio_unittest.common_utils import skipIfNoCuda, PytorchTestCase
+from .torchscript_consistency_impl import Transforms, TransformsFloat64Only
+
+
+@skipIfNoCuda
+class TestTransformsFloat32(Transforms, PytorchTestCase):
+    dtype = torch.float32  # dtype the transform is cast to in _assert_consistency_complex
+    device = torch.device('cuda')
+
+
+@skipIfNoCuda
+class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
+    dtype = torch.float64  # float64 classes also pick up test_MVDR from TransformsFloat64Only
+    device = torch.device('cuda')
diff --git a/test/torchaudio_unittest/example/beamforming/torchscript_consistency_impl.py b/test/torchaudio_unittest/example/beamforming/torchscript_consistency_impl.py
@@ -0,0 +1,74 @@
+"""Test suites for jit-ability and its numerical compatibility"""
+
+import torch
+from beamforming.mvdr import PSD, MVDR
+from parameterized import parameterized, param
+
+from torchaudio_unittest import common_utils
+from torchaudio_unittest.common_utils import (
+    TempDirMixin,
+    TestBaseMixin,
+)
+
+
+class Transforms(TempDirMixin, TestBaseMixin):
+    """Implements test for Transforms that are performed for different devices"""
+
+    def _assert_consistency_complex(self, transform, tensors):
+        """Assert a scripted, saved and reloaded ``transform`` matches eager mode.
+
+        Every entry of ``tensors`` (not only the first) is cast to
+        ``self.complex_dtype`` on ``self.device`` before either call.
+        """
+        assert tensors[0].is_complex()
+        casted = []
+        for tensor in tensors:
+            casted.append(tensor.to(device=self.device, dtype=self.complex_dtype))
+        transform = transform.to(device=self.device, dtype=self.dtype)
+
+        # Round-trip through a file so that saving and loading are exercised too.
+        path = self.get_temp_path('func.zip')
+        scripted = torch.jit.script(transform)
+        scripted.save(path)
+        ts_transform = torch.jit.load(path)
+
+        eager_output = transform(*casted)
+        ts_output = ts_transform(*casted)
+        self.assertEqual(ts_output, eager_output)
+
+    def test_PSD(self):
+        """Script ``PSD`` and compare on a 4-channel spectrogram without mask."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        spectrogram = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=100)
+        self._assert_consistency_complex(PSD(), (spectrogram,))
+
+    def test_PSD_with_mask(self):
+        """Script ``PSD`` and compare on a 4-channel spectrogram with a mask."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        spectrogram = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=100)
+        mask = torch.rand(spectrogram.shape[-2:])
+        self._assert_consistency_complex(PSD(), (spectrogram, mask))
+
+
+class TransformsFloat64Only(TestBaseMixin):
+    """MVDR TorchScript tests, mixed only into the float64 test classes.
+
+    Relies on ``_assert_consistency_complex`` from ``Transforms``, so it must
+    be combined with that class.
+    """
+
+    @parameterized.expand([
+        param(solution="ref_channel", online=True),
+        param(solution="stv_evd", online=True),
+        param(solution="stv_power", online=True),
+        param(solution="ref_channel", online=False),
+        param(solution="stv_evd", online=False),
+        param(solution="stv_power", online=False),
+    ])
+    def test_MVDR(self, solution, online):
+        """Script ``MVDR`` and compare for every solution and online setting."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        spectrogram = common_utils.get_spectrogram(waveform, n_fft=400, hop_length=100)
+        mask = torch.rand(spectrogram.shape[-2:])
+        transform = MVDR(solution=solution, online=online)
+        self._assert_consistency_complex(transform, (spectrogram, mask))
diff --git a/test/torchaudio_unittest/example/beamforming/transforms_cpu_test.py b/test/torchaudio_unittest/example/beamforming/transforms_cpu_test.py
@@ -0,0 +1,14 @@
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from . transforms_test_impl import TransformsTestBase
+
+
+class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase):
+    device = 'cpu'
+    dtype = torch.float32  # NOTE(review): test_psd as written reads neither device nor dtype
+
+
+class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase):
+    device = 'cpu'
+    dtype = torch.float64  # NOTE(review): test_psd as written reads neither device nor dtype
diff --git a/test/torchaudio_unittest/example/beamforming/transforms_cuda_test.py b/test/torchaudio_unittest/example/beamforming/transforms_cuda_test.py
@@ -0,0 +1,19 @@
+import torch
+
+from torchaudio_unittest.common_utils import (
+    PytorchTestCase,
+    skipIfNoCuda,
+)
+from . transforms_test_impl import TransformsTestBase
+
+
+@skipIfNoCuda
+class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase):
+    device = 'cuda'  # NOTE(review): this is the CUDA variant despite "CPU" in the class name
+    dtype = torch.float32
+
+
+@skipIfNoCuda
+class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase):
+    device = 'cuda'  # CUDA test module: was 'cpu' (copy-paste slip); class name kept for compatibility
+    dtype = torch.float64
diff --git a/test/torchaudio_unittest/example/beamforming/transforms_test_impl.py b/test/torchaudio_unittest/example/beamforming/transforms_test_impl.py
@@ -0,0 +1,81 @@
+from typing import Optional
+
+import numpy as np
+import torch
+from beamforming.mvdr import PSD
+from parameterized import parameterized, param
+
+from torchaudio_unittest.common_utils import (
+    TestBaseMixin,
+    get_whitenoise,
+    get_spectrogram,
+)
+
+
+def psd_numpy(
+        X: np.ndarray,
+        mask: Optional[np.ndarray],
+        multi_mask: bool = False,
+        normalize: bool = True,
+        eps: float = 1e-15
+) -> torch.Tensor:
+    """Reference power spectral density (PSD) computed with NumPy.
+
+    Args:
+        X: complex array whose last three axes are (channel, freq, time).
+        mask: optional weights whose last two axes are (freq, time); with
+            ``multi_mask`` there is an extra channel axis in front of them.
+        multi_mask: if True, average ``mask`` over its channel axis first.
+        normalize: if True, divide ``mask`` by its sum over time plus ``eps``.
+        eps: constant added to the normalization denominator.
+
+    Returns:
+        ``torch.cdouble`` tensor whose last three axes are
+        (freq, channel, channel).
+    """
+    X_conj = np.conj(X)
+    # Channel-by-channel outer product for every (freq, time) bin.
+    psd_X = np.einsum("...cft,...eft->...ftce", X, X_conj)
+    if mask is not None:
+        if multi_mask:
+            mask = mask.mean(axis=-3)
+        if normalize:
+            mask = mask / (mask.sum(axis=-1, keepdims=True) + eps)
+        psd = psd_X * mask[..., None, None]
+    else:
+        psd = psd_X
+
+    # Sum over the time axis.
+    psd = psd.sum(axis=-3)
+
+    return torch.tensor(psd, dtype=torch.cdouble)
+
+
+class TransformsTestBase(TestBaseMixin):
+    """Numerical tests comparing the example transforms with NumPy references."""
+
+    @parameterized.expand([
+        param(0.5, 1, True, False),
+        param(0.5, 1, None, False),
+        param(1, 4, True, True),
+        param(1, 6, None, True),
+    ])
+    def test_psd(self, duration, channel, mask, multi_mask):
+        """``PSD`` matches the NumPy reference with and without a mask.
+
+        ``mask`` is a flag here: any non-None value requests a random mask.
+        """
+        # NOTE(review): ``self.device`` / ``self.dtype`` are not used below; the
+        # tensors are cast to cdouble and converted with ``.numpy()`` directly.
+        transform = PSD(multi_mask=multi_mask)
+        waveform = get_whitenoise(sample_rate=8000, duration=duration, n_channels=channel)
+        spectrogram = get_spectrogram(waveform, n_fft=400)  # (channel, freq, time)
+        spectrogram = spectrogram.to(torch.cdouble)
+        mask_np = None
+        if mask is not None:
+            mask_shape = spectrogram.shape[-3:] if multi_mask else spectrogram.shape[-2:]
+            mask = torch.rand(mask_shape)
+            mask_np = mask.detach().numpy()
+        psd_np = psd_numpy(spectrogram.detach().numpy(), mask_np, multi_mask)
+        psd = transform(spectrogram, mask)
+        self.assertEqual(psd, psd_np, atol=1e-5, rtol=1e-5)