# wav2vec/WavDecoder.py

"""
This module defines the WavDecoder class, used to read WAV and AIFF files from
disk and decode them into channel-separated integers.
"""
import wave
import aifc
import struct
from collections import namedtuple
import logging

# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)


# Python 3 has no `xrange` builtin (its `range` is already lazy), so fall back
# to `range` when the name cannot be resolved.
try:
    xrange
except NameError:
    xrange = range

# A single decoded sample: `x` is the horizontal position, `y` the value.
Point = namedtuple('Point', 'x y')

# Python 2.7's wave.getparams() returns a plain tuple rather than a
# namedtuple, so declare an equivalent record here for cross-compatibility.
_wave_params = namedtuple(
    '_wave_params',
    ['nchannels', 'sampwidth', 'framerate', 'nframes', 'comptype',
     'compname'])


class WavDecoder(object):
    """
    A wrapper around the standard library's wave and aifc (and compatible)
    modules to make reading from WAV files easier. It decodes the raw bytes
    into a list of Points, one list for each channel.

    It also optionally scales the data to a maximum width and height (the
    height according to the bitdepth of the samples).

    It also optionally downsamples the data if downtoss is set. One sample out
    of every `downtoss` samples is kept and the rest are tossed out. This is a
    very brutal form of downsampling which will both remove high frequencies
    and cause aliasing (no low-pass filtering is applied before decimating).
    The samples that are kept, and their x positions, do not depend on the
    block size `bs`.

    Its interface is simple:
        - init with a `filename` (and some optional parameters, see below)
        - call `open()` to open the underlying object returned by the wave or
        aifc module
        - call `next()` to return the next block of decoded frames (if `bs` ==
        0, return all frames)
        - call `close()` to close and reset everything (can then repeat from
        `open()`)

    Use it as a context manager to ensure `close()` is called. Use as an
    iterator to process all frames:
        >>> wd = WavDecoder('filename')
        >>> with wd as data:
        >>>     for frames in data:
        >>>         print(frames)
    """
    def __init__(self, filename, decoder_class=wave, endchar=None,
                 max_width=0, max_height=0, bs=0, downtoss=1, signed=None):
        """
        Args:
            filename (str): Name of waveform file
            decoder_class (Class): either wave or aifc or a compatible class
                name
            endchar (str): the `struct.unpack()` character which determines
                endianness of the data ('<' == little endian; '>' == big
                endian). Defaults to '>' when `decoder_class` is aifc and to
                '<' otherwise. This should only need to be set explicitly if
                trying to decode a big-endian WAV or a little-endian AIFF
                (which are non-standard).
            max_width (Number): scale the x-axis values so that the largest
                sample number is no greater than `max_width`. If `max_width` is
                <= 0, then don't scale. Defaults to 0.
            max_height (Number): scale the y-axis values so that the full
                range of sample values (according to the bitdepth) spans no
                more than `max_height`. If `max_height` is <= 0, then don't
                scale. Defaults to 0.
            bs (int): The block size as number of frames to stream from disk on
                every call to `next()` (a frame is a sample * nchannels). If bs
                == 0, then the entire WAV file will be read into memory before
                being re-serialized.
            downtoss (int): Keep every 1 out of every `downtoss` samples. This
                is a brutal way to downsample which clobbers high frequencies
                and causes aliasing. Defaults to 1 (so that no downsampling
                occurs by default).
            signed (bool): True to force PCM data to be treated as signed; False
                to force data to be treated as unsigned. By default (None) data
                will be treated as signed except in the case of 8-bit WAV which
                is unsigned.

        Raises:
            ValueError: if `downtoss` is smaller than 1.
        """
        if downtoss < 1:
            raise ValueError("downtoss must be a positive integer")
        self._filename = filename
        self.decoder = decoder_class
        self.max_width = max_width
        self.max_height = max_height
        self.bs = bs
        self._downtoss = downtoss
        if endchar is None:
            # AIFF is encoded big-endian, WAV little-endian
            endchar = ">" if self.decoder == aifc else "<"
        # Always store the result, so an explicitly passed endchar is honoured
        self.endchar = endchar
        self.signed = signed
        self._reset()
        logger.info("WavDecoder initialized for %s", filename)

    def __iter__(self):
        """The decoder is its own iterator (see `next()`)."""
        return self

    def __enter__(self):
        """Open the file and return the decoder itself."""
        self.open()
        logger.debug("Entered context manager for %s", self._filename)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file; exceptions from the `with` body propagate."""
        self.close()
        logger.debug("Exited contextmanager for %s", self._filename)

    def _reset(self):
        """Forget everything that was derived from an opened file."""
        self._wav_file = None
        self.params = None
        self.width = None
        self.height = None
        self._samp_fmt = None
        # index keeps track of the next frame in the _wav_file
        # We can't rely on the Wav_read.tell() because the docs say it is
        # implementation specific.
        self.index = None

    def open(self):
        """
        Open the underlying WAV or AIFF file, and set instance variables
        according to the file's parameters.

        If a file is already open it is closed first. If anything fails after
        the file has been opened (e.g. an unsupported sample width), the file
        is closed again before the exception propagates.

        Raises:
            ValueError: if the sample width of the file is not supported.
        """
        if self._wav_file is not None:
            # Don't leak the previous handle when open() is called twice
            self.close()
        wf = self.decoder.open(self._filename, 'rb')
        try:
            self._wav_file = wf
            self.index = 0
            self.params = _wave_params(*wf.getparams())
            nframes = self.params.nframes
            sampwidth = self.params.sampwidth
            # Largest peak-to-peak range representable at this bitdepth
            full_height = 2**(sampwidth * 8) - 1

            if self.max_width <= 0:
                # if max_width is set to 0 then use full width of waveform
                self.width = nframes
            else:
                self.width = min(self.max_width, nframes)

            if self.max_height <= 0:
                # If max-height is set at 0, then use full bitdepth
                self.height = full_height
            else:
                # Cap at the same full range that is used when unscaled
                self.height = min(self.max_height, full_height)
            logger.debug("height set to %d", self.height)

            if self.signed is None:
                # PCM data is signed, except for 8-bit WAV which is unsigned
                self.signed = not (sampwidth == 1 and self.decoder == wave)

            self._samp_fmt = self.struct_fmt_char
        except Exception:
            wf.close()
            self._reset()
            raise
        logger.debug("_samp_fmt set to %s", self._samp_fmt)
        logger.info("Opened WavDecoder for %s", self._filename)

    def close(self):
        """
        Close and reset decoder and underlying wave file.

        Does nothing if the decoder is not currently open.
        """
        wav_file = self._wav_file
        try:
            if wav_file is not None:
                wav_file.close()
        finally:
            # Reset even if closing the underlying file raised
            self._reset()

    def scale_x(self, x):
        """
        Scale `x` according to `max_width`
        """
        # (explicit cast to float needed for Python2)
        return x*min(1.0, float(self.width)/self.params.nframes)

    def scale_y(self, y):
        """
        Scale 'y' according to `max_height`

        Unsigned 8-bit samples are first re-centred around zero.
        """
        sampwidth = self.params.sampwidth
        bitdepth = sampwidth * 8
        divisor = 2**(bitdepth-1)
        # `height` is a peak-to-peak size, so half of it maps to full scale
        scale = (self.height * 0.5)/divisor
        if sampwidth == 1 and not self.signed:
            # 8-bit wav files are unsigned
            y -= divisor
        return y * scale

    @property
    def struct_fmt_char(self):
        """
        Calculates the character to use with `struct.unpack()` to decode sample
        bytes compatible with the data file's sample width.

        Supported PCM file formats:
            - 8-bit unsigned WAV
            - 8-bit signed AIFF
            - 16-bit signed WAV (little endian) and AIFF (big endian)
            - 32-bit signed WAV (little endian) and AIFF (big endian)

        Raises ValueError if `filename` is not a supported file type.

        see: https://docs.python.org/library/struct.html
        """
        sampwidth = self.params.sampwidth
        if sampwidth == 1 and not self.signed:
            logger.info("unsigned 8-bit ('B')")
            return 'B'
        elif sampwidth == 1:
            logger.info("signed 8-bit ('b')")
            return 'b'
        elif sampwidth == 2:
            logger.info("signed 16-bit ('h')")
            return 'h'
        elif sampwidth == 4:
            logger.info("signed 32-bit ('i')")
            return 'i'
        else:
            raise ValueError("Unsupported file type.")

    def next(self):
        """
        Read and decode the next bs frames and return channel-separated data.

        Opens the file first if `open()` has not been called yet.

        Returns data as a list of Points for each channel:
        [
         [Point(x=1, y=4), ...] # chan 1
         [Point(x=3, y=435), ..] # chan 2
        ]

        The x value of a Point is the scaled 1-based position of the sample in
        the file, also when `downtoss` > 1.

        Raises StopIteration when all frames have been read.
        """
        if self._wav_file is None:
            # Likely user didn't open(), do it for them:
            logger.info(("The Wav_reader does not exist; probably open() was"
                         " not called. Calling it now..."))
            self.open()
        p = self.params

        # Checking first also ends iteration for files without any frames
        remaining = p.nframes - self.index
        if remaining <= 0:
            logger.debug("No more frames")
            raise StopIteration
        if self.bs == 0:
            # Read all frames into memory if bs == 0:
            frames = remaining
        else:
            frames = min(self.bs, remaining)

        wav_bytes = self._wav_file.readframes(frames)
        logger.debug("Read %d frames", frames)
        fmt_str = '%s%d%s' % (self.endchar, p.nchannels * frames,
                              self._samp_fmt)
        data = struct.unpack(fmt_str, wav_bytes)

        # Keep the frames whose 0-based position in the file is a multiple of
        # downtoss, so the result is the same for every block size.
        step = self._downtoss
        skip = (-self.index) % step
        # 1-based position in the file of the first sample kept in this block
        first_x = self.index + 1 + skip

        # Extract the tuples of integers into a list of Points for each channel:
        sep_data = []
        for chan in xrange(p.nchannels):
            # de-interleave, then downsample:
            kept = data[chan::p.nchannels][skip::step]
            sep_data.append([
                Point(self.scale_x(first_x + i * step), self.scale_y(sample))
                for i, sample in enumerate(kept)
            ])
        self.index += frames
        return sep_data

    # alias for python3-style iterators:
    __next__ = next