Recording audio

This was really an offshoot from last year's mucking around that resulted in the opus posts earlier this year.

Today we're going to be looking at a couple of things - the most basic file format for wave files, and the code needed to capture into them.

File format

The WAV file format is reasonably well described in Wikipedia. RIFF is a classic tag/length/value scheme, which is also what DXBC and DXIL use.

The minimal tags/sections we produce are these.

I simply added this function to the project (eliding some of the WriteFile error checks).

HRESULT DirectX::WriteWAVDataToFile(
  const wchar_t* szFileName,
  const DirectX::WAVData& data) noexcept
{
  if (!szFileName)
    return E_INVALIDARG;

  if (data.seek != nullptr || data.seekCount != 0 ||
      data.loopLength != 0 || data.loopStart != 0)
  {
    return HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED);
  }

  // open the file
#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
  ScopedHandle hFile(safe_handle(CreateFile2(
    szFileName,
    GENERIC_WRITE, FILE_SHARE_READ, CREATE_ALWAYS,
    nullptr)));
#else
  ScopedHandle hFile(safe_handle(CreateFileW(
    szFileName,
    GENERIC_WRITE, FILE_SHARE_READ,
    nullptr,
    CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL,
    nullptr)));
#endif

  if (!hFile)
  {
    return HRESULT_FROM_WIN32(GetLastError());
  }

  DWORD written;
  // RIFF 'RIFF'
  {
    RIFFChunkHeader header;
    header.tag = FOURCC_RIFF_TAG;
    header.size =
      sizeof(header.riff) +
      sizeof(RIFFChunk) +
      data.wfx->cbSize + sizeof(WAVEFORMATEX) +
      sizeof(RIFFChunk) + data.audioBytes;
    header.riff = FOURCC_WAVE_FILE_TAG;
    WriteFile(hFile.get(), &header, sizeof(header), &written, NULL);
  }

  // RIFF 'fmt '
  {
    RIFFChunk format;
    format.tag = FOURCC_FORMAT_TAG;
    format.size = data.wfx->cbSize + sizeof(WAVEFORMATEX);
    WriteFile(hFile.get(), &format, sizeof(format), &written, NULL);
    WriteFile(hFile.get(), data.wfx, data.wfx->cbSize + sizeof(WAVEFORMATEX), &written, NULL);
  }

  // RIFF 'data'
  {
    RIFFChunk dataChunk;
    dataChunk.tag = FOURCC_DATA_TAG;
    dataChunk.size = data.audioBytes;
    WriteFile(hFile.get(), &dataChunk, sizeof(dataChunk), &written, NULL);
    WriteFile(hFile.get(), data.startAudio, data.audioBytes, &written, NULL);
  }

  return S_OK;
}

Capturing audio

This is pretty well covered in the Capturing a stream article on the Microsoft website, so I'm going to focus on the differences here.

You can simply drop this code on the project we had back in the day.

#include <audioclient.h>
#include <mmdeviceapi.h>

//...

// REFERENCE_TIME is expressed in 100-nanosecond units, so one second is
// 10,000,000 units and one millisecond is 10,000 units.
#define REFTIMES_PER_SEC  10000000
#define REFTIMES_PER_MILLISEC  10000

// Class/interface GUIDs pulled via __uuidof so CoCreateInstance/GetService
// below can take them by reference without linking a GUID library.
const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
const IID IID_IAudioClient = __uuidof(IAudioClient);
const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);

HRESULT RunFileRecord()
{
  // Based on https://learn.microsoft.com/en-us/windows/win32/coreaudio/capturing-a-stream
  HRESULT hr = S_OK;
  REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
  REFERENCE_TIME hnsActualDuration;
  UINT32 bufferFrameCount;
  UINT32 numFramesAvailable;
  IMMDeviceEnumerator* pEnumerator = NULL;
  IMMDevice* pDevice = NULL;
  IAudioClient* pAudioClient = NULL;
  IAudioCaptureClient* pCaptureClient = NULL;
  WAVEFORMATEX* pwfx = NULL;
  WAVEFORMATEXTENSIBLE *extensible = NULL;
  UINT32 packetLength = 0;
  BOOL bDone = FALSE;
  BYTE* pData;
  DWORD flags;
  std::unique_ptr<byte[]> localFormat;
  std::unique_ptr<byte[]> outData;
  size_t outDataSize;
  byte *outDataCursor, *outDataEnd;
  const unsigned captureSeconds = 4;

  IFC(CoCreateInstance(
    CLSID_MMDeviceEnumerator, NULL,
    CLSCTX_ALL, IID_IMMDeviceEnumerator,
    (void**)&pEnumerator));
  IFC(pEnumerator->GetDefaultAudioEndpoint(eCapture, eConsole, &pDevice));
  IFC(pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&pAudioClient));
  IFC(pAudioClient->GetMixFormat(&pwfx));
  IFC(pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, 0,
     hnsRequestedDuration, 0, pwfx, NULL));
  IFC(pAudioClient->GetBufferSize(&bufferFrameCount));
  IFC(pAudioClient->GetService(IID_IAudioCaptureClient, (void**)&pCaptureClient));

  // Save the format for later
  localFormat.reset(new byte[pwfx->cbSize + sizeof(WAVEFORMATEX)]);
  memcpy(localFormat.get(), pwfx, pwfx->cbSize + sizeof(WAVEFORMATEX));
  wprintf(
    L"Data format tag=%u channels=%u samples-per-sec=%u "
    L"avg-bytes-per-sec=%u block-align=%u bits-per-mono-sample=%u extra-size=%u\n",
    (unsigned)pwfx->wFormatTag, (unsigned)pwfx->nChannels,
    (unsigned)pwfx->nSamplesPerSec, (unsigned)pwfx->nAvgBytesPerSec,
    (unsigned)pwfx->nBlockAlign, (unsigned)pwfx->wBitsPerSample,
    (unsigned)pwfx->cbSize);

  // Check supported format (should probably support WAVE_FORMAT_PCM too)
  if (pwfx->wFormatTag != WAVE_FORMAT_IEEE_FLOAT)
  {
    if (pwfx->wFormatTag != WAVE_FORMAT_EXTENSIBLE)
    {
      wprintf(
        L"Only supporting WAVE_FORMAT_IEEE_FLOAT(%u) or WAVE_FORMAT_EXTENSIBLE(%u) format\n",
        (unsigned)WAVE_FORMAT_PCM,
        (unsigned)WAVE_FORMAT_EXTENSIBLE);
      IFC(HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED));
    }
    extensible = (WAVEFORMATEXTENSIBLE*)pwfx;
    if (extensible->SubFormat != KSDATAFORMAT_SUBTYPE_IEEE_FLOAT)
    {
      wprintf(
        L"Only supporting KSDATAFORMAT_SUBTYPE_IEEE_FLOAT for WAVE_FORMAT_EXTENSIBLE(%u) format\n",
        (unsigned)WAVE_FORMAT_EXTENSIBLE);
      IFC(HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED));
    }
  }

  // Set up a buffer for captured data
  outDataSize = captureSeconds * pwfx->nAvgBytesPerSec;
  outData.reset(new byte[outDataSize]);
  outDataCursor = outData.get();
  outDataEnd = outDataCursor + outDataSize;

  // Calculate the actual duration of the allocated buffer.
  hnsActualDuration = (double)REFTIMES_PER_SEC * bufferFrameCount / pwfx->nSamplesPerSec;
  IFC(pAudioClient->Start());
  // Each loop fills about half of the shared buffer.
  while (bDone == FALSE)
  {
    // Sleep for half the buffer duration.
    Sleep(hnsActualDuration / REFTIMES_PER_MILLISEC / 2);

    IFC(pCaptureClient->GetNextPacketSize(&packetLength));

    while (packetLength != 0)
    {
      // Get the available data in the shared buffer.
      IFC(pCaptureClient->GetBuffer(&pData, &numFramesAvailable, &flags, NULL, NULL));

      // Copy the available capture data or generate silence.
      size_t copySize = std::min(
        outDataEnd - outDataCursor,
        (ptrdiff_t)(numFramesAvailable * pwfx->nBlockAlign));
      if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
      {
        memset(outDataCursor, 0, copySize);
      }
      else
      {
        memcpy(outDataCursor, pData, copySize);
      }
      outDataCursor += copySize;
      bDone = outDataCursor >= outDataEnd;

      IFC(pCaptureClient->ReleaseBuffer(numFramesAvailable));
      IFC(pCaptureClient->GetNextPacketSize(&packetLength));
    }
  }
  IFC(pAudioClient->Stop());
  // Write out the data in a WAV file.
  {
    DirectX::WAVData wavData = {};
    wavData.audioBytes = outDataSize;
    wavData.wfx = (const WAVEFORMATEX*)localFormat.get();
    wavData.startAudio = outData.get();
    DirectX::WriteWAVDataToFile(LR"(C:\nobackup\scratch.wav)", wavData);
  }
Cleanup:
  CoTaskMemFree(pwfx);
  if (pEnumerator) pEnumerator->Release();
  if (pDevice) pDevice->Release();
  if (pAudioClient) pAudioClient->Release();
  if (pCaptureClient) pCaptureClient->Release();
  return hr;
}

The interesting differences from the Microsoft sample are: the captured data is accumulated into a fixed-size in-memory buffer sized for a set number of seconds (rather than streamed to a sink object), the mix format is validated to be IEEE float before capture starts, and the result is saved with the WriteWAVDataToFile function shown above.

Happy audio recording!

Tags:  audio

Home