This was really an offshoot from last year's mucking around that resulted in the opus posts earlier this year.
Today we're going to be looking at a couple of things - the most basic file format for wave files, and the code needed to capture into them.
The WAV file format is reasonably well described on Wikipedia. RIFF is a classic tag/length/value scheme, which is also what DXBC and DXIL use.
The minimal tags/sections we produce are these.

- 'RIFF', with the WAVE file tag
- 'fmt ', holding a WAVEFORMATEX or an extension thereof
- 'data', holding the captured audio bytes

I simply added this function (eliding some of the WriteFile error checks).
HRESULT DirectX::WriteWAVDataToFile(
    const wchar_t* szFileName,
    const DirectX::WAVData& data) noexcept
{
    if (!szFileName)
        return E_INVALIDARG;

    // Seek tables and loop points aren't supported by this simple writer.
    if (data.seek != nullptr || data.seekCount != 0 ||
        data.loopLength != 0 || data.loopStart != 0)
    {
        return HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED);
    }

    // Open the file.
#if (_WIN32_WINNT >= _WIN32_WINNT_WIN8)
    ScopedHandle hFile(safe_handle(CreateFile2(
        szFileName,
        GENERIC_WRITE, FILE_SHARE_READ, CREATE_ALWAYS,
        nullptr)));
#else
    ScopedHandle hFile(safe_handle(CreateFileW(
        szFileName,
        GENERIC_WRITE, FILE_SHARE_READ,
        nullptr,
        CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL,
        nullptr)));
#endif
    if (!hFile)
    {
        return HRESULT_FROM_WIN32(GetLastError());
    }

    DWORD written;

    // RIFF 'RIFF' - the outer chunk; its size covers everything after the
    // tag/size pair: the WAVE tag plus the 'fmt ' and 'data' chunks.
    {
        RIFFChunkHeader header;
        header.tag = FOURCC_RIFF_TAG;
        header.size =
            sizeof(header.riff) +
            sizeof(RIFFChunk) +
            data.wfx->cbSize + sizeof(WAVEFORMATEX) +
            sizeof(RIFFChunk) + data.audioBytes;
        header.riff = FOURCC_WAVE_FILE_TAG;
        WriteFile(hFile.get(), &header, sizeof(header), &written, NULL);
    }

    // RIFF 'fmt ' - the WAVEFORMATEX plus any cbSize extension bytes.
    {
        RIFFChunk format;
        format.tag = FOURCC_FORMAT_TAG;
        format.size = data.wfx->cbSize + sizeof(WAVEFORMATEX);
        WriteFile(hFile.get(), &format, sizeof(format), &written, NULL);
        WriteFile(hFile.get(), data.wfx, data.wfx->cbSize + sizeof(WAVEFORMATEX), &written, NULL);
    }

    // RIFF 'data' - the raw audio bytes.
    {
        RIFFChunk dataChunk;
        dataChunk.tag = FOURCC_DATA_TAG;
        dataChunk.size = data.audioBytes;
        WriteFile(hFile.get(), &dataChunk, sizeof(dataChunk), &written, NULL);
        WriteFile(hFile.get(), data.startAudio, data.audioBytes, &written, NULL);
    }

    return S_OK;
}
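The RIFF helper types and the ScopedHandle plumbing above come from DirectXTK's WAVFileReader.cpp and PlatformHelpers.h. If you're building this standalone rather than inside DirectXTK, definitions along these lines should be equivalent (a sketch; check the DirectXTK sources for the real thing):

#include <cstdint>
#include <memory>

// An 8-byte chunk prefix: four-character tag plus payload size.
struct RIFFChunk
{
    uint32_t tag;
    uint32_t size;
};

// The outer RIFF header also carries the form type ('WAVE' here).
struct RIFFChunkHeader
{
    uint32_t tag;
    uint32_t size;
    uint32_t riff;
};

const uint32_t FOURCC_RIFF_TAG = MAKEFOURCC('R', 'I', 'F', 'F');
const uint32_t FOURCC_FORMAT_TAG = MAKEFOURCC('f', 'm', 't', ' ');
const uint32_t FOURCC_DATA_TAG = MAKEFOURCC('d', 'a', 't', 'a');
const uint32_t FOURCC_WAVE_FILE_TAG = MAKEFOURCC('W', 'A', 'V', 'E');

// RAII wrapper so the file handle closes on any exit path;
// safe_handle maps INVALID_HANDLE_VALUE to null so !hFile works.
struct handle_closer { void operator()(HANDLE h) noexcept { if (h) CloseHandle(h); } };
using ScopedHandle = std::unique_ptr<void, handle_closer>;
inline HANDLE safe_handle(HANDLE h) noexcept { return (h == INVALID_HANDLE_VALUE) ? nullptr : h; }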
This is pretty well covered in the Capturing a stream article on the Microsoft website, so I'm going to focus on the differences here.
You can simply drop this code into the project from back in the day.
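One note before the listing: the IFC macro used throughout is the usual check-the-HRESULT-and-jump-to-Cleanup helper from that project. If yours doesn't define it, a minimal version along these lines works (hypothetical, not the project's exact definition):

// Evaluate an HRESULT expression; on failure, stash it in hr and bail to Cleanup.
#define IFC(expr) do { hr = (expr); if (FAILED(hr)) goto Cleanup; } while (0)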
#include <audioclient.h>
#include <mmdeviceapi.h>
#include <algorithm> // std::min
#include <memory>    // std::unique_ptr
//...
#define REFTIMES_PER_SEC 10000000
#define REFTIMES_PER_MILLISEC 10000
const CLSID CLSID_MMDeviceEnumerator = __uuidof(MMDeviceEnumerator);
const IID IID_IMMDeviceEnumerator = __uuidof(IMMDeviceEnumerator);
const IID IID_IAudioClient = __uuidof(IAudioClient);
const IID IID_IAudioCaptureClient = __uuidof(IAudioCaptureClient);
HRESULT RunFileRecord()
{
    // Based on https://learn.microsoft.com/en-us/windows/win32/coreaudio/capturing-a-stream
    HRESULT hr = S_OK;
    REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
    REFERENCE_TIME hnsActualDuration;
    UINT32 bufferFrameCount;
    UINT32 numFramesAvailable;
    IMMDeviceEnumerator* pEnumerator = NULL;
    IMMDevice* pDevice = NULL;
    IAudioClient* pAudioClient = NULL;
    IAudioCaptureClient* pCaptureClient = NULL;
    WAVEFORMATEX* pwfx = NULL;
    WAVEFORMATEXTENSIBLE* extensible = NULL;
    UINT32 packetLength = 0;
    BOOL bDone = FALSE;
    BYTE* pData;
    DWORD flags;
    std::unique_ptr<byte[]> localFormat;
    std::unique_ptr<byte[]> outData;
    size_t outDataSize;
    byte *outDataCursor, *outDataEnd;
    const unsigned captureSeconds = 4;

    IFC(CoCreateInstance(
        CLSID_MMDeviceEnumerator, NULL,
        CLSCTX_ALL, IID_IMMDeviceEnumerator,
        (void**)&pEnumerator));
    IFC(pEnumerator->GetDefaultAudioEndpoint(eCapture, eConsole, &pDevice));
    IFC(pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&pAudioClient));
    IFC(pAudioClient->GetMixFormat(&pwfx));
    IFC(pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, 0,
        hnsRequestedDuration, 0, pwfx, NULL));
    IFC(pAudioClient->GetBufferSize(&bufferFrameCount));
    IFC(pAudioClient->GetService(IID_IAudioCaptureClient, (void**)&pCaptureClient));

    // Save the format for later, when we write out the WAV file.
    localFormat.reset(new byte[pwfx->cbSize + sizeof(WAVEFORMATEX)]);
    memcpy(localFormat.get(), pwfx, pwfx->cbSize + sizeof(WAVEFORMATEX));

    wprintf(
        L"Data format tag=%u channels=%u samples-per-sec=%u "
        L"avg-bytes-per-sec=%u block-align=%u bits-per-mono-sample=%u extra-size=%u\n",
        (unsigned)pwfx->wFormatTag, (unsigned)pwfx->nChannels,
        (unsigned)pwfx->nSamplesPerSec, (unsigned)pwfx->nAvgBytesPerSec,
        (unsigned)pwfx->nBlockAlign, (unsigned)pwfx->wBitsPerSample,
        (unsigned)pwfx->cbSize);

    // Check supported format (should probably support WAVE_FORMAT_PCM too).
    if (pwfx->wFormatTag != WAVE_FORMAT_IEEE_FLOAT)
    {
        if (pwfx->wFormatTag != WAVE_FORMAT_EXTENSIBLE)
        {
            wprintf(
                L"Only supporting WAVE_FORMAT_IEEE_FLOAT(%u) or WAVE_FORMAT_EXTENSIBLE(%u) format\n",
                (unsigned)WAVE_FORMAT_IEEE_FLOAT,
                (unsigned)WAVE_FORMAT_EXTENSIBLE);
            IFC(HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED));
        }
        extensible = (WAVEFORMATEXTENSIBLE*)pwfx;
        if (extensible->SubFormat != KSDATAFORMAT_SUBTYPE_IEEE_FLOAT)
        {
            wprintf(
                L"Only supporting KSDATAFORMAT_SUBTYPE_IEEE_FLOAT for WAVE_FORMAT_EXTENSIBLE(%u) format\n",
                (unsigned)WAVE_FORMAT_EXTENSIBLE);
            IFC(HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED));
        }
    }

    // Set up a buffer for captured data.
    outDataSize = captureSeconds * pwfx->nAvgBytesPerSec;
    outData.reset(new byte[outDataSize]);
    outDataCursor = outData.get();
    outDataEnd = outDataCursor + outDataSize;

    // Calculate the actual duration of the allocated buffer.
    hnsActualDuration = (double)REFTIMES_PER_SEC * bufferFrameCount / pwfx->nSamplesPerSec;

    IFC(pAudioClient->Start());

    // Each loop fills about half of the shared buffer.
    while (bDone == FALSE)
    {
        // Sleep for half the buffer duration.
        Sleep(hnsActualDuration / REFTIMES_PER_MILLISEC / 2);
        IFC(pCaptureClient->GetNextPacketSize(&packetLength));
        while (packetLength != 0)
        {
            // Get the available data in the shared buffer.
            IFC(pCaptureClient->GetBuffer(&pData, &numFramesAvailable, &flags, NULL, NULL));

            // Copy the available capture data or generate silence.
            // GetBuffer reports frames, so convert to bytes via nBlockAlign,
            // and clamp to the space left in our fixed-size buffer.
            size_t copySize = std::min(
                outDataEnd - outDataCursor,
                (ptrdiff_t)(numFramesAvailable * pwfx->nBlockAlign));
            if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
            {
                memset(outDataCursor, 0, copySize);
            }
            else
            {
                memcpy(outDataCursor, pData, copySize);
            }
            outDataCursor += copySize;
            bDone = outDataCursor >= outDataEnd;

            IFC(pCaptureClient->ReleaseBuffer(numFramesAvailable));
            IFC(pCaptureClient->GetNextPacketSize(&packetLength));
        }
    }

    IFC(pAudioClient->Stop());

    // Write out the data in a WAV file.
    {
        DirectX::WAVData wavData = {};
        wavData.audioBytes = outDataSize;
        wavData.wfx = (const WAVEFORMATEX*)localFormat.get();
        wavData.startAudio = outData.get();
        DirectX::WriteWAVDataToFile(LR"(C:\nobackup\scratch.wav)", wavData);
    }

Cleanup:
    CoTaskMemFree(pwfx);
    if (pEnumerator) pEnumerator->Release();
    if (pDevice) pDevice->Release();
    if (pAudioClient) pAudioClient->Release();
    if (pCaptureClient) pCaptureClient->Release();
    return hr;
}
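To try it out, a wrapper along these lines is all you need (a sketch, assuming a console project; COM must be initialized on the calling thread before CoCreateInstance):

int wmain()
{
    // Initialize COM for this thread; the apartment model isn't critical here.
    HRESULT hr = CoInitializeEx(nullptr, COINIT_MULTITHREADED);
    if (FAILED(hr)) return 1;
    hr = RunFileRecord();
    CoUninitialize();
    return FAILED(hr) ? 1 : 0;
}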
The interesting differences are these.

- The mix format is checked up front, so only IEEE float capture is handled (either directly or via a WAVE_FORMAT_EXTENSIBLE wrapper), and the format is saved so it can be written back out as the 'fmt ' chunk.
- Rather than handing data off to a sink, the loop copies into a preallocated buffer, converting the frame counts that GetBuffer reports into bytes in nBlockAlign units.
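A quick sanity check on that conversion: a common shared-mode mix format is 48 kHz stereo 32-bit float, so nBlockAlign is 2 channels × 4 bytes = 8 bytes per frame, and a 480-frame (10 ms) packet works out to 480 × 8 = 3,840 bytes.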
Happy audio recording!
Tags: audio