Purebasic Alexa with WASAPI
Posted: Thu Sep 11, 2025 2:19 pm
Hey guys, I wanted to try building my own Alex with Purebasic.
What already works:
Device selection, level detection, recording of microphone sound for a maximum of 5 seconds.
The recording will then be transcribed later with OpenAI/Whisper, and ChatGPT will be used to analyze the spoken text and assign it to defined commands.
What isn't working properly yet is the recording. The sound is poor and compressed.
Can anyone help?
What already works:
Device selection, level detection, recording of microphone sound for a maximum of 5 seconds.
The recording will then be transcribed later with OpenAI/Whisper, and ChatGPT will be used to analyze the spoken text and assign it to defined commands.
What isn't working properly yet is the recording. The sound is poor and compressed.
Can anyone help?
Code: Select all
EnableExplicit
Procedure SetGUID(*g.GUID, d1.l, d2.w, d3.w, b0.b, b1.b, b2.b, b3.b, b4.b, b5.b, b6.b, b7.b)
*g\Data1=d1 : *g\Data2=d2 : *g\Data3=d3
*g\Data4[0]=b0 : *g\Data4[1]=b1 : *g\Data4[2]=b2 : *g\Data4[3]=b3
*g\Data4[4]=b4 : *g\Data4[5]=b5 : *g\Data4[6]=b6 : *g\Data4[7]=b7
EndProcedure
; ---------- Constants ----------
#WAVE_FORMAT_PCM = 1
#WAVE_FORMAT_IEEE_FLOAT = 3
#WAVE_FORMAT_EXTENSIBLE = $FFFE
#AUDCLNT_SHAREMODE_SHARED = 0
#AUDCLNT_STREAMFLAGS_NONE = 0
#AUDCLNT_BUFFERFLAGS_SILENT = $00000002
#COINIT_MULTITHREADED = 0
#CLSCTX_INPROC_SERVER = 1
#S_OK = 0
#DEVICE_STATE_ACTIVE = $00000001
#STGM_READ = 0
; UI IDs
#Win = 100
#CmbDev = 101
#BtnRefresh = 102
#BtnStart = 103
#BtnStop = 104
#Canvas = 105
#Log = 106
#ChkAPI = 107
#InpURL = 108
#InpBearer = 109
#SliderThresh = 110
#SpinMin = 111
#SpinHang = 112
#SpinMaxSeg = 113
#TimerUI = 199
; Defaults
#CURL_PATH = "curl"
#API_URL_DEFAULT = "https://api.example.com/transcribe"
#API_BEARER_DEFAULT = ""
#VAD_THRESH_DEFAULT = 900.0
#VAD_MIN_MS_DEFAULT = 400
#VAD_HANG_MS_DEFAULT = 600
#MAX_SEG_MS_DEFAULT = 5000
; ---------- PROPVARIANT (x64-safe) ----------
Structure PROPERTYKEY
fmtid.GUID
pid.l
EndStructure
Structure PROPVARIANT
vt.w
wReserved1.w
wReserved2.w
wReserved3.w
uPtr.q ; Union-Pointer (z.B. LPWSTR)
uPtr2.q ; Padding/zweiter Slot
EndStructure
#VT_LPWSTR = 31
; ---------- COM Interfaces (Pointer-Parameter untypisiert!) ----------
Interface IMMDeviceEnumerator
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
EnumAudioEndpoints(dataFlow.l, dwStateMask.l, *ppDevices)
GetDefaultAudioEndpoint(dataFlow.l, role.l, *ppDevice)
GetDevice(pwstrDeviceId.i, *ppDevice)
RegisterEndpointNotificationCallback(*client)
UnregisterEndpointNotificationCallback(*client)
EndInterface
Interface IMMDeviceCollection
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
GetCount(*pc)
Item(uIndex.l, *ppDevice)
EndInterface
Interface IMMDevice
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
Activate(*iid.GUID, dwClsCtx.l, *pActivationParams, *ppInterface)
OpenPropertyStore(stgmAccess.l, *ppProperties)
GetId(*ppwstrId)
GetState(*pdwState)
EndInterface
Interface IPropertyStore
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
GetCount(*cProps)
GetAt(i.l, *pkey.PROPERTYKEY)
GetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
SetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
Commit()
EndInterface
Interface IAudioClient
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
Initialize(ShareMode.l, StreamFlags.l, hnsBufferDuration.q, hnsPeriodicity.q, *pFormat.WAVEFORMATEX, *pSessionGuid.GUID)
GetBufferSize(*pNumBufferFrames)
GetStreamLatency(*phnsLatency)
GetCurrentPadding(*pNumPaddingFrames)
IsFormatSupported(ShareMode.l, *pFormat.WAVEFORMATEX, *ppClosestMatch)
GetMixFormat(*ppDeviceFormat)
GetDevicePeriod(*phnsDefaultDevicePeriod, *phnsMinimumDevicePeriod)
Start() : Stop() : Reset()
SetEventHandle(hEvent.i)
GetService(*iid.GUID, *ppv)
EndInterface
Interface IAudioCaptureClient
QueryInterface(*iid.GUID, *ppv)
AddRef() : Release()
GetBuffer(*ppData, *pNumFramesToRead, *pdwFlags, *pu64DevicePosition, *pu64QPCPosition)
ReleaseBuffer(NumFramesRead.l)
GetNextPacketSize(*pNumFramesInNextPacket)
EndInterface
; ---------- Imports ----------
Import "ole32.lib"
CoInitializeEx(*pvReserved, coInit.l)
CoCreateInstance(*rclsid.GUID, *pUnkOuter, dwClsContext.l, *riid.GUID, *ppv)
CoTaskMemFree(*pv)
CoUninitialize()
PropVariantClear(*pv.PROPVARIANT)
EndImport
; ---------- GUIDs ----------
Global CLSID_MMDeviceEnumerator.GUID
Global IID_IMMDeviceEnumerator.GUID
Global IID_IMMDevice.GUID
Global IID_IMMDeviceCollection.GUID
Global IID_IPropertyStore.GUID
Global IID_IAudioClient.GUID
Global IID_IAudioCaptureClient.GUID
Global KSDATAFORMAT_SUBTYPE_PCM.GUID
Global KSDATAFORMAT_SUBTYPE_IEEE_FLOAT.GUID
Global PKEY_Device_FriendlyName.PROPERTYKEY
SetGUID(@CLSID_MMDeviceEnumerator, $BCDE0395, $E52F, $467C, $8E, $3D, $C4, $57, $92, $91, $69, $2E)
SetGUID(@IID_IMMDeviceEnumerator, $A95664D2, $9614, $4F35, $A7, $46, $DE, $8D, $B6, $36, $17, $E6)
SetGUID(@IID_IMMDevice, $D666063F, $1587, $4E43, $81, $F1, $B9, $63, $F5, $54, $E6, $4C)
SetGUID(@IID_IMMDeviceCollection, $0BD7A1BE, $7A1A, $44DB, $83, $97, $C0, $F9, $26, $C3, $99, $C4)
SetGUID(@IID_IPropertyStore, $886D8EEB, $8CF2, $4446, $8D, $02, $CD, $BA, $1D, $BD, $CF, $99)
SetGUID(@IID_IAudioClient, $1CB9AD4C, $DBFA, $4C32, $B1, $78, $C2, $F5, $68, $A7, $03, $B2)
SetGUID(@IID_IAudioCaptureClient, $C8ADBD64, $E71E, $48A0, $A4, $DE, $18, $5C, $39, $5C, $D3, $17)
SetGUID(@KSDATAFORMAT_SUBTYPE_PCM, $00000001, $0000, $0010, $80, $00, $00, $AA, $00, $38, $9B, $71)
SetGUID(@KSDATAFORMAT_SUBTYPE_IEEE_FLOAT,$00000003, $0000, $0010, $80, $00, $00, $AA, $00, $38, $9B, $71)
; PKEY_Device_FriendlyName = {A45C254E-DF1C-4EFD-8020-67D146A850E0}, 14
SetGUID(@PKEY_Device_FriendlyName\fmtid, $A45C254E, $DF1C, $4EFD, $80, $20, $67, $D1, $46, $A8, $50, $E0)
PKEY_Device_FriendlyName\pid = 14
; ---------- Globals ----------
Global *gEnum.IMMDeviceEnumerator, *gDev.IMMDevice, *gClient.IAudioClient, *gCap.IAudioCaptureClient
Global *gMixFmt.WAVEFORMATEX, *gInitFmt.WAVEFORMATEX, g_initFmtOwner.i = 0
Global g_running.i = #False
Global g_rms.d = 0.0, g_peak.d = 0.0
; Device list
Global Dim g_devNames.s(0)
Global Dim g_devIds.s(0)
Global g_devCount.i = 0
; WAV Out PCM16 (aus tatsächlichem Init-Format abgeleitet!)
Global g_outFmt.WAVEFORMATEX
Global segMem, segSize.l = 0, segCap.l = 0
; VAD / Segmentation
Global g_vadThresh.d = #VAD_THRESH_DEFAULT
Global g_vadMinMs.i = #VAD_MIN_MS_DEFAULT
Global g_vadHangMs.i = #VAD_HANG_MS_DEFAULT
Global g_maxSegMs.i = #MAX_SEG_MS_DEFAULT
Global speechStart.q = 0
Global lastVoice.q = 0
Global voicedBytes.q = 0
Global isSpeech.i = #False
; API
Global g_apiEnabled.i = #False
Global g_apiUrl.s = #API_URL_DEFAULT
Global g_apiBearer.s = #API_BEARER_DEFAULT
; ---------- Log ----------
Procedure Log_(msg.s)
If IsGadget(#Log)
AddGadgetItem(#Log, -1, FormatDate("[%hh:%ii:%ss] ", Date()) + msg)
SetGadgetState(#Log, CountGadgetItems(#Log)-1)
EndIf
EndProcedure
; ---------- Helpers ----------
Procedure SetupOutPCM16_FromInit(*init.WAVEFORMATEX)
; Leitet den WAV-Header DIREKT aus dem tatsächlich initialisierten Format ab
g_outFmt\wFormatTag = #WAVE_FORMAT_PCM
g_outFmt\nChannels = *init\nChannels
g_outFmt\nSamplesPerSec = *init\nSamplesPerSec
g_outFmt\wBitsPerSample = 16
g_outFmt\nBlockAlign = g_outFmt\nChannels * 2
g_outFmt\nAvgBytesPerSec= g_outFmt\nSamplesPerSec * g_outFmt\nBlockAlign
g_outFmt\cbSize = 0
EndProcedure
Procedure SegReset()
If segMem : FreeMemory(segMem) : EndIf
segCap = 44 + g_outFmt\nAvgBytesPerSec * Int(g_maxSegMs/1000.0 + 1) + 65536
segMem = AllocateMemory(segCap) : segSize = 0
EndProcedure
Procedure SegAppend(*src, bytes.l)
If segSize + bytes > segCap : bytes = segCap - segSize : EndIf
If bytes > 0 : CopyMemory(*src, segMem + segSize, bytes) : segSize + bytes : EndIf
EndProcedure
Procedure WriteWAV(file.s, *data, dataLen.l, *fmt.WAVEFORMATEX)
Protected f = CreateFile(#PB_Any, file)
If f
WriteString(f, "RIFF", #PB_Ascii) : WriteLong(f, dataLen + 36)
WriteString(f, "WAVE", #PB_Ascii)
WriteString(f, "fmt ", #PB_Ascii) : WriteLong(f, 16)
WriteWord(f, *fmt\wFormatTag) : WriteWord(f, *fmt\nChannels)
WriteLong(f, *fmt\nSamplesPerSec) : WriteLong(f, *fmt\nAvgBytesPerSec)
WriteWord(f, *fmt\nBlockAlign) : WriteWord(f, *fmt\wBitsPerSample)
WriteString(f, "data", #PB_Ascii) : WriteLong(f, dataLen)
WriteData(f, *data, dataLen) : CloseFile(f) : ProcedureReturn #True
EndIf
ProcedureReturn #False
EndProcedure
Procedure.s TempWav()
ProcedureReturn GetTemporaryDirectory() + "seg_" + Str(Date()) + "_" + Str(Random(9999)) + ".wav"
EndProcedure
Procedure.d RMS16Mono(*buf, frames.l, chans.i)
Protected i, s.l, v.d, acc.d, stride.l = chans * 2, *p = *buf
For i=0 To frames-1
s = PeekW(*p) : If s > 32767 : s - 65536 : EndIf
v = s : acc + v*v : *p + stride
Next
If frames=0 : ProcedureReturn 0 : EndIf
ProcedureReturn Sqr(acc / frames)
EndProcedure
Procedure Float32ToPCM16(*src, frames.l, chans.i, srcBlock.i, *dst)
Protected i, ch, outIndex.l = 0, v.f
For i=0 To frames-1
For ch=0 To chans-1
v = PeekF(*src + i*srcBlock + ch*4)
If v > 1.0 : v = 1.0 : ElseIf v < -1.0 : v = -1.0 : EndIf
PokeW(*dst + outIndex, Int(v * 32767.0)) : outIndex + 2
Next
Next
EndProcedure
Procedure.i IsIEEEFloat32(*fmt.WAVEFORMATEX)
If *fmt\wBitsPerSample <> 32 : ProcedureReturn #False : EndIf
If *fmt\wFormatTag = #WAVE_FORMAT_IEEE_FLOAT : ProcedureReturn #True : EndIf
If *fmt\wFormatTag = #WAVE_FORMAT_EXTENSIBLE
Protected *ext.WAVEFORMATEXTENSIBLE = *fmt
If CompareMemory(@*ext\SubFormat, @KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, SizeOf(GUID)) = 0
ProcedureReturn #True
EndIf
EndIf
ProcedureReturn #False
EndProcedure
; ---------- Device Enum ----------
; Interface IPropertyStore ; re-declare for safety (PB sometimes needs it before Import)
; QueryInterface(*iid.GUID, *ppv)
; AddRef() : Release()
; GetCount(*cProps.l)
; GetAt(i.l, *pkey.PROPERTYKEY)
; GetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
; SetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
; Commit()
; EndInterface
Global *gEnum.IMMDeviceEnumerator
Global Dim g_devNames.s(0)
Global Dim g_devIds.s(0)
Global g_devCount.i = 0
Import "ole32.lib"
EndImport
Global PKEY_Device_FriendlyName.PROPERTYKEY
SetGUID(@PKEY_Device_FriendlyName\fmtid, $A45C254E, $DF1C, $4EFD, $80, $20, $67, $D1, $46, $A8, $50, $E0)
PKEY_Device_FriendlyName\pid = 14
Procedure.i RefreshDeviceList()
If *gEnum = 0
If CoCreateInstance(@CLSID_MMDeviceEnumerator, #Null, #CLSCTX_INPROC_SERVER, @IID_IMMDeviceEnumerator, @*gEnum)
Log_("MMDeviceEnumerator nicht verfügbar.") : ProcedureReturn 0
EndIf
EndIf
Protected *col.IMMDeviceCollection
If *gEnum\EnumAudioEndpoints(1, #DEVICE_STATE_ACTIVE, @*col) ; eCapture
Log_("EnumAudioEndpoints fehlgeschlagen.") : ProcedureReturn 0
EndIf
Protected count.l
*col\GetCount(@count)
ReDim g_devNames.s(count-1)
ReDim g_devIds.s(count-1)
g_devCount = count
ClearGadgetItems(#CmbDev)
Protected i, *dev.IMMDevice, *store.IPropertyStore, pv.PROPVARIANT, name.s, *wz, id.s
For i=0 To count-1
*col\Item(i, @*dev)
; FriendlyName lesen
name = "Unbekannt"
If *dev\OpenPropertyStore(#STGM_READ, @*store) = 0
If *store\GetValue(@PKEY_Device_FriendlyName, @pv) = 0
If pv\vt = #VT_LPWSTR And pv\uPtr
name = PeekS(pv\uPtr, -1, #PB_Unicode)
CoTaskMemFree(pv\uPtr) : pv\uPtr = 0
EndIf
PropVariantClear(@pv)
EndIf
*store\Release()
EndIf
; Device-ID
*dev\GetId(@*wz)
If *wz
id = PeekS(*wz, -1, #PB_Unicode)
CoTaskMemFree(*wz)
Else
id = "unknown-id"
EndIf
g_devNames(i) = name
g_devIds(i) = id
AddGadgetItem(#CmbDev, -1, name)
*dev\Release()
Next
*col\Release()
If count>0 : SetGadgetState(#CmbDev, 0) : EndIf
Log_("Geräteliste aktualisiert (" + Str(count) + " aktiv).")
ProcedureReturn count
EndProcedure
; ---------- Start/Stop ----------
Global selectedId.s
Procedure.i WASAPI_Start()
CoInitializeEx(#Null, #COINIT_MULTITHREADED)
Protected hr.l
If *gEnum = 0
If CoCreateInstance(@CLSID_MMDeviceEnumerator, #Null, #CLSCTX_INPROC_SERVER, @IID_IMMDeviceEnumerator, @*gEnum)
Log_("MMDeviceEnumerator nicht verfügbar.") : ProcedureReturn #False
EndIf
EndIf
; Resolve selected device
Protected sel = GetGadgetState(#CmbDev)
If sel < 0 Or sel >= g_devCount
Log_("Kein Gerät ausgewählt – nutze Standard.")
If *gEnum\GetDefaultAudioEndpoint(1, 1, @*gDev) ; eCapture,eMultimedia
Log_("Kein Standard-Aufnahmegerät.") : ProcedureReturn #False
EndIf
Else
selectedId = g_devIds(sel)
Protected bytes = StringByteLength(selectedId, #PB_Unicode) + 2
Protected *wsz = AllocateMemory(bytes)
PokeS(*wsz, selectedId, -1, #PB_Unicode)
If *gEnum\GetDevice(*wsz, @*gDev)
Log_("GetDevice() fehlgeschlagen, nutze Standard.")
*gEnum\GetDefaultAudioEndpoint(1, 1, @*gDev)
EndIf
FreeMemory(*wsz)
EndIf
If *gDev\Activate(@IID_IAudioClient, #CLSCTX_INPROC_SERVER, #Null, @*gClient)
Log_("IAudioClient Activate() fehlgeschlagen.") : ProcedureReturn #False
EndIf
If *gInitFmt
If g_initFmtOwner=1 : FreeMemory(*gInitFmt) : ElseIf g_initFmtOwner=2 : CoTaskMemFree(*gInitFmt) : EndIf
*gInitFmt = 0 : g_initFmtOwner = 0
EndIf
; PCM16 versuchen: 48k -> 44.1k (Mono)
Protected desired.WAVEFORMATEX, *closest.WAVEFORMATEX = 0
desired\wFormatTag = #WAVE_FORMAT_PCM
desired\nChannels = 1
desired\wBitsPerSample = 16
desired\nBlockAlign = 2
desired\cbSize = 0
desired\nSamplesPerSec = 48000 : desired\nAvgBytesPerSec = 96000
hr = *gClient\IsFormatSupported(#AUDCLNT_SHAREMODE_SHARED, @desired, @*closest)
If hr <> #S_OK
desired\nSamplesPerSec = 44100 : desired\nAvgBytesPerSec = 88200
hr = *gClient\IsFormatSupported(#AUDCLNT_SHAREMODE_SHARED, @desired, @*closest)
EndIf
If hr = #S_OK
*gInitFmt = AllocateMemory(SizeOf(WAVEFORMATEX))
CopyMemory(@desired, *gInitFmt, SizeOf(WAVEFORMATEX))
g_initFmtOwner = 1
; WICHTIG: WAV-Header aus dem tatsächlichen Init-Format ableiten
SetupOutPCM16_FromInit(*gInitFmt)
SegReset()
If *gClient\Initialize(#AUDCLNT_SHAREMODE_SHARED, #AUDCLNT_STREAMFLAGS_NONE, 0, 0, *gInitFmt, #Null)
Log_("Initialize(PCM16) fehlgeschlagen.") : ProcedureReturn #False
EndIf
Log_("InitFmt: Tag=" + Str(*gInitFmt\wFormatTag) + " Bits=" + Str(*gInitFmt\wBitsPerSample) +
" SR=" + Str(*gInitFmt\nSamplesPerSec) + " Ch=" + Str(*gInitFmt\nChannels) +
" BlockAlign=" + Str(*gInitFmt\nBlockAlign))
Log_("Input: PCM16/" + Str(*gInitFmt\nSamplesPerSec) + " Hz/" + Str(*gInitFmt\nChannels) + " ch (direkt)")
Else
; Fallback: Mix/Closest → akzeptiere nur Float32
If *closest
*gInitFmt = *closest : g_initFmtOwner = 2
Else
If *gClient\GetMixFormat(@*gMixFmt)
Log_("GetMixFormat() fehlgeschlagen.") : ProcedureReturn #False
EndIf
*gInitFmt = *gMixFmt : g_initFmtOwner = 2
EndIf
If IsIEEEFloat32(*gInitFmt) = #False
Log_("Abbruch: Gerät liefert weder PCM16 noch Float32. Tag="+Str(*gInitFmt\wFormatTag)+" Bits="+Str(*gInitFmt\wBitsPerSample))
ProcedureReturn #False
EndIf
; WICHTIG: WAV-Header aus dem tatsächlichen (Float32) Init-Format ableiten
SetupOutPCM16_FromInit(*gInitFmt)
SegReset()
If *gClient\Initialize(#AUDCLNT_SHAREMODE_SHARED, #AUDCLNT_STREAMFLAGS_NONE, 0, 0, *gInitFmt, #Null)
Log_("Initialize(Fallback Float32) fehlgeschlagen.") : ProcedureReturn #False
EndIf
Log_("Input: Float32/" + Str(*gInitFmt\nSamplesPerSec) + " Hz/" + Str(*gInitFmt\nChannels) + " ch → convert to PCM16")
EndIf
If *gClient\GetService(@IID_IAudioCaptureClient, @*gCap)
Log_("GetService(IAudioCaptureClient) fehlgeschlagen.") : ProcedureReturn #False
EndIf
If *gClient\Start()
Log_("Start() fehlgeschlagen.") : ProcedureReturn #False
EndIf
g_running = #True : g_rms = 0 : g_peak = 0
speechStart = 0 : lastVoice = ElapsedMilliseconds() : voicedBytes = 0 : isSpeech = #False
ProcedureReturn #True
EndProcedure
Procedure WASAPI_Stop()
If g_running : *gClient\Stop() : g_running = #False : EndIf
If *gCap : *gCap\Release() : *gCap = 0 : EndIf
If *gClient : *gClient\Release() : *gClient = 0 : EndIf
If *gDev : *gDev\Release() : *gDev = 0 : EndIf
If *gEnum : *gEnum\Release() : *gEnum = 0 : EndIf
If *gInitFmt
If g_initFmtOwner=1 : FreeMemory(*gInitFmt) : ElseIf g_initFmtOwner=2 : CoTaskMemFree(*gInitFmt) : EndIf
*gInitFmt = 0 : g_initFmtOwner = 0
EndIf
If *gMixFmt : CoTaskMemFree(*gMixFmt) : *gMixFmt = 0 : EndIf
CoUninitialize()
Log_("Gestoppt.")
EndProcedure
; ---------- API Upload ----------
Procedure.s TranscribeViaAPI(wav.s)
If g_apiEnabled = #False : ProcedureReturn "" : EndIf
If FileSize(wav) <= 0
Log_("API: Datei fehlt -> " + wav) : ProcedureReturn "" : EndIf
Protected tmp.s = GetTemporaryDirectory() + "asr_" + Str(Date()) + "_" + Str(Random(9999)) + ".json"
Protected args.s = "-s -S -X POST -H " + Chr(34) + "Accept: application/json" + Chr(34) + " "
If g_apiBearer <> ""
args + "-H " + Chr(34) + "Authorization: Bearer " + g_apiBearer + Chr(34) + " "
EndIf
args + "-F " + Chr(34) + "file=@" + wav + ";type=audio/wav" + Chr(34) + " "
args + "-o " + Chr(34) + tmp + Chr(34) + " " + g_apiUrl
If RunProgram(#CURL_PATH, args, "", #PB_Program_Wait | #PB_Program_Hide)
If ReadFile(#PB_Any, tmp)
Protected res.s = ""
While Eof(#PB_Any) = 0 : res + ReadString(#PB_Any, #PB_File_IgnoreEOL) : Wend
CloseFile(#PB_Any) : DeleteFile(tmp)
ProcedureReturn res
EndIf
EndIf
ProcedureReturn ""
EndProcedure
; ---------- Poll + VAD ----------
Procedure Poll()
If g_running = #False Or *gCap = 0 : ProcedureReturn : EndIf
Protected nxt.l
While *gCap\GetNextPacketSize(@nxt) = 0 And nxt > 0
Protected *pData, frames.l, flags.l, devPos.q, qpc.q
If *gCap\GetBuffer(@*pData, @frames, @flags, @devPos, @qpc) <> 0 : Break : EndIf
If frames > 0
If (flags & #AUDCLNT_BUFFERFLAGS_SILENT) = 0
Static *tmp
Protected outBytes.l = frames * (g_outFmt\nBlockAlign)
If *tmp = 0 : *tmp = AllocateMemory(outBytes) : ElseIf MemorySize(*tmp) < outBytes : *tmp = ReAllocateMemory(*tmp, outBytes) : EndIf
If *gInitFmt\wFormatTag = #WAVE_FORMAT_PCM And *gInitFmt\wBitsPerSample = 16
; SegAppend(*pData, outBytes)
Protected srcBytes.l = frames * (*gInitFmt\nBlockAlign)
SegAppend(*pData, srcBytes)
g_rms = RMS16Mono(*pData, frames, g_outFmt\nChannels)
Else
Float32ToPCM16(*pData, frames, *gInitFmt\nChannels, *gInitFmt\nBlockAlign, *tmp)
SegAppend(*tmp, outBytes)
g_rms = RMS16Mono(*tmp, frames, g_outFmt\nChannels)
EndIf
If g_rms > g_peak : g_peak = g_rms : EndIf
voicedBytes + outBytes
lastVoice = ElapsedMilliseconds()
Else
g_rms * 0.9
EndIf
; --- VAD Logik ---
Protected now.q = ElapsedMilliseconds()
Protected minBytes.l = g_outFmt\nAvgBytesPerSec * (g_vadMinMs/1000.0)
If g_rms >= g_vadThresh And voicedBytes >= minBytes And isSpeech = #False
isSpeech = #True : speechStart = now
Log_(">> Speech START")
EndIf
If isSpeech
If (now - lastVoice) > g_vadHangMs Or (now - speechStart) > g_maxSegMs
Protected wav.s = TempWav()
If WriteWAV(wav, segMem, segSize, @g_outFmt)
Log_("Segment gespeichert: " + wav + " (" + Str(segSize) + " B)")
If g_apiEnabled
Protected res.s = TranscribeViaAPI(wav)
If res <> "" : Log_("API: " + Left(res, 200)) : Else : Log_("API: keine Antwort/Fehler") : EndIf
EndIf
EndIf
isSpeech = #False : voicedBytes=0 : SegReset()
Log_("<< Speech END")
EndIf
Else
If (now - lastVoice) > 3000 And segSize > 0
SegReset() : voicedBytes = 0
EndIf
EndIf
EndIf
*gCap\ReleaseBuffer(frames)
Wend
EndProcedure
; ---------- UI ----------
Procedure DrawMeter()
Protected w = GadgetWidth(#Canvas), h = GadgetHeight(#Canvas)
If StartDrawing(CanvasOutput(#Canvas))
Box(0,0,w,h, RGB(25,25,28))
Protected pad=10, barW = w-2*pad
Protected n.d = g_rms / 4000.0 : If n>1:n=1:EndIf
Protected peakN.d = g_peak/4000.0 : If peakN>1:peakN=1:EndIf
Protected y = h - pad
Box(pad, pad, barW, h-2*pad, RGB(45,45,55))
Protected lvlH = Int((h-2*pad)*n)
Box(pad, y - lvlH, barW, lvlH, RGB(70,190,110))
Line(pad, y - Int((h-2*pad)*peakN), barW, 0, RGB(220,220,90))
DrawingMode(#PB_2DDrawing_Transparent)
DrawText(pad, 4, "RMS: " + StrD(g_rms,1) + " Peak: " + StrD(g_peak,1), RGB(220,220,230))
StopDrawing()
EndIf
EndProcedure
Procedure BuildUI()
OpenWindow(#Win, 200, 120, 760, 520, "WASAPI Recorder — SR-Fix", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)
TextGadget(#PB_Any, 10, 12, 60, 22, "Gerät:")
ComboBoxGadget(#CmbDev, 70, 10, 360, 24)
ButtonGadget(#BtnRefresh, 440, 10, 80, 24, "Refresh")
ButtonGadget(#BtnStart, 530, 10, 80, 24, "Start")
ButtonGadget(#BtnStop, 620, 10, 80, 24, "Stop") : DisableGadget(#BtnStop, #True)
CanvasGadget(#Canvas, 10, 50, 730, 220)
; VAD Controls
TextGadget(#PB_Any, 10, 280, 120, 20, "VAD-Schwelle")
TrackBarGadget(#SliderThresh, 10, 300, 220, 24, 0, 4000) : SetGadgetState(#SliderThresh, Int(g_vadThresh))
TextGadget(#PB_Any, 240, 280, 160, 20, "Min. Sprachdauer (ms)")
SpinGadget(#SpinMin, 240, 300, 120, 24, 50, 5000, #PB_Spin_Numeric) : SetGadgetState(#SpinMin, g_vadMinMs)
TextGadget(#PB_Any, 370, 280, 120, 20, "Hangover (ms)")
SpinGadget(#SpinHang, 370, 300, 120, 24, 100, 5000, #PB_Spin_Numeric) : SetGadgetState(#SpinHang, g_vadHangMs)
TextGadget(#PB_Any, 500, 280, 130, 20, "Max Segment (ms)")
SpinGadget(#SpinMaxSeg, 500, 300, 120, 24, 1000, 60000, #PB_Spin_Numeric) : SetGadgetState(#SpinMaxSeg, g_maxSegMs)
; API Controls
CheckBoxGadget(#ChkAPI, 10, 340, 220, 22, "Transkription via API aktiv")
SetGadgetState(#ChkAPI, g_apiEnabled)
TextGadget(#PB_Any, 10, 370, 80, 20, "API URL")
StringGadget(#InpURL, 90, 368, 300, 24, #API_URL_DEFAULT)
TextGadget(#PB_Any, 400, 370, 110, 20, "Bearer Token")
StringGadget(#InpBearer, 510, 368, 230, 24, #API_BEARER_DEFAULT)
ListViewGadget(#Log, 10, 400, 730, 110)
Log_("Bereit. WAV-Header folgt exakt dem initiierten Format.")
RefreshDeviceList()
AddWindowTimer(#Win, #TimerUI, 33)
EndProcedure
; ---------- Main ----------
BuildUI()
Define ev, gad
Repeat
ev = WaitWindowEvent(10)
Select ev
Case #PB_Event_Timer
If EventTimer() = #TimerUI
DrawMeter()
If g_running : Poll() : EndIf
EndIf
Case #PB_Event_Gadget
gad = EventGadget()
Select gad
Case #BtnRefresh
RefreshDeviceList()
Case #BtnStart
g_apiEnabled = GetGadgetState(#ChkAPI)
g_apiUrl = GetGadgetText(#InpURL)
g_apiBearer = GetGadgetText(#InpBearer)
g_vadThresh = GetGadgetState(#SliderThresh)
g_vadMinMs = GetGadgetState(#SpinMin)
g_vadHangMs = GetGadgetState(#SpinHang)
g_maxSegMs = GetGadgetState(#SpinMaxSeg)
If WASAPI_Start()
DisableGadget(#BtnStart, #True) : DisableGadget(#BtnStop, #False)
EndIf
Case #BtnStop
WASAPI_Stop()
DisableGadget(#BtnStart, #False) : DisableGadget(#BtnStop, #True)
Case #SliderThresh : g_vadThresh = GetGadgetState(#SliderThresh)
Case #SpinMin : g_vadMinMs = GetGadgetState(#SpinMin)
Case #SpinHang : g_vadHangMs = GetGadgetState(#SpinHang)
Case #SpinMaxSeg : g_maxSegMs = GetGadgetState(#SpinMaxSeg)
Case #ChkAPI : g_apiEnabled= GetGadgetState(#ChkAPI)
Case #InpURL : g_apiUrl = GetGadgetText(#InpURL)
Case #InpBearer : g_apiBearer = GetGadgetText(#InpBearer)
EndSelect
Case #PB_Event_CloseWindow
Break
EndSelect
ForEver
WASAPI_Stop()
End