Page 1 of 1

Purebasic Alexa with WASAPI

Posted: Thu Sep 11, 2025 2:19 pm
by dige
Hey guys, I wanted to try building my own Alexa with Purebasic.

What already works:
Device selection, level detection, recording of microphone sound for a maximum of 5 seconds.

The recording will then be transcribed later with OpenAI/Whisper, and ChatGPT will be used to analyze the spoken text and assign it to defined commands.

What isn't working properly yet is the recording. The sound is poor and compressed.

Can anyone help?

Code: Select all

EnableExplicit
; Fills the GUID structure at *g from its parts: one 32-bit value (d1),
; two 16-bit values (d2, d3) and the eight trailing bytes b0..b7, in order.
Procedure SetGUID(*g.GUID, d1.l, d2.w, d3.w, b0.b, b1.b, b2.b, b3.b, b4.b, b5.b, b6.b, b7.b)
  With *g
    \Data1 = d1
    \Data2 = d2
    \Data3 = d3
    \Data4[0] = b0
    \Data4[1] = b1
    \Data4[2] = b2
    \Data4[3] = b3
    \Data4[4] = b4
    \Data4[5] = b5
    \Data4[6] = b6
    \Data4[7] = b7
  EndWith
EndProcedure

; ---------- Constants ----------
; Numeric values passed to the Windows audio / COM calls declared below.
#WAVE_FORMAT_PCM              = 1
#WAVE_FORMAT_IEEE_FLOAT       = 3
#WAVE_FORMAT_EXTENSIBLE       = $FFFE
#AUDCLNT_SHAREMODE_SHARED     = 0
#AUDCLNT_STREAMFLAGS_NONE     = 0
#AUDCLNT_BUFFERFLAGS_SILENT   = $00000002   ; tested against the flags returned by IAudioCaptureClient::GetBuffer
#COINIT_MULTITHREADED         = 0
#CLSCTX_INPROC_SERVER         = 1
#S_OK                         = 0
#DEVICE_STATE_ACTIVE          = $00000001   ; state mask passed to EnumAudioEndpoints
#STGM_READ                    = 0           ; access mode passed to OpenPropertyStore

; UI IDs
; Fixed window / gadget / timer numbers used by BuildUI and the event loop.
#Win = 100
#CmbDev = 101
#BtnRefresh = 102
#BtnStart = 103
#BtnStop = 104
#Canvas = 105
#Log = 106
#ChkAPI = 107
#InpURL = 108
#InpBearer = 109
#SliderThresh = 110
#SpinMin = 111
#SpinHang = 112
#SpinMaxSeg = 113
#TimerUI = 199

; Defaults
#CURL_PATH           = "curl"                                ; program started by TranscribeViaAPI
#API_URL_DEFAULT     = "https://api.example.com/transcribe"  ; initial text of the API URL field
#API_BEARER_DEFAULT  = ""                                    ; initial text of the bearer-token field
#VAD_THRESH_DEFAULT  = 900.0   ; compared against the RMS of 16-bit samples
#VAD_MIN_MS_DEFAULT  = 400     ; amount of audio (in ms) required before a speech start is reported
#VAD_HANG_MS_DEFAULT = 600     ; time without voice (in ms) after which a segment is closed
#MAX_SEG_MS_DEFAULT  = 5000    ; maximum segment length in ms; also sizes the segment buffer

; ---------- PROPVARIANT (x64-safe) ----------
; Key passed to IPropertyStore::GetValue: a format GUID plus a property id.
Structure PROPERTYKEY
  fmtid.GUID
  pid.l
EndStructure

; Simplified PROPVARIANT layout: an 8-byte header (type tag + reserved words)
; followed by two 8-byte slots. Only the vt tag and the first slot are read by this file.
Structure PROPVARIANT
  vt.w
  wReserved1.w
  wReserved2.w
  wReserved3.w
  uPtr.q   ; union pointer slot (e.g. LPWSTR)
  uPtr2.q  ; padding / second slot
EndStructure

; vt value for which uPtr is read as a wide-string pointer (see RefreshDeviceList).
#VT_LPWSTR = 31

; ---------- COM Interfaces (pointer parameters are untyped!) ----------
; Each Interface lists its methods in call-table order, starting with the three
; IUnknown methods; do not reorder or remove entries, even unused ones.

; Device enumerator: lists audio endpoints and resolves them by id.
Interface IMMDeviceEnumerator
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  EnumAudioEndpoints(dataFlow.l, dwStateMask.l, *ppDevices)
  GetDefaultAudioEndpoint(dataFlow.l, role.l, *ppDevice)
  GetDevice(pwstrDeviceId.i, *ppDevice)
  RegisterEndpointNotificationCallback(*client)
  UnregisterEndpointNotificationCallback(*client)
EndInterface

; Collection returned by EnumAudioEndpoints.
Interface IMMDeviceCollection
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  GetCount(*pc)
  Item(uIndex.l, *ppDevice)
EndInterface

; A single audio endpoint; Activate() is used to obtain its IAudioClient.
Interface IMMDevice
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  Activate(*iid.GUID, dwClsCtx.l, *pActivationParams, *ppInterface)
  OpenPropertyStore(stgmAccess.l, *ppProperties)
  GetId(*ppwstrId)
  GetState(*pdwState)
EndInterface

; Property store of an endpoint; used here to read the friendly name.
Interface IPropertyStore
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  GetCount(*cProps)
  GetAt(i.l, *pkey.PROPERTYKEY)
  GetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
  SetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
  Commit()
EndInterface

; Audio stream client: format negotiation, initialisation, start/stop.
Interface IAudioClient
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  Initialize(ShareMode.l, StreamFlags.l, hnsBufferDuration.q, hnsPeriodicity.q, *pFormat.WAVEFORMATEX, *pSessionGuid.GUID)
  GetBufferSize(*pNumBufferFrames)
  GetStreamLatency(*phnsLatency)
  GetCurrentPadding(*pNumPaddingFrames)
  IsFormatSupported(ShareMode.l, *pFormat.WAVEFORMATEX, *ppClosestMatch)
  GetMixFormat(*ppDeviceFormat)
  GetDevicePeriod(*phnsDefaultDevicePeriod, *phnsMinimumDevicePeriod)
  Start() : Stop() : Reset()
  SetEventHandle(hEvent.i)
  GetService(*iid.GUID, *ppv)
EndInterface

; Capture service obtained through IAudioClient::GetService; delivers the recorded packets.
Interface IAudioCaptureClient
  QueryInterface(*iid.GUID, *ppv)
  AddRef() : Release()
  GetBuffer(*ppData, *pNumFramesToRead, *pdwFlags, *pu64DevicePosition, *pu64QPCPosition)
  ReleaseBuffer(NumFramesRead.l)
  GetNextPacketSize(*pNumFramesInNextPacket)
EndInterface

; ---------- Imports ----------
Import "ole32.lib"
  CoInitializeEx(*pvReserved, coInit.l)
  CoCreateInstance(*rclsid.GUID, *pUnkOuter, dwClsContext.l, *riid.GUID, *ppv)
  CoTaskMemFree(*pv)
  CoUninitialize()
  PropVariantClear(*pv.PROPVARIANT)
EndImport

; ---------- GUIDs ----------
; Class / interface identifiers and the property key used by the WASAPI code.
; They are filled at program start by the SetGUID() calls below.
Global CLSID_MMDeviceEnumerator.GUID
Global IID_IMMDeviceEnumerator.GUID
Global IID_IMMDevice.GUID
Global IID_IMMDeviceCollection.GUID
Global IID_IPropertyStore.GUID
Global IID_IAudioClient.GUID
Global IID_IAudioCaptureClient.GUID
Global KSDATAFORMAT_SUBTYPE_PCM.GUID
Global KSDATAFORMAT_SUBTYPE_IEEE_FLOAT.GUID
Global PKEY_Device_FriendlyName.PROPERTYKEY

SetGUID(@CLSID_MMDeviceEnumerator,       $BCDE0395, $E52F, $467C, $8E, $3D, $C4, $57, $92, $91, $69, $2E)
SetGUID(@IID_IMMDeviceEnumerator,        $A95664D2, $9614, $4F35, $A7, $46, $DE, $8D, $B6, $36, $17, $E6)
; IID_IMMDevice = {D666063F-1587-4E43-81F1-B948E807363F}
; (the trailing bytes were wrong before; the value is not used by QueryInterface calls in this file yet)
SetGUID(@IID_IMMDevice,                  $D666063F, $1587, $4E43, $81, $F1, $B9, $48, $E8, $07, $36, $3F)
; IID_IMMDeviceCollection = {0BD7A1BE-7A1A-44DB-8397-CC5392387B5E}
; (same remark: corrected trailing bytes)
SetGUID(@IID_IMMDeviceCollection,        $0BD7A1BE, $7A1A, $44DB, $83, $97, $CC, $53, $92, $38, $7B, $5E)
SetGUID(@IID_IPropertyStore,             $886D8EEB, $8CF2, $4446, $8D, $02, $CD, $BA, $1D, $BD, $CF, $99)
SetGUID(@IID_IAudioClient,               $1CB9AD4C, $DBFA, $4C32, $B1, $78, $C2, $F5, $68, $A7, $03, $B2)
SetGUID(@IID_IAudioCaptureClient,        $C8ADBD64, $E71E, $48A0, $A4, $DE, $18, $5C, $39, $5C, $D3, $17)
SetGUID(@KSDATAFORMAT_SUBTYPE_PCM,       $00000001, $0000, $0010, $80, $00, $00, $AA, $00, $38, $9B, $71)
SetGUID(@KSDATAFORMAT_SUBTYPE_IEEE_FLOAT,$00000003, $0000, $0010, $80, $00, $00, $AA, $00, $38, $9B, $71)
; PKEY_Device_FriendlyName = {A45C254E-DF1C-4EFD-8020-67D146A850E0}, 14
SetGUID(@PKEY_Device_FriendlyName\fmtid, $A45C254E, $DF1C, $4EFD, $80, $20, $67, $D1, $46, $A8, $50, $E0)
PKEY_Device_FriendlyName\pid = 14

; ---------- Globals ----------
; Live COM objects of the current capture session (0 when not in use).
Global *gEnum.IMMDeviceEnumerator, *gDev.IMMDevice, *gClient.IAudioClient, *gCap.IAudioCaptureClient
; *gInitFmt is the format handed to IAudioClient::Initialize.
; g_initFmtOwner tells how it must be released: 1 = FreeMemory, 2 = CoTaskMemFree, 0 = nothing to free.
Global *gMixFmt.WAVEFORMATEX, *gInitFmt.WAVEFORMATEX, g_initFmtOwner.i = 0
Global g_running.i = #False
; Latest RMS level and the highest RMS seen since start (shown by DrawMeter).
Global g_rms.d = 0.0, g_peak.d = 0.0

; Device list
; Parallel arrays: index i holds the friendly name and the endpoint id of the
; i-th entry in the device combo box.
Global Dim g_devNames.s(0)
Global Dim g_devIds.s(0)
Global g_devCount.i = 0

; WAV output format, PCM16 (derived from the actual init format!)
Global g_outFmt.WAVEFORMATEX
; Segment buffer: memory block, bytes currently used, bytes allocated.
Global segMem, segSize.l = 0, segCap.l = 0

; VAD / Segmentation
Global g_vadThresh.d     = #VAD_THRESH_DEFAULT
Global g_vadMinMs.i      = #VAD_MIN_MS_DEFAULT
Global g_vadHangMs.i     = #VAD_HANG_MS_DEFAULT
Global g_maxSegMs.i      = #MAX_SEG_MS_DEFAULT
Global speechStart.q     = 0          ; ElapsedMilliseconds() value at speech start
Global lastVoice.q       = 0          ; ElapsedMilliseconds() value of the last voice activity
Global voicedBytes.q     = 0          ; PCM16 bytes counted towards the minimum speech duration
Global isSpeech.i        = #False

; API
Global g_apiEnabled.i    = #False
Global g_apiUrl.s        = #API_URL_DEFAULT
Global g_apiBearer.s     = #API_BEARER_DEFAULT

; ---------- Log ----------
; Appends a time-stamped line to the log list and selects it.
; Does nothing while the log gadget does not exist yet.
Procedure Log_(msg.s)
  Protected stamp.s

  If IsGadget(#Log) = 0
    ProcedureReturn
  EndIf

  stamp = FormatDate("[%hh:%ii:%ss] ", Date())
  AddGadgetItem(#Log, -1, stamp + msg)
  SetGadgetState(#Log, CountGadgetItems(#Log) - 1)   ; select the newest line
EndProcedure

; ---------- Helpers ----------
; Fills the global output format g_outFmt as 16-bit PCM, taking channel count
; and sample rate directly from the format that was actually initialised (*init).
Procedure SetupOutPCM16_FromInit(*init.WAVEFORMATEX)
  With g_outFmt
    \wFormatTag      = #WAVE_FORMAT_PCM
    \wBitsPerSample  = 16
    \cbSize          = 0
    \nChannels       = *init\nChannels
    \nSamplesPerSec  = *init\nSamplesPerSec
    \nBlockAlign     = \nChannels * 2                 ; 2 bytes per 16-bit sample
    \nAvgBytesPerSec = \nSamplesPerSec * \nBlockAlign
  EndWith
EndProcedure

; Discards the current segment and allocates a fresh, empty segment buffer.
; Capacity = 44 bytes + whole seconds derived from g_maxSegMs (plus one) at the
; output byte rate + 64 KiB of slack.
; If the allocation fails, segCap is set to 0 so that no data is ever copied
; into a missing buffer.
Procedure SegReset()
  If segMem
    FreeMemory(segMem)
    segMem = 0          ; never keep a pointer to freed memory
  EndIf

  segSize = 0
  segCap  = 44 + g_outFmt\nAvgBytesPerSec * Int(g_maxSegMs/1000.0 + 1) + 65536
  segMem  = AllocateMemory(segCap)

  If segMem = 0
    segCap = 0          ; allocation failed: behave like an empty, zero-sized buffer
  EndIf
EndProcedure

; Appends up to `bytes` bytes from *src to the segment buffer.
; Data that does not fit into the remaining capacity is dropped.
; Nothing is copied when the segment buffer or the source pointer is missing.
Procedure SegAppend(*src, bytes.l)
  Protected room.l = segCap - segSize

  If segMem = 0 Or *src = 0 Or room <= 0
    ProcedureReturn
  EndIf

  If bytes > room : bytes = room : EndIf

  If bytes > 0
    CopyMemory(*src, segMem + segSize, bytes)
    segSize + bytes
  EndIf
EndProcedure

; Writes dataLen bytes from *data as a WAV file with a 44-byte header.
; The "fmt " chunk is filled from *fmt (format tag, channels, rates, alignment, bit depth).
; Returns #True when the file was created and written, #False when it could not be created.
Procedure WriteWAV(file.s, *data, dataLen.l, *fmt.WAVEFORMATEX)
  Protected hOut = CreateFile(#PB_Any, file)

  If hOut = 0
    ProcedureReturn #False
  EndIf

  ; RIFF container: total size = payload + the 36 header bytes that follow this field
  WriteString(hOut, "RIFF", #PB_Ascii)
  WriteLong(hOut, dataLen + 36)
  WriteString(hOut, "WAVE", #PB_Ascii)

  ; "fmt " chunk, 16 bytes of format description
  WriteString(hOut, "fmt ", #PB_Ascii)
  WriteLong(hOut, 16)
  WriteWord(hOut, *fmt\wFormatTag)
  WriteWord(hOut, *fmt\nChannels)
  WriteLong(hOut, *fmt\nSamplesPerSec)
  WriteLong(hOut, *fmt\nAvgBytesPerSec)
  WriteWord(hOut, *fmt\nBlockAlign)
  WriteWord(hOut, *fmt\wBitsPerSample)

  ; "data" chunk with the sample payload
  WriteString(hOut, "data", #PB_Ascii)
  WriteLong(hOut, dataLen)
  WriteData(hOut, *data, dataLen)

  CloseFile(hOut)
  ProcedureReturn #True
EndProcedure

; Builds a file name in the temporary directory of the form
; seg_<Date() value>_<random 0..9999>.wav
Procedure.s TempWav()
  Protected stamp.s  = Str(Date())
  Protected suffix.s = Str(Random(9999))

  ProcedureReturn GetTemporaryDirectory() + "seg_" + stamp + "_" + suffix + ".wav"
EndProcedure

; Returns the RMS level of the first channel of an interleaved 16-bit buffer.
;   *buf   : start of the sample data
;   frames : number of frames to evaluate
;   chans  : channels per frame (only used to step from frame to frame)
; Returns 0 for an empty or missing buffer.
Procedure.d RMS16Mono(*buf, frames.l, chans.i)
  Protected i, v.d, acc.d, stride.l = chans * 2, *p = *buf

  If frames <= 0 Or *buf = 0
    ProcedureReturn 0
  EndIf

  For i = 0 To frames - 1
    v = PeekW(*p)       ; PeekW already yields a signed value (-32768..32767)
    acc + v * v
    *p + stride         ; skip the remaining channels of this frame
  Next

  ProcedureReturn Sqr(acc / frames)
EndProcedure

; Converts interleaved 32-bit float frames into tightly packed 16-bit samples.
;   *src     : first source frame
;   frames   : number of frames
;   chans    : channels converted per frame
;   srcBlock : distance in bytes between two source frames
;   *dst     : destination buffer (frames * chans * 2 bytes are written)
; Values are clamped to -1.0..1.0 before scaling by 32767.
Procedure Float32ToPCM16(*src, frames.l, chans.i, srcBlock.i, *dst)
  Protected frameNo, chanNo, sample.f
  Protected *rd, *wr = *dst

  For frameNo = 0 To frames - 1
    *rd = *src + frameNo * srcBlock
    For chanNo = 0 To chans - 1
      sample = PeekF(*rd + chanNo * 4)
      If sample > 1.0
        sample = 1.0
      ElseIf sample < -1.0
        sample = -1.0
      EndIf
      PokeW(*wr, Int(sample * 32767.0))
      *wr + 2
    Next
  Next
EndProcedure

; Returns #True when *fmt describes 32-bit IEEE float samples, either directly
; (WAVE_FORMAT_IEEE_FLOAT) or as WAVE_FORMAT_EXTENSIBLE with the IEEE-float
; sub-format; #False otherwise (including a null pointer).
Procedure.i IsIEEEFloat32(*fmt.WAVEFORMATEX)
  Protected *ext.WAVEFORMATEXTENSIBLE
  Protected tag.l

  If *fmt = 0 : ProcedureReturn #False : EndIf
  If *fmt\wBitsPerSample <> 32 : ProcedureReturn #False : EndIf

  ; Mask to 16 bits: if the tag field is read as a signed word, $FFFE would
  ; arrive as -2 and never compare equal to #WAVE_FORMAT_EXTENSIBLE.
  tag = *fmt\wFormatTag & $FFFF

  If tag = #WAVE_FORMAT_IEEE_FLOAT : ProcedureReturn #True : EndIf

  If tag = #WAVE_FORMAT_EXTENSIBLE And (*fmt\cbSize & $FFFF) >= 22
    *ext = *fmt
    ; CompareMemory() returns non-zero when both areas are identical.
    If CompareMemory(@*ext\SubFormat, @KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, SizeOf(GUID))
      ProcedureReturn #True
    EndIf
  EndIf

  ProcedureReturn #False
EndProcedure

; ---------- Device Enum ----------
; Interface IPropertyStore  ; re-declare for safety (PB sometimes needs it before Import)
;   QueryInterface(*iid.GUID, *ppv)
;   AddRef() : Release()
;   GetCount(*cProps.l)
;   GetAt(i.l, *pkey.PROPERTYKEY)
;   GetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
;   SetValue(*key.PROPERTYKEY, *pv.PROPVARIANT)
;   Commit()
; EndInterface

; NOTE(review): everything in this section repeats declarations that already
; appear earlier in the file (globals, device arrays, property key).
; Kept unchanged; it looks redundant - confirm before removing.
Global *gEnum.IMMDeviceEnumerator

; Same device list arrays / counter as declared in the globals section.
Global Dim g_devNames.s(0)
Global Dim g_devIds.s(0)
Global g_devCount.i = 0

; Empty import block: declares no functions.
Import "ole32.lib"
EndImport

; Same property key and the same values as set in the GUID section.
Global PKEY_Device_FriendlyName.PROPERTYKEY

SetGUID(@PKEY_Device_FriendlyName\fmtid, $A45C254E, $DF1C, $4EFD, $80, $20, $67, $D1, $46, $A8, $50, $E0)
PKEY_Device_FriendlyName\pid = 14

; Re-reads the active capture endpoints and fills the device combo box.
; g_devNames() / g_devIds() receive the friendly name and endpoint id of each
; entry, with the same index as the combo box item; g_devCount is the number of entries.
; Returns the number of devices found (0 on error or when none is active).
Procedure.i RefreshDeviceList()
  Protected *col.IMMDeviceCollection, *dev.IMMDevice, *store.IPropertyStore
  Protected pv.PROPVARIANT, count.l, i, name.s, id.s, *wz

  If *gEnum = 0
    If CoCreateInstance(@CLSID_MMDeviceEnumerator, #Null, #CLSCTX_INPROC_SERVER, @IID_IMMDeviceEnumerator, @*gEnum)
      *gEnum = 0
      Log_("MMDeviceEnumerator nicht verfügbar.") : ProcedureReturn 0
    EndIf
  EndIf

  If *gEnum\EnumAudioEndpoints(1, #DEVICE_STATE_ACTIVE, @*col) Or *col = 0  ; 1 = eCapture
    Log_("EnumAudioEndpoints fehlgeschlagen.") : ProcedureReturn 0
  EndIf

  If *col\GetCount(@count) <> #S_OK : count = 0 : EndIf
  If count < 0 : count = 0 : EndIf

  ClearGadgetItems(#CmbDev)
  g_devCount = count

  If count > 0
    ; Only resize when there is something to store: ReDim with -1 is not a valid size.
    ReDim g_devNames.s(count-1)
    ReDim g_devIds.s(count-1)
  EndIf

  For i = 0 To count-1
    ; Defaults; also used when a device cannot be queried, so that the combo
    ; box index always matches the array index.
    name = "Unbekannt"
    id   = "unknown-id"

    *dev = 0
    If *col\Item(i, @*dev) = #S_OK And *dev

      ; Read the friendly name
      *store = 0
      If *dev\OpenPropertyStore(#STGM_READ, @*store) = #S_OK And *store
        If *store\GetValue(@PKEY_Device_FriendlyName, @pv) = #S_OK
          If pv\vt = #VT_LPWSTR And pv\uPtr
            name = PeekS(pv\uPtr, -1, #PB_Unicode)
          EndIf
          PropVariantClear(@pv)   ; releases the string owned by the variant
        EndIf
        *store\Release()
      EndIf

      ; Device id: reset the pointer first so that a failed call can never
      ; leave the (already freed) string of the previous device behind.
      *wz = 0
      If *dev\GetId(@*wz) = #S_OK And *wz
        id = PeekS(*wz, -1, #PB_Unicode)
        CoTaskMemFree(*wz)
      EndIf

      *dev\Release()
    EndIf

    g_devNames(i) = name
    g_devIds(i)   = id
    AddGadgetItem(#CmbDev, -1, name)
  Next

  *col\Release()
  If count > 0 : SetGadgetState(#CmbDev, 0) : EndIf
  Log_("Geräteliste aktualisiert (" + Str(count) + " aktiv).")
  ProcedureReturn count
EndProcedure

; ---------- Start/Stop ----------
Global selectedId.s

; Requested capture buffer size in 100-ns units (= 1 second).
; The stream is drained from a UI timer (about every 33 ms, and not at all while
; a segment is written or uploaded). With a requested duration of 0 the audio
; engine only allocates its minimum buffer, which overflows between two polls;
; the lost packets make the recording sound choppy and "compressed".
#WASAPI_CAPTURE_BUFFER_HNS = 10000000

; Private helper for WASAPI_Start(): logs msg, releases everything the failed
; start attempt may have acquired, balances CoInitializeEx and returns #False.
Procedure.i WASAPI_AbortStart_(msg.s)
  Log_(msg)
  If *gCap    : *gCap\Release()    : *gCap    = 0 : EndIf
  If *gClient : *gClient\Release() : *gClient = 0 : EndIf
  If *gDev    : *gDev\Release()    : *gDev    = 0 : EndIf
  If *gEnum   : *gEnum\Release()   : *gEnum   = 0 : EndIf
  If *gInitFmt
    If g_initFmtOwner = 1 : FreeMemory(*gInitFmt) : ElseIf g_initFmtOwner = 2 : CoTaskMemFree(*gInitFmt) : EndIf
    *gInitFmt = 0 : g_initFmtOwner = 0
  EndIf
  g_running = #False
  CoUninitialize()
  ProcedureReturn #False
EndProcedure

; Opens the selected (or default) capture endpoint, negotiates a format,
; initialises the shared-mode stream and starts capturing.
; Format order: PCM16 mono at 48 kHz, then 44.1 kHz; otherwise the closest
; match / mix format, which is accepted only when it is 32-bit float.
; Returns #True on success. On failure a message is logged, all resources of
; this attempt are released and #False is returned.
Procedure.i WASAPI_Start()
  Protected hr.l, sel, *wsz
  Protected desired.WAVEFORMATEX, *closest.WAVEFORMATEX = 0

  CoInitializeEx(#Null, #COINIT_MULTITHREADED)

  If *gEnum = 0
    If CoCreateInstance(@CLSID_MMDeviceEnumerator, #Null, #CLSCTX_INPROC_SERVER, @IID_IMMDeviceEnumerator, @*gEnum)
      *gEnum = 0
      ProcedureReturn WASAPI_AbortStart_("MMDeviceEnumerator nicht verfügbar.")
    EndIf
  EndIf

  ; Resolve selected device
  *gDev = 0
  sel = GetGadgetState(#CmbDev)
  If sel < 0 Or sel >= g_devCount
    Log_("Kein Gerät ausgewählt – nutze Standard.")
    If *gEnum\GetDefaultAudioEndpoint(1, 1, @*gDev) Or *gDev = 0 ; eCapture,eMultimedia
      *gDev = 0
      ProcedureReturn WASAPI_AbortStart_("Kein Standard-Aufnahmegerät.")
    EndIf
  Else
    selectedId = g_devIds(sel)
    ; GetDevice() expects a zero-terminated wide string
    hr = -1
    *wsz = AllocateMemory(StringByteLength(selectedId, #PB_Unicode) + 2)
    If *wsz
      PokeS(*wsz, selectedId, -1, #PB_Unicode)
      hr = *gEnum\GetDevice(*wsz, @*gDev)
      FreeMemory(*wsz)
    EndIf
    If hr Or *gDev = 0
      Log_("GetDevice() fehlgeschlagen, nutze Standard.")
      *gDev = 0
      ; The fallback can fail as well; never continue with a null device.
      If *gEnum\GetDefaultAudioEndpoint(1, 1, @*gDev) Or *gDev = 0
        *gDev = 0
        ProcedureReturn WASAPI_AbortStart_("Kein Standard-Aufnahmegerät.")
      EndIf
    EndIf
  EndIf

  *gClient = 0
  If *gDev\Activate(@IID_IAudioClient, #CLSCTX_INPROC_SERVER, #Null, @*gClient) Or *gClient = 0
    *gClient = 0
    ProcedureReturn WASAPI_AbortStart_("IAudioClient Activate() fehlgeschlagen.")
  EndIf

  ; Drop a format left over from an earlier session
  If *gInitFmt
    If g_initFmtOwner=1 : FreeMemory(*gInitFmt) : ElseIf g_initFmtOwner=2 : CoTaskMemFree(*gInitFmt) : EndIf
    *gInitFmt = 0 : g_initFmtOwner = 0
  EndIf

  ; Try PCM16: 48k -> 44.1k (mono)
  desired\wFormatTag     = #WAVE_FORMAT_PCM
  desired\nChannels      = 1
  desired\wBitsPerSample = 16
  desired\nBlockAlign    = 2
  desired\cbSize         = 0

  desired\nSamplesPerSec = 48000 : desired\nAvgBytesPerSec = 96000
  hr = *gClient\IsFormatSupported(#AUDCLNT_SHAREMODE_SHARED, @desired, @*closest)
  If hr <> #S_OK
    ; The first query may have returned a closest match; free it before asking again.
    If *closest : CoTaskMemFree(*closest) : *closest = 0 : EndIf
    desired\nSamplesPerSec = 44100 : desired\nAvgBytesPerSec = 88200
    hr = *gClient\IsFormatSupported(#AUDCLNT_SHAREMODE_SHARED, @desired, @*closest)
  EndIf

  If hr = #S_OK
    If *closest : CoTaskMemFree(*closest) : *closest = 0 : EndIf   ; not needed on this path

    *gInitFmt = AllocateMemory(SizeOf(WAVEFORMATEX))
    If *gInitFmt = 0
      ProcedureReturn WASAPI_AbortStart_("Initialize(PCM16) fehlgeschlagen.")
    EndIf
    CopyMemory(@desired, *gInitFmt, SizeOf(WAVEFORMATEX))
    g_initFmtOwner = 1
    ; IMPORTANT: derive the WAV header from the actual init format
    SetupOutPCM16_FromInit(*gInitFmt)
    SegReset()
    If *gClient\Initialize(#AUDCLNT_SHAREMODE_SHARED, #AUDCLNT_STREAMFLAGS_NONE, #WASAPI_CAPTURE_BUFFER_HNS, 0, *gInitFmt, #Null)
      ProcedureReturn WASAPI_AbortStart_("Initialize(PCM16) fehlgeschlagen.")
    EndIf

    Log_("InitFmt: Tag=" + Str(*gInitFmt\wFormatTag) + " Bits=" + Str(*gInitFmt\wBitsPerSample) +
         " SR=" + Str(*gInitFmt\nSamplesPerSec) + " Ch=" + Str(*gInitFmt\nChannels) +
         " BlockAlign=" + Str(*gInitFmt\nBlockAlign))

    Log_("Input: PCM16/" + Str(*gInitFmt\nSamplesPerSec) + " Hz/" + Str(*gInitFmt\nChannels) + " ch (direkt)")

  Else
    ; Fallback: mix format / closest match -> accept Float32 only
    If *closest
      *gInitFmt = *closest : *closest = 0 : g_initFmtOwner = 2
    Else
      *gMixFmt = 0
      If *gClient\GetMixFormat(@*gMixFmt) Or *gMixFmt = 0
        *gMixFmt = 0
        ProcedureReturn WASAPI_AbortStart_("GetMixFormat() fehlgeschlagen.")
      EndIf
      ; Hand ownership over to *gInitFmt. Keeping both pointers set would make
      ; the cleanup code free the same block twice.
      *gInitFmt = *gMixFmt : *gMixFmt = 0 : g_initFmtOwner = 2
    EndIf

    If IsIEEEFloat32(*gInitFmt) = #False
      ProcedureReturn WASAPI_AbortStart_("Abbruch: Gerät liefert weder PCM16 noch Float32. Tag="+Str(*gInitFmt\wFormatTag)+" Bits="+Str(*gInitFmt\wBitsPerSample))
    EndIf

    ; IMPORTANT: derive the WAV header from the actual (Float32) init format
    SetupOutPCM16_FromInit(*gInitFmt)
    SegReset()
    If *gClient\Initialize(#AUDCLNT_SHAREMODE_SHARED, #AUDCLNT_STREAMFLAGS_NONE, #WASAPI_CAPTURE_BUFFER_HNS, 0, *gInitFmt, #Null)
      ProcedureReturn WASAPI_AbortStart_("Initialize(Fallback Float32) fehlgeschlagen.")
    EndIf
    Log_("Input: Float32/" + Str(*gInitFmt\nSamplesPerSec) + " Hz/" + Str(*gInitFmt\nChannels) + " ch → convert to PCM16")
  EndIf

  *gCap = 0
  If *gClient\GetService(@IID_IAudioCaptureClient, @*gCap) Or *gCap = 0
    *gCap = 0
    ProcedureReturn WASAPI_AbortStart_("GetService(IAudioCaptureClient) fehlgeschlagen.")
  EndIf

  If *gClient\Start()
    ProcedureReturn WASAPI_AbortStart_("Start() fehlgeschlagen.")
  EndIf

  g_running = #True : g_rms = 0 : g_peak = 0
  speechStart = 0 : lastVoice = ElapsedMilliseconds() : voicedBytes = 0 : isSpeech = #False
  ProcedureReturn #True
EndProcedure

; Stops a running capture and releases all session objects, the negotiated
; format and the COM initialisation. Safe to call when nothing is running.
Procedure WASAPI_Stop()
  If g_running
    If *gClient : *gClient\Stop() : EndIf
    g_running = #False
  EndIf
  If *gCap    : *gCap\Release()    : *gCap    = 0 : EndIf
  If *gClient : *gClient\Release() : *gClient = 0 : EndIf
  If *gDev    : *gDev\Release()    : *gDev    = 0 : EndIf
  If *gEnum   : *gEnum\Release()   : *gEnum   = 0 : EndIf

  ; WASAPI_Start may store the mix format in *gInitFmt as well, so both
  ; pointers can refer to the same block. Free it only once.
  If *gMixFmt = *gInitFmt : *gMixFmt = 0 : EndIf

  If *gInitFmt
    If g_initFmtOwner=1 : FreeMemory(*gInitFmt) : ElseIf g_initFmtOwner=2 : CoTaskMemFree(*gInitFmt) : EndIf
    *gInitFmt = 0 : g_initFmtOwner = 0
  EndIf
  If *gMixFmt : CoTaskMemFree(*gMixFmt) : *gMixFmt = 0 : EndIf

  CoUninitialize()
  Log_("Gestoppt.")
EndProcedure

; ---------- API Upload ----------
; Uploads a WAV file with curl (multipart field "file") to g_apiUrl and returns
; the response body that curl wrote to a temporary file.
; Returns "" when the API is disabled, the WAV file is missing/empty, curl
; could not be started or no response file was produced.
Procedure.s TranscribeViaAPI(wav.s)
  Protected q.s = Chr(34)
  Protected tmp.s, args.s, res.s = "", url.s, bearer.s
  Protected hFile

  If g_apiEnabled = #False : ProcedureReturn "" : EndIf
  If FileSize(wav) <= 0
    Log_("API: Datei fehlt -> " + wav) : ProcedureReturn ""
  EndIf

  ; Both values come from free-text fields: strip double quotes so they cannot
  ; terminate the quoted argument and smuggle extra options into the command line.
  url    = RemoveString(g_apiUrl, q)
  bearer = RemoveString(g_apiBearer, q)

  tmp  = GetTemporaryDirectory() + "asr_" + Str(Date()) + "_" + Str(Random(9999)) + ".json"
  args = "-s -S -X POST -H " + q + "Accept: application/json" + q + " "

  If bearer <> ""
    args + "-H " + q + "Authorization: Bearer " + bearer + q + " "
  EndIf

  args + "-F " + q + "file=@" + wav + ";type=audio/wav" + q + " "
  args + "-o " + q + tmp + q + " " + q + url + q

  If RunProgram(#CURL_PATH, args, "", #PB_Program_Wait | #PB_Program_Hide)
    ; ReadFile(#PB_Any, ...) returns the file number; all later calls must use it.
    hFile = ReadFile(#PB_Any, tmp)
    If hFile
      While Eof(hFile) = 0 : res + ReadString(hFile, #PB_File_IgnoreEOL) : Wend
      CloseFile(hFile)
    EndIf
  EndIf

  ; Remove the response file on every path, not only after a successful read.
  If FileSize(tmp) >= 0 : DeleteFile(tmp) : EndIf

  ProcedureReturn res
EndProcedure

; ---------- Poll + VAD ----------
; Drains all pending capture packets, appends them (as PCM16) to the segment
; buffer, updates the level meter values and runs the voice activity logic:
;   - a packet counts as "voiced" when it is not flagged silent and its RMS
;     reaches g_vadThresh; only voiced packets advance voicedBytes / lastVoice
;   - speech starts once enough voiced audio (g_vadMinMs) has been collected
;   - a segment ends after g_vadHangMs without voice, after g_maxSegMs, or
;     when the segment buffer is full; it is then written to a WAV file and,
;     if enabled, sent to the transcription API
;   - while no speech is active and no voice was heard for g_vadHangMs, the
;     buffered audio is discarded so that old audio cannot fill the buffer
Procedure Poll()
  Static *tmp            ; scratch buffer for converted / silent packets, reused between calls
  Protected nxt.l, *pData, frames.l, flags.l, devPos.q, qpc.q
  Protected outBytes.l, minBytes.l, now.q
  Protected silent.i, voiced.i
  Protected wav.s, res.s

  If g_running = #False Or *gCap = 0 Or *gInitFmt = 0 : ProcedureReturn : EndIf

  While *gCap\GetNextPacketSize(@nxt) = 0 And nxt > 0
    If *gCap\GetBuffer(@*pData, @frames, @flags, @devPos, @qpc) <> 0 : Break : EndIf

    outBytes = frames * (g_outFmt\nBlockAlign)
    silent   = Bool(flags & #AUDCLNT_BUFFERFLAGS_SILENT)

    If frames > 0
      If *tmp = 0 Or MemorySize(*tmp) < outBytes
        If *tmp : FreeMemory(*tmp) : EndIf
        *tmp = AllocateMemory(outBytes)
      EndIf
      If *tmp = 0
        *gCap\ReleaseBuffer(frames)
        Break
      EndIf

      If silent
        ; Keep the time line intact: store real silence instead of skipping
        ; the packet, otherwise pauses are cut out of the recording.
        FillMemory(*tmp, outBytes, 0)
        SegAppend(*tmp, outBytes)
        g_rms * 0.9
      ElseIf *gInitFmt\wFormatTag = #WAVE_FORMAT_PCM And *gInitFmt\wBitsPerSample = 16
        SegAppend(*pData, frames * (*gInitFmt\nBlockAlign))
        g_rms = RMS16Mono(*pData, frames, g_outFmt\nChannels)
      Else
        Float32ToPCM16(*pData, frames, *gInitFmt\nChannels, *gInitFmt\nBlockAlign, *tmp)
        SegAppend(*tmp, outBytes)
        g_rms = RMS16Mono(*tmp, frames, g_outFmt\nChannels)
      EndIf

      If g_rms > g_peak : g_peak = g_rms : EndIf
    EndIf

    ; The samples are copied by now; hand the packet back before doing any
    ; slow work (file write, upload) so the device buffer can keep filling.
    *gCap\ReleaseBuffer(frames)
    If frames <= 0 : Continue : EndIf

    ; --- VAD logic ---
    now      = ElapsedMilliseconds()
    minBytes = g_outFmt\nAvgBytesPerSec * (g_vadMinMs/1000.0)
    voiced   = Bool(silent = #False And g_rms >= g_vadThresh)

    If voiced
      voicedBytes + outBytes
      lastVoice = now
    EndIf

    If isSpeech = #False
      If voiced And voicedBytes >= minBytes
        isSpeech = #True : speechStart = now
        Log_(">> Speech START")
      ElseIf (now - lastVoice) > g_vadHangMs Or (segCap > 0 And segSize >= segCap)
        ; Idle: forget buffered audio (cheap reset, the memory block is kept)
        segSize = 0 : voicedBytes = 0
      EndIf
    EndIf

    If isSpeech
      If (now - lastVoice) > g_vadHangMs Or (now - speechStart) > g_maxSegMs Or (segCap > 0 And segSize >= segCap)
        wav = TempWav()
        If WriteWAV(wav, segMem, segSize, @g_outFmt)
          Log_("Segment gespeichert: " + wav + " (" + Str(segSize) + " B)")
          If g_apiEnabled
            res = TranscribeViaAPI(wav)
            If res <> "" : Log_("API: " + Left(res, 200)) : Else : Log_("API: keine Antwort/Fehler") : EndIf
          EndIf
        EndIf
        isSpeech = #False : voicedBytes = 0 : SegReset()
        Log_("<< Speech END")
      EndIf
    EndIf
  Wend
EndProcedure

; ---------- UI ----------
; Redraws the level meter on the canvas: background, meter track, current RMS
; bar, peak marker and a text read-out. Levels are scaled so that 4000 fills the bar.
Procedure DrawMeter()
  Protected canvasW = GadgetWidth(#Canvas), canvasH = GadgetHeight(#Canvas)
  Protected margin = 10
  Protected trackW, trackH, baseY, fillH
  Protected levelFrac.d, peakFrac.d

  If StartDrawing(CanvasOutput(#Canvas)) = 0
    ProcedureReturn
  EndIf

  Box(0, 0, canvasW, canvasH, RGB(25,25,28))

  trackW = canvasW - 2*margin
  trackH = canvasH - 2*margin
  baseY  = canvasH - margin

  ; Normalise to 0..1
  levelFrac = g_rms / 4000.0
  If levelFrac > 1 : levelFrac = 1 : EndIf
  peakFrac = g_peak / 4000.0
  If peakFrac > 1 : peakFrac = 1 : EndIf

  Box(margin, margin, trackW, trackH, RGB(45,45,55))

  fillH = Int(trackH * levelFrac)
  Box(margin, baseY - fillH, trackW, fillH, RGB(70,190,110))

  Line(margin, baseY - Int(trackH * peakFrac), trackW, 0, RGB(220,220,90))

  DrawingMode(#PB_2DDrawing_Transparent)
  DrawText(margin, 4, "RMS: " + StrD(g_rms,1) + "  Peak: " + StrD(g_peak,1), RGB(220,220,230))

  StopDrawing()
EndProcedure

; Creates the main window with all gadgets, fills the device list once and
; starts the UI timer (#TimerUI, 33 ms) that drives meter drawing and polling.
Procedure BuildUI()
  OpenWindow(#Win, 200, 120, 760, 520, "WASAPI Recorder — SR-Fix", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)

  ; Device selection row
  TextGadget(#PB_Any, 10, 12, 60, 22, "Gerät:")
  ComboBoxGadget(#CmbDev, 70, 10, 360, 24)
  ButtonGadget(#BtnRefresh, 440, 10, 80, 24, "Refresh")
  ButtonGadget(#BtnStart,  530, 10, 80, 24, "Start")
  ButtonGadget(#BtnStop,   620, 10, 80, 24, "Stop") : DisableGadget(#BtnStop, #True) ; enabled only while capturing

  ; Level meter
  CanvasGadget(#Canvas, 10, 50, 730, 220)

  ; VAD Controls
  ; Initial values come from the g_vad* / g_maxSegMs globals.
  TextGadget(#PB_Any, 10, 280, 120, 20, "VAD-Schwelle")
  TrackBarGadget(#SliderThresh, 10, 300, 220, 24, 0, 4000) : SetGadgetState(#SliderThresh, Int(g_vadThresh))
  TextGadget(#PB_Any, 240, 280, 160, 20, "Min. Sprachdauer (ms)")
  SpinGadget(#SpinMin, 240, 300, 120, 24, 50, 5000, #PB_Spin_Numeric) : SetGadgetState(#SpinMin, g_vadMinMs)
  TextGadget(#PB_Any, 370, 280, 120, 20, "Hangover (ms)")
  SpinGadget(#SpinHang, 370, 300, 120, 24, 100, 5000, #PB_Spin_Numeric) : SetGadgetState(#SpinHang, g_vadHangMs)
  TextGadget(#PB_Any, 500, 280, 130, 20, "Max Segment (ms)")
  SpinGadget(#SpinMaxSeg, 500, 300, 120, 24, 1000, 60000, #PB_Spin_Numeric) : SetGadgetState(#SpinMaxSeg, g_maxSegMs)

  ; API Controls
  CheckBoxGadget(#ChkAPI, 10, 340, 220, 22, "Transkription via API aktiv")
  SetGadgetState(#ChkAPI, g_apiEnabled)
  TextGadget(#PB_Any, 10, 370, 80, 20, "API URL")
  StringGadget(#InpURL, 90, 368, 300, 24, #API_URL_DEFAULT)
  TextGadget(#PB_Any, 400, 370, 110, 20, "Bearer Token")
  StringGadget(#InpBearer, 510, 368, 230, 24, #API_BEARER_DEFAULT)

  ; Log list; must exist before the first Log_() call below has any effect
  ListViewGadget(#Log, 10, 400, 730, 110)

  Log_("Bereit. WAV-Header folgt exakt dem initiierten Format.")
  RefreshDeviceList()
  AddWindowTimer(#Win, #TimerUI, 33)
EndProcedure

; ---------- Main ----------
; Program entry: build the window, then run the event loop until it is closed.
BuildUI()
Define ev, gad
Repeat
  ev = WaitWindowEvent(10)
  Select ev
    Case #PB_Event_Timer
      ; UI timer tick: redraw the meter and, while capturing, drain the audio packets
      If EventTimer() = #TimerUI
        DrawMeter()
        If g_running : Poll() : EndIf
      EndIf

    Case #PB_Event_Gadget
      gad = EventGadget()
      Select gad
        Case #BtnRefresh
          RefreshDeviceList()

        Case #BtnStart
          ; Take over all current UI settings before the capture is started
          g_apiEnabled = GetGadgetState(#ChkAPI)
          g_apiUrl     = GetGadgetText(#InpURL)
          g_apiBearer  = GetGadgetText(#InpBearer)
          g_vadThresh  = GetGadgetState(#SliderThresh)
          g_vadMinMs   = GetGadgetState(#SpinMin)
          g_vadHangMs  = GetGadgetState(#SpinHang)
          g_maxSegMs   = GetGadgetState(#SpinMaxSeg)
          If WASAPI_Start()
            DisableGadget(#BtnStart, #True) : DisableGadget(#BtnStop, #False)
          EndIf

        Case #BtnStop
          WASAPI_Stop()
          DisableGadget(#BtnStart, #False) : DisableGadget(#BtnStop, #True)

        ; Settings changed while running are applied immediately
        Case #SliderThresh : g_vadThresh = GetGadgetState(#SliderThresh)
        Case #SpinMin      : g_vadMinMs  = GetGadgetState(#SpinMin)
        Case #SpinHang     : g_vadHangMs = GetGadgetState(#SpinHang)
        Case #SpinMaxSeg   : g_maxSegMs  = GetGadgetState(#SpinMaxSeg)
        Case #ChkAPI       : g_apiEnabled= GetGadgetState(#ChkAPI)
        Case #InpURL       : g_apiUrl    = GetGadgetText(#InpURL)
        Case #InpBearer    : g_apiBearer = GetGadgetText(#InpBearer)
      EndSelect

    Case #PB_Event_CloseWindow
      Break
  EndSelect
ForEver

; Always release the capture session before exiting
WASAPI_Stop()
End


Re: Purebasic Alexa with WASAPI

Posted: Fri Sep 12, 2025 3:26 am
by idle
I can't get it to record on my win 11 system.
I can get it to record with miniaudio via the default recording device mic and also loopback with andkMK's audioclient
If it's coming out choppy-sounding, maybe you're not writing the wave file properly. I had a similar problem in the mpg module initially.

This is how I'm writing out to wav in the mpg module:

Code: Select all

 #REPLAY_RATE	=	44100
 #REPLAY_DEPTH	=	16
 #REPLAY_SAMPLELEN	= (#REPLAY_DEPTH/8)

; Decodes the complete audio track of *mpg into an in-memory 16-bit PCM WAV
; image and creates the sound object(s) from it.
;   #MPG_SOUND      : stereo, loaded with CatchSound()
;   other types     : left/right averaged to mono; #MPG_SOUND3D is written to a
;                     temporary file and loaded `instances`+1 times with LoadSound3D()
; The header fields (block align, byte rate, chunk sizes) are derived from the
; channel count and from the number of bytes actually decoded.
; Always returns #True (unchanged behaviour; NOTE(review): an allocation
; failure is not reported to the caller).
Procedure.i MPG_CreateSound(*mpg.mpg,AudioStream,instances=0)

  Protected.i Result, HeaderSize, DataSize, Channels, BlockAlign
  Protected *WAVBuffer, *RiffPtr.RIFFStructure, *fmtPtr.fmtStructure, *dataPtr.dataStructure, *audioPtr.word
  Protected *frame.plm_samples,file$,fn,sample,pos,a
  Protected value.f

  If *mpg\soundtype = #MPG_SOUND  ; stereo
    Channels = 2
  Else
    Channels = 1
  EndIf
  BlockAlign = Channels * #REPLAY_SAMPLELEN

  ; Estimated payload size: track duration plus one second of head room
  DataSize = (*mpg\duration+1) * #REPLAY_RATE * BlockAlign

  plm_Set_Video_enabled(*mpg\plm,0)

  HeaderSize = SizeOf(RIFFStructure)
  HeaderSize + SizeOf(fmtStructure)
  HeaderSize + SizeOf(dataStructure)

  *WAVBuffer = AllocateMemory(HeaderSize + DataSize, #PB_Memory_NoClear)
  If *WAVBuffer

    *RiffPtr = *WAVBuffer
    PokeS(@*RiffPtr\Riff, "RIFF", 4, #PB_Ascii|#PB_String_NoZero)
    *RiffPtr\Length = HeaderSize + DataSize - 8   ; corrected after decoding
    PokeS(@*RiffPtr\Wave, "WAVE", 4, #PB_Ascii|#PB_String_NoZero)

    *fmtPtr = *WAVBuffer + SizeOf(RIFFStructure)
    PokeS(@*fmtPtr\fmt, "fmt ", 4, #PB_Ascii|#PB_String_NoZero)
    *fmtPtr\Length = SizeOf(fmtStructure) - 8
    *fmtPtr\Format = 1
    *fmtPtr\Channels = Channels
    *fmtPtr\SampleRate = #REPLAY_RATE
    *fmtPtr\BitsPerSample = #REPLAY_DEPTH
    ; Both values must match the channel count, otherwise players compute a
    ; wrong frame size / duration.
    *fmtPtr\BlockAlign = BlockAlign
    *fmtPtr\BytesPerSecond = #REPLAY_RATE * BlockAlign

    *dataPtr = *WAVBuffer + SizeOf(RIFFStructure) + SizeOf(fmtStructure)
    PokeS(@*dataPtr\Signature, "data", 4, #PB_Ascii|#PB_String_NoZero)
    *dataPtr\Length = DataSize
    *audioPtr = *WAVBuffer + HeaderSize

    Repeat
      *frame = plm_decode_audio(*mpg\plm)
      If *frame
        If Channels = 2 ; stereo
          For sample = 0 To (*frame\count*2)-1
            ; Stop instead of writing past the buffer if more audio is decoded than estimated
            If pos + #REPLAY_SAMPLELEN > DataSize : *frame = 0 : Break : EndIf
            value = *frame\interleaved[sample]
            If value > 1.0 : value = 1.0 : ElseIf value < -1.0 : value = -1.0 : EndIf
            *audioPtr\w = value * (32767)
            *audioPtr+2
            pos + 2
          Next
        Else
          For sample = 0 To (*frame\count*2)-1 Step 2  ;mono
            If pos + #REPLAY_SAMPLELEN > DataSize : *frame = 0 : Break : EndIf
            value = (*frame\interleaved[sample] + *frame\interleaved[sample+1]) * 0.5
            If value > 1.0 : value = 1.0 : ElseIf value < -1.0 : value = -1.0 : EndIf
            *audioPtr\w = value * (32767)
            *audioPtr+2
            pos + 2
          Next
        EndIf
      EndIf
    Until *frame = 0

    ; Use the real payload size in both size fields
    *dataPtr\Length = pos
    *RiffPtr\Length = HeaderSize + pos - 8

    If *mpg\soundtype = #MPG_SOUND3D
      Add3DArchive(GetTemporaryDirectory(), #PB_3DArchive_FileSystem)
      file$ = GetTemporaryDirectory()+"tempmpg.wav"
      fn = CreateFile(#PB_Any,file$)
      If fn
        WriteData(fn,*WAVBuffer,pos+HeaderSize)
        CloseFile(fn)
      EndIf
      If instances > #REPLAY_MAXNBSOUND : instances = #REPLAY_MAXNBSOUND : EndIf
      For a = 0 To instances
        *mpg\sound[a] = LoadSound3D(#PB_Any,"tempmpg.wav")
      Next
      DeleteFile(file$)
    Else
      *mpg\sound = CatchSound(-1, *WAVBuffer)
    EndIf
    FreeMemory(*WAVBuffer)
  EndIf

  plm_Set_Video_enabled(*mpg\plm,1)
  plm_rewind(*mpg\plm)

  ProcedureReturn #True

EndProcedure
 
I will try to do a recording example to wav from miniaudio so it can be cross platform and hopefully it will work once 6.30b2 is out though I'm not sure if Fred has the device info in the PB implementation but I'm sure he can recompile it.

Re: Purebasic Alexa with WASAPI

Posted: Fri Sep 12, 2025 7:49 am
by dige
Thanks, Idle, that sounds promising. Thank you very much in advance. 👍