The difficulty seems to be in accessing the speaker output, not in the JS Speech SDK code itself. If I can somehow wrangle the speaker output into a MediaStream, I can then call AudioConfig.fromStreamInput(myMediaStream) to set up the input for audio transcription.
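
In other words, what I'm aiming for is roughly the following (a sketch; speechConfig is assumed to be set up elsewhere):

import { AudioConfig, SpeechRecognizer } from 'microsoft-cognitiveservices-speech-sdk';

// myMediaStream would be the speaker-output capture, once I manage to obtain it
const audioConfig = AudioConfig.fromStreamInput(myMediaStream);
const recognizer = new SpeechRecognizer(speechConfig, audioConfig);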

I have found something related: How to get a media stream of the speaker's output to transfer it over the network or record it?

The accepted solution fails on speaker.addTrack(stream.getAudioTracks()[0].clone()); with an error along the lines of "Cannot read properties of undefined (reading 'clone')".
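
From the error it looks like getAudioTracks() comes back empty, i.e. no audio track was actually shared. A guard along these lines (same stream and speaker variables as in the code below) would make that explicit:

const [audioTrack] = stream.getAudioTracks();
if (!audioTrack) {
    // No audio track was shared, e.g. "Share audio" wasn't ticked in the picker
    throw new Error('getDisplayMedia returned no audio track');
}
speaker.addTrack(audioTrack.clone());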

Below is my implementation

const getAudioStream = async () => {
    const speaker = new MediaStream();
    // Use the legacy navigator.getDisplayMedia if present, otherwise the standard API
    const getDisplayMedia =
        navigator.getDisplayMedia?.bind(navigator) ??
        navigator.mediaDevices.getDisplayMedia.bind(navigator.mediaDevices);
    try {
        // Await the capture so the returned stream actually contains the track
        const stream = await getDisplayMedia({ video: true, audio: true });
        // This is the line that throws "Cannot read properties of undefined (reading 'clone')"
        speaker.addTrack(stream.getAudioTracks()[0].clone());
        // Stop and remove the video track to enhance performance
        stream.getVideoTracks()[0].stop();
        stream.removeTrack(stream.getVideoTracks()[0]);
    } catch (error) {
        console.log(error);
    }
    return speaker;
};

useEffect(() => {
    const startStreaming = async () => {
        const speaker = await getAudioStream();
        const audioConfig = AudioConfig.fromStreamInput(speaker);
        speechrecognizer.audioConfig = audioConfig;
    };
    startStreaming();
}, []);
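
From what I can tell, the SDK expects the AudioConfig at construction time rather than as an assignable property, so presumably the effect should look more like this (again assuming a speechConfig):

useEffect(() => {
    const startStreaming = async () => {
        const speaker = await getAudioStream();
        const audioConfig = AudioConfig.fromStreamInput(speaker);
        const recognizer = new SpeechRecognizer(speechConfig, audioConfig);
        recognizer.recognized = (_sender, event) => console.log(event.result.text);
        recognizer.startContinuousRecognitionAsync();
    };
    startStreaming();
}, []);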

In past projects I have implemented a hook that records the streams from both the speaker and the mic:

import { useEffect, useRef, useState } from 'react';

export const useAudioRecorder = () => {
    const [isRecording, setIsRecording] = useState(false);
    const [audioBlob, setAudioBlob] = useState(null);
    const mediaRecorder = useRef(null);
    const audioStream = useRef(null);

    const startRecording = async () => {
        try {
            const microphoneStream = await navigator.mediaDevices.getUserMedia({ audio: true });
            // Note: this also resolves to a microphone capture, which is the problem described below
            const speakerStream = await navigator.mediaDevices.getUserMedia({ audio: { echoCancellation: false } });

            // Mix both streams into a single destination via the Web Audio API
            const audioContext = new AudioContext();
            const microphoneSource = audioContext.createMediaStreamSource(microphoneStream);
            const speakerSource = audioContext.createMediaStreamSource(speakerStream);
            const mixedOutput = audioContext.createMediaStreamDestination();

            microphoneSource.connect(mixedOutput);
            speakerSource.connect(mixedOutput);

            audioStream.current = mixedOutput.stream;
            mediaRecorder.current = new MediaRecorder(audioStream.current);

            mediaRecorder.current.ondataavailable = handleDataAvailable;
            mediaRecorder.current.start();

            setIsRecording(true);
        } catch (error) {
            console.error('Error starting recording:', error);
        }
    };

    const stopRecording = () => {
        if (mediaRecorder.current && isRecording) {
            mediaRecorder.current.stop();
            setIsRecording(false);
        }
    };

    const handleDataAvailable = (event) => {
        const audioBlob = new Blob([event.data], { type: 'audio/wav' });
        setAudioBlob(audioBlob);
    };

    // Clean up the streams when the component unmounts
    useEffect(() => () => {
        if (audioStream.current) {
            audioStream.current.getTracks().forEach((track) => track.stop());
        }
    }, []);

    return { isRecording, audioBlob, startRecording, stopRecording };
};
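
It's consumed like any other hook (RecorderButton is just an illustrative name):

function RecorderButton() {
    const { isRecording, startRecording, stopRecording } = useAudioRecorder();
    return (
        <button onClick={isRecording ? stopRecording : startRecording}>
            {isRecording ? 'Stop' : 'Record'}
        </button>
    );
}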

I tried to use a similar approach in my getAudioStream function, but it only captures the mic's audio, not the speaker's.

Answer by guest271314:

There is no specified way to capture system audio exclusively using Web APIs.

Firefox exposes monitor devices for getUserMedia(); Chromium-based browsers don't.
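
For example, on Firefox, once device labels are populated by an initial permission grant, the monitor devices can be picked out of enumerateDevices(). A sketch ("Monitor of" is the label prefix PulseAudio gives monitor sources):

// Prompt once so device labels are populated
await navigator.mediaDevices.getUserMedia({ audio: true });
const devices = await navigator.mediaDevices.enumerateDevices();
const monitor = devices.find(
  ({ kind, label }) => kind === 'audioinput' && label.startsWith('Monitor of')
);
var stream = await navigator.mediaDevices.getUserMedia({
  audio: { deviceId: { exact: monitor.deviceId } }
});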

After 5 years, Chromium-based browsers can now capture system audio using getDisplayMedia() by passing the command-line switch --enable-features=PulseaudioLoopbackForScreenShare; see [Linux] System loopback audio capture. For more references on capturing system audio in Chromium-based browsers, see captureSystemAudio References.

I found that the volume of the captured device decreases to 8%; passing --disable-features=WebRtcAllowInputVolumeAdjustment avoids that.

var stream = await navigator.mediaDevices.getDisplayMedia({
  // We're not going to be using the video track
  video: {
    width: 0,
    height: 0,
    frameRate: 0,
    displaySurface: "monitor",
  },
  audio: {
    suppressLocalAudioPlayback: false,
    // Speech synthesis audio output is generally 1 channel
    channelCount: 2,
    noiseSuppression: false,
    autoGainControl: false,
    echoCancellation: false,
  },
  systemAudio: "include",
  // Doesn't work for Tab capture
  // preferCurrentTab: true
});

var [audioTrack] = stream.getAudioTracks();

stream.getVideoTracks()[0].stop();
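
That audioTrack can then be wrapped in a fresh MediaStream and handed to the Speech SDK, per the question (AudioConfig.fromStreamInput() accepts a MediaStream in the browser SDK):

const speakerStream = new MediaStream([audioTrack]);
const audioConfig = AudioConfig.fromStreamInput(speakerStream);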

To rely on your own code, you can remap an output device to an input device. Here I've remapped the default monitor ("What-U-Hear", i.e., what is playing through the speakers or headphones) to be the default input device:

pactl load-module module-remap-source \
  master=@DEFAULT_MONITOR@ \
  source_name=speakers source_properties=device.description=Speakers \
&& pactl set-default-source speakers

Thereafter the default input device is the speakers' monitor, and I can use getUserMedia():

var stream = await navigator.mediaDevices.getUserMedia({
  audio: {
    channelCount: 2,
    sampleRate: 44100,
    noiseSuppression: false,
    autoGainControl: false,
    echoCancellation: false,
  }
});
var [audioTrack] = stream.getAudioTracks();
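
A quick sanity check that the remapped source was actually captured:

// Expect the label to match the device.description set above ("Speakers")
console.log(audioTrack.label, audioTrack.getSettings().deviceId);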

You'll have to figure out how to remap output devices to input devices on your OS. Some work has been done on this; see Screenshare-with-audio-on-Discord-with-Linux.

You can take that approach a step further and create sink inputs to capture only specific devices, e.g., the output from Speech Dispatcher; see Chromium does not support capture of monitor devices by default #17:

pactl load-module module-combine-sink \
  sink_name=Web_Speech_Sink slaves=$(pacmd list-sinks | grep -A1 "* index" | grep -oP "<\K[^ >]+") \
  sink_properties=device.description="Web_Speech_Stream" \
  format=s16le \
  channels=1 \
  rate=22050
pactl load-module module-remap-source \
  master=Web_Speech_Sink.monitor \
  source_name=Web_Speech_Monitor \
  source_properties=device.description=Web_Speech_Output
pactl move-sink-input $(pacmd list-sink-inputs | tac | perl -E'undef$/;$_=<>;/speech-dispatcher-espeak-ng.*?index: (\d+)\n/s;say $1') Web_Speech_Sink

then do something like the following (wrapped in a helper function here so the early returns have somewhere to go):

async function getWebSpeechOutput() {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const [track] = stream.getAudioTracks();
  const devices = await navigator.mediaDevices.enumerateDevices();
  const device = devices.find(({ label }) => label === 'Web_Speech_Output');
  if (track.getSettings().deviceId === device.deviceId) {
    // Already capturing the remapped monitor
    return stream;
  }
  // Otherwise switch to the remapped device explicitly
  track.stop();
  console.log(devices, device);
  return navigator.mediaDevices.getUserMedia({
    audio: { deviceId: { exact: device.deviceId } }
  });
}