test code custom speech recognition

2023-06-29 10:10:00 +09:00
parent bc300eae4c
commit 868c84a9eb
25 changed files with 135723 additions and 0 deletions
--- a/AudioRecorder.py
+++ b/AudioRecorder.py
@@ -0,0 +1,49 @@
 import custom_speech_recognition as sr
 import pyaudiowpatch as pyaudio
 from datetime import datetime
 # Passed as ``phrase_time_limit`` to ``listen_in_background`` by ``BaseRecorder.record_into_queue``
 # (presumably seconds per recorded phrase, as in upstream SpeechRecognition - confirm for this fork).
 RECORD_TIMEOUT = 3
 # Value assigned to the recognizer's ``energy_threshold`` in ``BaseRecorder.__init__``.
 ENERGY_THRESHOLD = 1000
 # Value assigned to the recognizer's ``dynamic_energy_threshold`` in ``BaseRecorder.__init__``.
 DYNAMIC_ENERGY_THRESHOLD = False
 class BaseRecorder:
    """
    Couples an ``sr.Recognizer`` with one audio source and pushes recorded audio into a queue.
    ``source`` is the audio source object to record from; it is used as a context manager by
    ``adjust_for_noise`` and handed to the recognizer by ``record_into_queue``.
    Raises ``ValueError`` if ``source`` is ``None``.
    """
    def __init__(self, source):
        # Validate first so that no recognizer is created for an unusable source.
        if source is None:
            raise ValueError("audio source can't be None")
        self.source = source
        self.recorder = sr.Recognizer()
        self.recorder.energy_threshold = ENERGY_THRESHOLD
        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
    def adjust_for_noise(self):
        """Open the source and let the recognizer calibrate itself against ambient noise."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)
    def record_into_queue(self, audio_queue):
        """
        Start background listening; every captured phrase is put on ``audio_queue`` as a
        ``(raw_bytes, datetime_captured)`` tuple.
        Returns whatever ``listen_in_background`` returns instead of discarding it (in upstream
        SpeechRecognition that is a function which stops the background listener - confirm for
        this fork). Callers that ignore the return value behave exactly as before.
        """
        def record_callback(_, audio: "sr.AudioData") -> None:
            # Timestamp at delivery time; the consumer uses it to detect pauses between phrases.
            audio_queue.put((audio.get_raw_data(), datetime.now()))
        return self.recorder.listen_in_background(
            self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT
        )
 class SelectedMicRecorder(BaseRecorder):
    """
    Recorder for the input device described by ``device``.
    ``device`` is a mapping providing ``index`` and ``defaultSampleRate``. The recorder is
    calibrated against ambient noise as soon as it is constructed.
    """
    def __init__(self, device):
        mic_options = {
            "device_index": device['index'],
            "sample_rate": int(device["defaultSampleRate"]),
        }
        super().__init__(source=sr.Microphone(**mic_options))
        self.adjust_for_noise()
 class SelectedSpeakerRecorder(BaseRecorder):
    """
    Recorder for the speaker (output) device described by ``device``.
    ``device`` is a mapping providing ``index``, ``defaultSampleRate`` and ``maxInputChannels``.
    The source is an ``sr.Microphone`` created with ``speaker=True``; the recorder is calibrated
    against ambient noise as soon as it is constructed.
    """
    def __init__(self, device):
        speaker_options = {
            "speaker": True,
            "device_index": device["index"],
            "sample_rate": int(device["defaultSampleRate"]),
            # chunk size is set to the byte size of one 16-bit sample, as reported by pyaudio
            "chunk_size": pyaudio.get_sample_size(pyaudio.paInt16),
            "channels": device["maxInputChannels"],
        }
        super().__init__(source=sr.Microphone(**speaker_options))
        self.adjust_for_noise()
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -0,0 +1,79 @@
 import logging
 import threading
 from datetime import timedelta
 from heapq import merge
 import custom_speech_recognition as sr
 # A gap longer than this many seconds between two audio chunks starts a new phrase.
 PHRASE_TIMEOUT = 3.05
 # Upper bound on the number of phrases kept in ``AudioTranscriber.transcript_data``.
 MAX_PHRASES = 10
 _logger = logging.getLogger(__name__)
 class AudioTranscriber:
    """
    Turns queued raw audio chunks into text using ``recognize_google``.
    ``source`` supplies the audio format (``SAMPLE_RATE``, ``SAMPLE_WIDTH``, ``channels``) and
    ``language`` is passed through to the recognizer. Recognized phrases are kept newest-first in
    ``transcript_data`` and handed out oldest-first by ``get_transcript``.
    """
    def __init__(self, source, language):
        self.language = language
        self.transcript_data = []
        self.transcript_changed_event = threading.Event()
        self.audio_recognizer = sr.Recognizer()
        # Despite the plural name this holds the state of the single source being transcribed.
        self.audio_sources = {
                "sample_rate": source.SAMPLE_RATE,
                "sample_width": source.SAMPLE_WIDTH,
                "channels": source.channels,
                "last_sample": bytes(),
                "last_spoken": None,
                "new_phrase": True,
        }
    def transcribe_audio_queue(self, audio_queue):
        """
        Consume ``(raw_bytes, datetime)`` items from ``audio_queue`` forever and transcribe them.
        Never returns; intended to be run in a daemon thread. Recognition failures never stop the
        loop: unrecognizable speech is skipped silently, any other failure is logged.
        """
        while True:
            audio, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(audio, time_spoken)
            try:
                text = self.audio_recognizer.recognize_google(
                    self.process_data(), language=self.language
                )
            except sr.UnknownValueError:
                # Expected for silence or noise: nothing to add to the transcript.
                continue
            except Exception:
                # Best effort: keep the worker alive, but leave a trace of the failure.
                _logger.warning("speech recognition failed", exc_info=True)
                continue
            if text != '':
                self.update_transcript(text)
    def update_last_sample_and_phrase_status(self, data, time_spoken):
        """
        Append ``data`` to the current phrase buffer, or start a new buffer when more than
        ``PHRASE_TIMEOUT`` seconds passed since the previous chunk.
        """
        source_info = self.audio_sources
        previous = source_info["last_spoken"]
        if previous and time_spoken - previous > timedelta(seconds=PHRASE_TIMEOUT):
            source_info["last_sample"] = bytes()
            source_info["new_phrase"] = True
        else:
            source_info["new_phrase"] = False
        source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken
    def process_data(self):
        """Wrap the accumulated phrase buffer in an ``sr.AudioData`` for the recognizer."""
        source_info = self.audio_sources
        return sr.AudioData(
            source_info["last_sample"],
            source_info["sample_rate"],
            source_info["sample_width"],
        )
    def update_transcript(self, text):
        """
        Store ``text`` as the newest phrase.
        A new phrase (or the first one) is inserted at the front, dropping the oldest entry so
        that at most ``MAX_PHRASES`` are kept; otherwise the newest entry is overwritten because
        ``text`` is a longer transcription of the same phrase.
        """
        transcript = self.transcript_data
        if self.audio_sources["new_phrase"] or len(transcript) == 0:
            # ``>=`` so the list never exceeds MAX_PHRASES after the insert below.
            if len(transcript) >= MAX_PHRASES:
                transcript.pop(-1)
            transcript.insert(0, text)
        else:
            transcript[0] = text
    def get_transcript(self):
        """Remove and return the oldest stored phrase, or ``""`` when nothing is stored."""
        if len(self.transcript_data) > 0:
            return self.transcript_data.pop(-1)
        return ""
    def clear_transcript_data(self):
        """Drop all stored phrases and reset the phrase buffer."""
        self.transcript_data.clear()
        self.audio_sources["last_sample"] = bytes()
        self.audio_sources["new_phrase"] = True
--- a/audio_utils.py
+++ b/audio_utils.py
@@ -0,0 +1,49 @@
 import pyaudiowpatch as pyaudio
 def get_input_device_list():
    """
    Return the device-info dicts of every WASAPI device that has at least one input channel
    and is not a loopback device.
    """
    found = []
    with pyaudio.PyAudio() as audio:
        wasapi_info = audio.get_host_api_info_by_type(pyaudio.paWASAPI)
        for host in range(audio.get_host_api_count()):
            slots = audio.get_host_api_info_by_index(host)['deviceCount']
            for slot in range(slots):
                info = audio.get_device_info_by_host_api_device_index(host, slot)
                # guard clauses, checked in the same order as a single ``and`` chain would be
                if info["hostApi"] != wasapi_info["index"]:
                    continue
                if not info["maxInputChannels"] > 0:
                    continue
                if info["isLoopbackDevice"] is not False:
                    continue
                found.append(info)
    return found
 def get_output_device_list():
    """Return the device-info dicts of every loopback device that belongs to the WASAPI host API."""
    with pyaudio.PyAudio() as audio:
        wasapi_info = audio.get_host_api_info_by_type(pyaudio.paWASAPI)
        loopbacks = [
            info
            for info in audio.get_loopback_device_info_generator()
            if info["hostApi"] == wasapi_info["index"] and info["isLoopbackDevice"] is True
        ]
    return loopbacks
 def get_default_input_device():
    """
    Return the device-info dict whose ``index`` equals the WASAPI host API's
    ``defaultInputDevice``, or ``None`` when no enumerated device matches.
    """
    with pyaudio.PyAudio() as audio:
        wanted_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["defaultInputDevice"]
        for host in range(audio.get_host_api_count()):
            slots = audio.get_host_api_info_by_index(host)['deviceCount']
            for slot in range(slots):
                candidate = audio.get_device_info_by_host_api_device_index(host, slot)
                if candidate["index"] == wanted_index:
                    return candidate
    return None
 def get_default_output_device():
    """
    Return the loopback device-info dict for the WASAPI default output device.
    The default output device is located by its ``index``. If that device is itself a loopback
    device it is returned directly; otherwise the first loopback device whose name contains the
    default device's name is returned. Returns ``None`` when no suitable device is found.
    """
    with pyaudio.PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_output_index = wasapi_info["defaultOutputDevice"]
        for host_index in range(p.get_host_api_count()):
            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["index"] != default_output_index:
                    continue
                if device["isLoopbackDevice"]:
                    # Already a loopback device: use it as is instead of falling through to None.
                    return device
                for loopback in p.get_loopback_device_info_generator():
                    if device["name"] in loopback["name"]:
                        return loopback
    return None
--- a/custom_speech_recognition/init.py
+++ b/custom_speech_recognition/init.py
--- a/custom_speech_recognition/main.py
+++ b/custom_speech_recognition/main.py
@@ -0,0 +1,24 @@
 import custom_speech_recognition as sr
 # Interactive demo: calibrate on the default microphone, then recognize phrases until Ctrl+C.
 r = sr.Recognizer()
 m = sr.Microphone()
 try:
    print("A moment of silence, please...")
    with m as source:
        r.adjust_for_ambient_noise(source)
    print("Set minimum energy threshold to {}".format(r.energy_threshold))
    while True:
        print("Say something!")
        with m as source:
            audio = r.listen(source)
        print("Got it! Now to recognize it...")
        try:
            # recognize speech using Google Speech Recognition
            value = r.recognize_google(audio)
        except sr.UnknownValueError:
            print("Oops! Didn't catch that")
        except sr.RequestError as e:
            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
        else:
            print("You said {}".format(value))
 except KeyboardInterrupt:
    # Ctrl+C is the normal way to leave the demo loop.
    pass
--- a/custom_speech_recognition/audio.py
+++ b/custom_speech_recognition/audio.py
@@ -0,0 +1,317 @@
 import aifc
 import audioop
 import io
 import os
 import platform
 import stat
 import subprocess
 import sys
 import wave
 class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.
    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.
    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.
    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).
    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """
    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert (
            sample_width % 1 == 0 and 1 <= sample_width <= 4
        ), "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)
    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.
        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
        Both boundaries are rounded down to a whole sample, so the returned data never starts or ends in the middle of a sample.
        """
        assert (
            start_ms is None or start_ms >= 0
        ), "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (
            0 if start_ms is None else start_ms
        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        def sample_aligned_offset(ms):
            # number of whole samples elapsed after ``ms`` milliseconds, expressed in bytes
            return int((ms * self.sample_rate) // 1000) * self.sample_width
        if start_ms is None:
            start_byte = 0
        else:
            start_byte = sample_aligned_offset(start_ms)
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = sample_aligned_offset(end_ms)
        return AudioData(
            self.frame_data[start_byte:end_byte],
            self.sample_rate,
            self.sample_width,
        )
    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.
        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert (
            convert_rate is None or convert_rate > 0
        ), "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 4
        ), "Sample width to convert to must be between 1 and 4 inclusive"
        raw_data = self.frame_data
        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        if self.sample_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, -128
            )  # subtract 128 from every sample to make them act like signed samples
        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(
                raw_data,
                self.sample_width,
                1,
                self.sample_rate,
                convert_rate,
                None,
            )
        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if (
                convert_width == 3
            ):  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, 4
                )  # convert audio into 32-bit first, which is always supported
                try:
                    audioop.bias(
                        b"", 3, 0
                    )  # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
                except (
                    audioop.error
                ):  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                    raw_data = b"".join(
                        raw_data[i + 1 : i + 4]
                        for i in range(0, len(raw_data), 4)
                    )  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                else:  # 24-bit audio fully supported, we don't need to shim anything
                    # the data is 32-bit at this point, so the source width is 4, not ``self.sample_width``
                    raw_data = audioop.lin2lin(raw_data, 4, convert_width)
            else:
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, convert_width
                )
        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        if convert_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, 128
            )  # add 128 to every sample to make them act like unsigned samples again
        return raw_data
    def get_wav_data(self, convert_rate=None, convert_width=None, nchannels = 1):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.
        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
        ``nchannels`` is the channel count written into the WAV header (1 by default).
        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )
        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(nchannels)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data
    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.
        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )
        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(
            audioop, "byteswap"
        ):  # ``audioop.byteswap`` was only added in Python 3.4
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1 :: -1] + b"".join(
                raw_data[i + sample_width : i : -1]
                for i in range(sample_width - 1, len(raw_data), sample_width)
            )
        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data
    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.
        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.
        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 3
        ), "Sample width to convert to must be between 1 and 3 inclusive"
        if (
            self.sample_width > 3 and convert_width is None
        ):  # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder
            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that
        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if (
            os.name == "nt"
        ):  # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            startup_info.dwFlags |= (
                subprocess.STARTF_USESHOWWINDOW
            )  # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.wShowWindow = (
                subprocess.SW_HIDE
            )  # specify that the console window should be hidden
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen(
            [
                flac_converter,
                "--stdout",
                "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
                "--best",  # highest level of compression available
                "-",  # the input FLAC file contents will be given in stdin
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            startupinfo=startup_info,
        )
        flac_data, stderr = process.communicate(wav_data)
        return flac_data
 def get_flac_converter():
    """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
    converter = shutil_which("flac")  # an installed ``flac`` takes precedence over the bundled binaries
    if converter is None:
        # fall back to the binary bundled next to this module that matches the current platform
        module_dir = os.path.dirname(os.path.abspath(__file__))
        system, machine = platform.system(), platform.machine()
        x86_32bit = {"i686", "i786", "x86"}
        x86_64bit = {"x86_64", "AMD64"}
        any_x86 = x86_32bit | x86_64bit
        if system == "Windows" and machine in any_x86:
            bundled_name = "flac-win32.exe"
        elif system == "Darwin" and machine in any_x86:
            bundled_name = "flac-mac"
        elif system == "Linux" and machine in x86_32bit:
            bundled_name = "flac-linux-x86"
        elif system == "Linux" and machine in x86_64bit:
            bundled_name = "flac-linux-x86_64"
        else:  # no bundled binary for this platform
            raise OSError(
                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
            )
        converter = os.path.join(module_dir, bundled_name)
    # best effort: mark the converter as executable; failures are ignored
    try:
        if not os.access(converter, os.X_OK):
            current_mode = os.stat(converter).st_mode
            os.chmod(converter, current_mode | stat.S_IEXEC)
            # known issue when running on docker: executing right after chmod() may fail with
            # OSError "Text file busy", so flush the filesystem with sync first
            if "Linux" in platform.system():
                if sys.version_info >= (3, 3):
                    os.sync()
                else:
                    os.system("sync")
    except OSError:
        pass
    return converter
 def shutil_which(pgm):
    """
    Returns the path of the executable ``pgm`` found on the search path, or ``None`` if there is none.
    Thin wrapper around the standard library's ``shutil.which``, kept under its historical name
    for existing callers. Unlike the previous hand-rolled loop it does not fail when the ``PATH``
    environment variable is unset, never reports a directory as an executable, and honours
    ``PATHEXT`` on Windows.
    """
    import shutil  # function-scope import: not part of this module's top-level imports
    return shutil.which(pgm)
--- a/custom_speech_recognition/exceptions.py
+++ b/custom_speech_recognition/exceptions.py
@@ -0,0 +1,22 @@
 class SetupError(Exception):
    """Raised by ``recognize_whisper_api`` when the API key or the ``openai`` module is missing."""
 class WaitTimeoutError(Exception):
    """Not raised in the visible code; presumably signals that waiting for audio timed out - confirm."""
 class RequestError(Exception):
    """Caught by the demo script around ``recognize_google``; reported there as a failed request to the service."""
 class UnknownValueError(Exception):
    """Caught by the demo script around ``recognize_google``; reported there as speech that was not understood."""
 class TranscriptionNotReady(Exception):
    """Not raised in the visible code; presumably signals a transcription that is still pending - confirm."""
 class TranscriptionFailed(Exception):
    """Not raised in the visible code; presumably signals a transcription that failed - confirm."""
--- a/custom_speech_recognition/flac-linux-x86
+++ b/custom_speech_recognition/flac-linux-x86
--- a/custom_speech_recognition/flac-linux-x86_64
+++ b/custom_speech_recognition/flac-linux-x86_64
--- a/custom_speech_recognition/flac-mac
+++ b/custom_speech_recognition/flac-mac
--- a/custom_speech_recognition/flac-win32.exe
+++ b/custom_speech_recognition/flac-win32.exe
--- a/custom_speech_recognition/pocketsphinx-data/en-US/LICENSE.txt
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/LICENSE.txt
@@ -0,0 +1,31 @@
 Copyright (c) 1999-2015 Carnegie Mellon University.  All rights
 reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer. 
 2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
 This work was supported in part by funding from the Defense Advanced 
 Research Projects Agency and the National Science Foundation of the 
 United States of America, and the CMU Sphinx Speech Consortium.
 THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
 ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/README
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/README
@@ -0,0 +1,34 @@
 /* ====================================================================
 * Copyright (c) 2015 Alpha Cephei Inc. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ALPHA CEPHEI INC.
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
 This directory contains generic US english acoustic model trained with
 latest sphinxtrain.
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/feat.params
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/feat.params
@@ -0,0 +1,12 @@
 -lowerf 130
 -upperf 6800
 -nfilt 25
 -transform dct
 -lifter 22
 -feat 1s_c_d_dd
 -svspec 0-12/13-25/26-38
 -agc none
 -cmn current
 -varnorm no
 -model ptm
 -cmninit 40,3,-1
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/means
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/means
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/noisedict
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/noisedict
@@ -0,0 +1,5 @@
 <s> SIL
 </s> SIL
 <sil> SIL
 [NOISE] +NSN+
 [SPEECH] +SPN+
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/transition_matrices
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/transition_matrices
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/variances
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/variances
--- a/custom_speech_recognition/pocketsphinx-data/en-US/language-model.lm.bin
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/language-model.lm.bin
--- a/custom_speech_recognition/pocketsphinx-data/en-US/pronounciation-dictionary.dict
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/pronounciation-dictionary.dict
--- a/custom_speech_recognition/recognizers/init.py
+++ b/custom_speech_recognition/recognizers/init.py
--- a/custom_speech_recognition/recognizers/whisper.py
+++ b/custom_speech_recognition/recognizers/whisper.py
@@ -0,0 +1,42 @@
 from __future__ import annotations
 import os
 from io import BytesIO
 from custom_speech_recognition.audio import AudioData
 from custom_speech_recognition.exceptions import SetupError
 def recognize_whisper_api(
    recognizer,
    audio_data: "AudioData",
    *,
    model: str = "whisper-1",
    api_key: str | None = None,
 ):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the OpenAI Whisper API.
    This function requires an OpenAI account; visit https://platform.openai.com/signup, then generate API Key in `User settings <https://platform.openai.com/account/api-keys>`__.
    Detail: https://platform.openai.com/docs/guides/speech-to-text
    ``recognizer`` is accepted for signature compatibility and is not used. ``model`` is the model name passed to the API. ``api_key`` is passed to the API call; when it is ``None`` the ``OPENAI_API_KEY`` environment variable must be set.
    Returns the ``"text"`` field of the API response.
    Raises ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance.
    Raises a ``custom_speech_recognition.exceptions.SetupError`` exception if the ``openai`` module cannot be imported, or if no API key is given and the environment variable is missing.
    """
    if not isinstance(audio_data, AudioData):
        raise ValueError("``audio_data`` must be an ``AudioData`` instance")
    if api_key is None and os.environ.get("OPENAI_API_KEY") is None:
        raise SetupError("Set environment variable ``OPENAI_API_KEY``")
    try:
        import openai
    except ImportError as import_error:
        # chain the original failure so the real import problem stays visible in the traceback
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        ) from import_error
    # the upload is given a file name ending in ``.wav``, matching the WAV bytes it carries
    wav_data = BytesIO(audio_data.get_wav_data())
    wav_data.name = "SpeechRecognition_audio.wav"
    transcript = openai.Audio.transcribe(model, wav_data, api_key=api_key)
    return transcript["text"]
--- a/test_main.py
+++ b/test_main.py
@@ -0,0 +1,38 @@
 import time
 import threading
 import queue
 import AudioTranscriber
 import AudioRecorder
 import audio_utils
 # Microphone pipeline: recorder -> queue -> transcriber running in a daemon thread.
 mic_audio_queue = queue.Queue()
 mic_device = audio_utils.get_default_input_device()
 mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
 mic_audio_recorder.record_into_queue(mic_audio_queue)
 mic_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
 mic_transcribe = threading.Thread(target=mic_transcriber.transcribe_audio_queue, args=(mic_audio_queue,))
 mic_transcribe.daemon = True
 mic_transcribe.start()
 time.sleep(2)
 # Speaker (loopback) pipeline, built the same way.
 spk_audio_queue = queue.Queue()
 spk_device = audio_utils.get_default_output_device()
 spk_audio_recorder = AudioRecorder.SelectedSpeakerRecorder(spk_device)
 spk_audio_recorder.record_into_queue(spk_audio_queue)
 # The transcriber must read the audio format from the speaker source, not the microphone source.
 spk_transcriber = AudioTranscriber.AudioTranscriber(source=spk_audio_recorder.source, language="ja-JP")
 spk_transcribe = threading.Thread(target=spk_transcriber.transcribe_audio_queue, args=(spk_audio_queue,))
 spk_transcribe.daemon = True
 spk_transcribe.start()
 # Poll for finished phrases; only the microphone transcript is printed for now.
 while True:
    text = mic_transcriber.get_transcript()
    if len(text) > 0:
        print("mic:", text)
    # text = spk_transcriber.get_transcript()
    # if len(text) > 0:
    #     print("spk:", text)
    time.sleep(0.1)