test code custom speech recognition

2023-06-29 10:10:00 +09:00
parent bc300eae4c
commit 868c84a9eb
25 changed files with 135723 additions and 0 deletions
--- a/AudioRecorder.py
+++ b/AudioRecorder.py
@@ -0,0 +1,49 @@
+import custom_speech_recognition as sr
+import pyaudiowpatch as pyaudio
+from datetime import datetime
+
+# Passed as ``phrase_time_limit`` to ``listen_in_background``.
+RECORD_TIMEOUT = 3
+# Assigned to the recognizer's ``energy_threshold``.
+ENERGY_THRESHOLD = 1000
+# Assigned to the recognizer's ``dynamic_energy_threshold``.
+DYNAMIC_ENERGY_THRESHOLD = False
+
+class BaseRecorder:
+    """Pairs an ``sr.Recognizer`` with one audio source and feeds a queue.
+
+    Subclasses build the concrete ``sr.Microphone`` source and pass it in.
+    """
+
+    def __init__(self, source):
+        """Store ``source`` and configure a recognizer with the module thresholds.
+
+        Raises:
+            ValueError: if ``source`` is ``None``.
+        """
+        # Validate first so an invalid call fails before any recognizer is built.
+        if source is None:
+            raise ValueError("audio source can't be None")
+
+        self.recorder = sr.Recognizer()
+        self.recorder.energy_threshold = ENERGY_THRESHOLD
+        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
+        self.source = source
+
+    def adjust_for_noise(self):
+        """Open the source and let the recognizer calibrate on ambient noise."""
+        with self.source:
+            self.recorder.adjust_for_ambient_noise(self.source)
+
+    def record_into_queue(self, audio_queue):
+        """Start background listening and push each recording into ``audio_queue``.
+
+        Every queue item is a ``(raw_bytes, datetime)`` tuple: the raw data of
+        the recorded audio and the local time at which the callback ran.
+
+        Returns:
+            Whatever ``listen_in_background`` returns, so the caller can keep it
+            instead of losing it.
+            NOTE(review): presumably the stop callable, as in upstream
+            SpeechRecognition - confirm against ``custom_speech_recognition``.
+        """
+        def record_callback(_, audio:sr.AudioData) -> None:
+            audio_queue.put((audio.get_raw_data(), datetime.now()))
+
+        return self.recorder.listen_in_background(
+            self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT
+        )
+
+class SelectedMicRecorder(BaseRecorder):
+    """Recorder for a microphone described by a device-info mapping."""
+
+    def __init__(self, device):
+        """Build the microphone source from ``device`` and calibrate it.
+
+        ``device`` must provide the ``'index'`` and ``"defaultSampleRate"`` keys.
+        """
+        mic_index = device['index']
+        mic_rate = int(device["defaultSampleRate"])
+        microphone = sr.Microphone(device_index=mic_index, sample_rate=mic_rate)
+        super().__init__(source=microphone)
+        # Calibrate once, right after construction.
+        self.adjust_for_noise()
+
+class SelectedSpeakerRecorder(BaseRecorder):
+    """Recorder for a speaker (output) device opened through ``sr.Microphone``."""
+
+    def __init__(self, device):
+        """Build the speaker source from ``device`` and calibrate it.
+
+        ``device`` must provide the ``"index"``, ``"defaultSampleRate"`` and
+        ``"maxInputChannels"`` keys.
+        """
+        source_options = {
+            "speaker": True,
+            "device_index": device["index"],
+            "sample_rate": int(device["defaultSampleRate"]),
+            # chunk size is set to the byte size of one 16-bit sample
+            "chunk_size": pyaudio.get_sample_size(pyaudio.paInt16),
+            "channels": device["maxInputChannels"],
+        }
+        super().__init__(source=sr.Microphone(**source_options))
+        self.adjust_for_noise()
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -0,0 +1,79 @@
+
+import threading
+import custom_speech_recognition as sr
+from datetime import timedelta
+from heapq import merge
+
+# Gap, in seconds, between two chunks after which the audio buffer is reset
+# and the next recognized text starts a new phrase.
+PHRASE_TIMEOUT = 3.05
+# Maximum number of phrases kept in the transcript list.
+MAX_PHRASES = 10
+
+class AudioTranscriber:
+    """Turns queued raw audio chunks into a rolling list of recognized phrases.
+
+    Chunks that arrive within ``PHRASE_TIMEOUT`` seconds of each other are
+    appended to one buffer and re-recognized as a whole; a longer gap resets
+    the buffer and starts a new phrase. ``transcript_data`` holds the phrases
+    newest first.
+    """
+
+    def __init__(self, source, language):
+        """Record the audio format of ``source`` and the recognition language.
+
+        ``source`` must expose ``SAMPLE_RATE``, ``SAMPLE_WIDTH`` and
+        ``channels``; ``language`` is passed to ``recognize_google``.
+        """
+        self.language = language
+        self.transcript_data = []
+        self.transcript_changed_event = threading.Event()
+        self.audio_recognizer = sr.Recognizer()
+        self.audio_sources = {
+                "sample_rate": source.SAMPLE_RATE,
+                "sample_width": source.SAMPLE_WIDTH,
+                "channels": source.channels,
+                "last_sample": bytes(),
+                "last_spoken": None,
+                "new_phrase": True,
+        }
+
+    def transcribe_audio_queue(self, audio_queue):
+        """Consume ``(raw_bytes, datetime)`` items from ``audio_queue`` forever.
+
+        Never returns; intended to run in its own thread. A failed recognition
+        skips that chunk instead of ending the loop.
+        """
+        while True:
+            audio, time_spoken = audio_queue.get()
+            self.update_last_sample_and_phrase_status(audio, time_spoken)
+
+            try:
+                audio_data = self.process_data()
+                text = self.audio_recognizer.recognize_google(audio_data, language=self.language)
+            except sr.UnknownValueError:
+                # Nothing intelligible in this buffer: expected, stay quiet.
+                continue
+            except Exception as exc:
+                # Best effort: keep the worker alive, but report the failure
+                # instead of hiding it.
+                print(f"transcription failed: {exc!r}")
+                continue
+
+            if text != '':
+                self.update_transcript(text)
+
+    def update_last_sample_and_phrase_status(self, data, time_spoken):
+        """Append ``data`` to the buffer, resetting it first after a long gap.
+
+        ``new_phrase`` becomes True only when a previous chunk exists and
+        ``time_spoken`` is more than ``PHRASE_TIMEOUT`` seconds after it.
+        """
+        source_info = self.audio_sources
+        if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
+            source_info["last_sample"] = bytes()
+            source_info["new_phrase"] = True
+        else:
+            source_info["new_phrase"] = False
+
+        source_info["last_sample"] += data
+        source_info["last_spoken"] = time_spoken
+
+    def process_data(self):
+        """Wrap the buffered bytes in an ``sr.AudioData`` using the stored format."""
+        source_info = self.audio_sources
+        return sr.AudioData(
+            source_info["last_sample"],
+            source_info["sample_rate"],
+            source_info["sample_width"],
+        )
+
+    def update_transcript(self, text):
+        """Store ``text`` as a new newest phrase or overwrite the current one.
+
+        A new entry is inserted at index 0 when a new phrase started or the
+        transcript is empty; the oldest entry is dropped first so the list
+        never exceeds ``MAX_PHRASES``.
+        """
+        source_info = self.audio_sources
+        transcript = self.transcript_data
+
+        if source_info["new_phrase"] or len(transcript) == 0:
+            # ``>=`` keeps the list at most MAX_PHRASES long after the insert.
+            if len(transcript) >= MAX_PHRASES:
+                transcript.pop(-1)
+            transcript.insert(0, text)
+        else:
+            transcript[0] = text
+
+    def get_transcript(self):
+        """Remove and return the oldest stored phrase, or ``""`` when empty."""
+        if len(self.transcript_data) > 0:
+            text = self.transcript_data.pop(-1)
+        else:
+            text = ""
+        return text
+
+    def clear_transcript_data(self):
+        """Drop all phrases and the audio buffer; the next text starts a new phrase."""
+        self.transcript_data.clear()
+        self.audio_sources["last_sample"] = bytes()
+        self.audio_sources["new_phrase"] = True
--- a/audio_utils.py
+++ b/audio_utils.py
@@ -0,0 +1,49 @@
+import pyaudiowpatch as pyaudio
+
+def get_input_device_list():
+    """Return info dicts for all non-loopback WASAPI devices with input channels."""
+    found = []
+    with pyaudio.PyAudio() as audio:
+        wasapi_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["index"]
+        for api in range(audio.get_host_api_count()):
+            slots = audio.get_host_api_info_by_index(api)['deviceCount']
+            for slot in range(slots):
+                info = audio.get_device_info_by_host_api_device_index(api, slot)
+                # Same three checks, in the same order, as guard clauses.
+                if info["hostApi"] != wasapi_index:
+                    continue
+                if not info["maxInputChannels"] > 0:
+                    continue
+                if info["isLoopbackDevice"] is False:
+                    found.append(info)
+    return found
+
+def get_output_device_list():
+    """Return info dicts for all WASAPI loopback devices."""
+    loopbacks = []
+    with pyaudio.PyAudio() as audio:
+        wasapi_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["index"]
+        loopbacks.extend(
+            info
+            for info in audio.get_loopback_device_info_generator()
+            if info["hostApi"] == wasapi_index and info["isLoopbackDevice"] is True
+        )
+    return loopbacks
+
+def get_default_input_device():
+    """Return the info dict of the WASAPI default input device.
+
+    Returns ``None`` when no enumerated device has the default input index.
+    """
+    with pyaudio.PyAudio() as audio:
+        wanted_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["defaultInputDevice"]
+
+        for api in range(audio.get_host_api_count()):
+            slots = audio.get_host_api_info_by_index(api)['deviceCount']
+            for slot in range(slots):
+                info = audio.get_device_info_by_host_api_device_index(api, slot)
+                if info["index"] == wanted_index:
+                    return info
+    return None
+
+def get_default_output_device():
+    """Return the loopback device info for the WASAPI default output device.
+
+    The default output device is located by its index. If it is already a
+    loopback device it is returned as is; otherwise the first loopback device
+    whose name contains the default device's name is returned.
+
+    Returns ``None`` when the default device or a matching loopback device
+    cannot be found.
+    """
+    with pyaudio.PyAudio() as p:
+        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
+        default_index = wasapi_info["defaultOutputDevice"]
+
+        for host_index in range(p.get_host_api_count()):
+            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
+            for device_index in range(device_count):
+                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
+                if device["index"] != default_index:
+                    continue
+                # A default device that is already a loopback device used to
+                # fall through and yield None; it is directly usable.
+                if device["isLoopbackDevice"]:
+                    return device
+                for loopback in p.get_loopback_device_info_generator():
+                    if device["name"] in loopback["name"]:
+                        return loopback
+    return None
--- a/custom_speech_recognition/init.py
+++ b/custom_speech_recognition/init.py
--- a/custom_speech_recognition/main.py
+++ b/custom_speech_recognition/main.py
@@ -0,0 +1,24 @@
+import custom_speech_recognition as sr
+
+# Demo loop: calibrate on ambient noise, then repeatedly listen on the
+# microphone and print what Google Speech Recognition returns.
+r = sr.Recognizer()
+m = sr.Microphone()
+
+try:
+    print("A moment of silence, please...")
+    with m as source:
+        r.adjust_for_ambient_noise(source)
+    print("Set minimum energy threshold to {}".format(r.energy_threshold))
+    while True:
+        print("Say something!")
+        with m as source:
+            audio = r.listen(source)
+        print("Got it! Now to recognize it...")
+        try:
+            # recognize speech using Google Speech Recognition
+            value = r.recognize_google(audio)
+        except sr.UnknownValueError:
+            print("Oops! Didn't catch that")
+        except sr.RequestError as e:
+            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
+        else:
+            print("You said {}".format(value))
+except KeyboardInterrupt:
+    # Ctrl+C ends the demo quietly.
+    pass
--- a/custom_speech_recognition/audio.py
+++ b/custom_speech_recognition/audio.py
@@ -0,0 +1,317 @@
+import aifc
+import audioop
+import io
+import os
+import platform
+import stat
+import subprocess
+import sys
+import wave
+
+
+class AudioData(object):
+    """
+    Creates a new ``AudioData`` instance, which represents mono audio data.
+
+    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.
+
+    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.
+
+    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).
+
+    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
+    """
+
+    def __init__(self, frame_data, sample_rate, sample_width):
+        assert sample_rate > 0, "Sample rate must be a positive integer"
+        assert (
+            sample_width % 1 == 0 and 1 <= sample_width <= 4
+        ), "Sample width must be between 1 and 4 inclusive"
+        self.frame_data = frame_data
+        self.sample_rate = sample_rate
+        self.sample_width = int(sample_width)
+
+    def get_segment(self, start_ms=None, end_ms=None):
+        """
+        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.
+
+        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
+        """
+        assert (
+            start_ms is None or start_ms >= 0
+        ), "``start_ms`` must be a non-negative number"
+        assert end_ms is None or end_ms >= (
+            0 if start_ms is None else start_ms
+        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
+        # bytes per millisecond = sample_rate * sample_width / 1000
+        if start_ms is None:
+            start_byte = 0
+        else:
+            start_byte = int(
+                (start_ms * self.sample_rate * self.sample_width) // 1000
+            )
+        if end_ms is None:
+            end_byte = len(self.frame_data)
+        else:
+            end_byte = int(
+                (end_ms * self.sample_rate * self.sample_width) // 1000
+            )
+        return AudioData(
+            self.frame_data[start_byte:end_byte],
+            self.sample_rate,
+            self.sample_width,
+        )
+
+    def get_raw_data(self, convert_rate=None, convert_width=None):
+        """
+        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.
+
+        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
+
+        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
+
+        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
+        """
+        assert (
+            convert_rate is None or convert_rate > 0
+        ), "Sample rate to convert to must be a positive integer"
+        assert convert_width is None or (
+            convert_width % 1 == 0 and 1 <= convert_width <= 4
+        ), "Sample width to convert to must be between 1 and 4 inclusive"
+
+        raw_data = self.frame_data
+
+        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
+        if self.sample_width == 1:
+            # subtract 128 from every sample to make them act like signed samples
+            raw_data = audioop.bias(raw_data, 1, -128)
+
+        # resample audio at the desired rate if specified
+        if convert_rate is not None and self.sample_rate != convert_rate:
+            raw_data, _ = audioop.ratecv(
+                raw_data,
+                self.sample_width,
+                1,
+                self.sample_rate,
+                convert_rate,
+                None,
+            )
+
+        # convert samples to desired sample width if specified
+        if convert_width is not None and self.sample_width != convert_width:
+            if convert_width == 3:
+                # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866):
+                # convert audio into 32-bit first, which is always supported
+                raw_data = audioop.lin2lin(raw_data, self.sample_width, 4)
+                try:
+                    # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
+                    audioop.bias(b"", 3, 0)
+                except audioop.error:
+                    # this version of audioop doesn't support 24-bit audio: since we're in little endian,
+                    # we discard the first byte from each 32-bit sample to get a 24-bit sample
+                    raw_data = b"".join(
+                        raw_data[i + 1 : i + 4]
+                        for i in range(0, len(raw_data), 4)
+                    )
+                else:
+                    # ``raw_data`` now holds 32-bit samples, so the source width is 4 here;
+                    # passing ``self.sample_width`` would misread 8/16-bit input and corrupt the audio
+                    raw_data = audioop.lin2lin(raw_data, 4, convert_width)
+            else:
+                raw_data = audioop.lin2lin(
+                    raw_data, self.sample_width, convert_width
+                )
+
+        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
+        if convert_width == 1:
+            # add 128 to every sample to make them act like unsigned samples again
+            raw_data = audioop.bias(raw_data, 1, 128)
+
+        return raw_data
+
+    def get_wav_data(self, convert_rate=None, convert_width=None, nchannels = 1):
+        """
+        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.
+
+        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
+
+        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
+
+        ``nchannels`` is the channel count written into the WAV header; it defaults to 1.
+
+        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
+        """
+        raw_data = self.get_raw_data(convert_rate, convert_width)
+        sample_rate = (
+            self.sample_rate if convert_rate is None else convert_rate
+        )
+        sample_width = (
+            self.sample_width if convert_width is None else convert_width
+        )
+
+        # generate the WAV file contents
+        with io.BytesIO() as wav_file:
+            wav_writer = wave.open(wav_file, "wb")
+            try:  # note that we can't use context manager, since that was only added in Python 3.4
+                wav_writer.setframerate(sample_rate)
+                wav_writer.setsampwidth(sample_width)
+                wav_writer.setnchannels(nchannels)
+                wav_writer.writeframes(raw_data)
+                wav_data = wav_file.getvalue()
+            finally:  # make sure resources are cleaned up
+                wav_writer.close()
+        return wav_data
+
+    def get_aiff_data(self, convert_rate=None, convert_width=None):
+        """
+        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.
+
+        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
+
+        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
+
+        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
+        """
+        raw_data = self.get_raw_data(convert_rate, convert_width)
+        sample_rate = (
+            self.sample_rate if convert_rate is None else convert_rate
+        )
+        sample_width = (
+            self.sample_width if convert_width is None else convert_width
+        )
+
+        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
+        if hasattr(
+            audioop, "byteswap"
+        ):  # ``audioop.byteswap`` was only added in Python 3.4
+            raw_data = audioop.byteswap(raw_data, sample_width)
+        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
+            raw_data = raw_data[sample_width - 1 :: -1] + b"".join(
+                raw_data[i + sample_width : i : -1]
+                for i in range(sample_width - 1, len(raw_data), sample_width)
+            )
+
+        # generate the AIFF-C file contents
+        with io.BytesIO() as aiff_file:
+            aiff_writer = aifc.open(aiff_file, "wb")
+            try:  # note that we can't use context manager, since that was only added in Python 3.4
+                aiff_writer.setframerate(sample_rate)
+                aiff_writer.setsampwidth(sample_width)
+                aiff_writer.setnchannels(1)
+                aiff_writer.writeframes(raw_data)
+                aiff_data = aiff_file.getvalue()
+            finally:  # make sure resources are cleaned up
+                aiff_writer.close()
+        return aiff_data
+
+    def get_flac_data(self, convert_rate=None, convert_width=None):
+        """
+        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.
+
+        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.
+
+        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.
+
+        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.
+
+        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
+        """
+        assert convert_width is None or (
+            convert_width % 1 == 0 and 1 <= convert_width <= 3
+        ), "Sample width to convert to must be between 1 and 3 inclusive"
+
+        if self.sample_width > 3 and convert_width is None:
+            # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder;
+            # the largest supported sample width is 24-bit, so we'll limit the sample width to that
+            convert_width = 3
+
+        # run the FLAC converter with the WAV data to get the FLAC data
+        wav_data = self.get_wav_data(convert_rate, convert_width)
+        flac_converter = get_flac_converter()
+        if os.name == "nt":
+            # on Windows, specify that the process is to be started without showing a console window
+            startup_info = subprocess.STARTUPINFO()
+            # specify that the wShowWindow field of `startup_info` contains a value
+            startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+            # specify that the console window should be hidden
+            startup_info.wShowWindow = subprocess.SW_HIDE
+        else:
+            startup_info = None  # default startupinfo
+        process = subprocess.Popen(
+            [
+                flac_converter,
+                "--stdout",
+                "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
+                "--best",  # highest level of compression available
+                "-",  # the input FLAC file contents will be given in stdin
+            ],
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            startupinfo=startup_info,
+        )
+        # stderr is not piped, so the second element is always None
+        flac_data, _ = process.communicate(wav_data)
+        return flac_data
+
+
+def get_flac_converter():
+    """Return the path of a FLAC encoder executable.
+
+    An installed ``flac`` found by ``shutil_which`` wins; otherwise a binary
+    bundled next to this module is chosen from the platform system and machine.
+    Raises ``OSError`` when no bundled binary matches.
+    """
+    converter = shutil_which("flac")  # check for installed version first
+    if converter is None:  # flac utility is not installed
+        # directory of the current module file, where all the FLAC bundled binaries are stored
+        module_dir = os.path.dirname(os.path.abspath(__file__))
+        system, machine = platform.system(), platform.machine()
+        any_x86 = {"i686", "i786", "x86", "x86_64", "AMD64"}
+        # (system, accepted machines, bundled binary name), checked in order
+        bundled_binaries = (
+            ("Windows", any_x86, "flac-win32.exe"),
+            ("Darwin", any_x86, "flac-mac"),
+            ("Linux", {"i686", "i786", "x86"}, "flac-linux-x86"),
+            ("Linux", {"x86_64", "AMD64"}, "flac-linux-x86_64"),
+        )
+        for wanted_system, machines, binary_name in bundled_binaries:
+            if system == wanted_system and machine in machines:
+                converter = os.path.join(module_dir, binary_name)
+                break
+        else:  # no FLAC converter available
+            raise OSError(
+                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
+            )
+
+    # mark FLAC converter as executable if possible
+    try:
+        # handle known issue when running on docker:
+        # run executable right after chmod() may result in OSError "Text file busy"
+        # fix: flush FS with sync
+        if not os.access(converter, os.X_OK):
+            current_mode = os.stat(converter).st_mode
+            os.chmod(converter, current_mode | stat.S_IEXEC)
+            if "Linux" in platform.system():
+                if sys.version_info >= (3, 3):
+                    os.sync()
+                else:
+                    os.system("sync")
+    except OSError:
+        pass
+
+    return converter
+
+
+def shutil_which(pgm):
+    """Return the full path of executable ``pgm`` found on ``PATH``, or ``None``.
+
+    Delegates to the standard library's ``shutil.which``. Unlike the previous
+    hand-rolled search, it does not crash when ``PATH`` is unset, honours
+    ``PATHEXT`` on Windows (so ``flac`` finds ``flac.exe``) and never returns
+    a directory.
+    """
+    # Local import: the module's import block does not include ``shutil``.
+    import shutil
+
+    return shutil.which(pgm)
--- a/custom_speech_recognition/exceptions.py
+++ b/custom_speech_recognition/exceptions.py
@@ -0,0 +1,22 @@
+class SetupError(Exception):
+    """Raised when a recognizer cannot run because its setup is incomplete.
+
+    ``recognize_whisper_api`` raises it for a missing ``OPENAI_API_KEY`` or a
+    missing ``openai`` module.
+    """
+
+
+class WaitTimeoutError(Exception):
+    """Timeout-related error.
+
+    NOTE(review): not raised in the visible code; presumably signals that
+    listening timed out while waiting for speech - confirm in the recognizer.
+    """
+
+
+class RequestError(Exception):
+    """Error for a failed request to a recognition service.
+
+    The demo script handles it as "couldn't request results" from Google
+    Speech Recognition.
+    """
+
+
+class UnknownValueError(Exception):
+    """Error for audio that could not be understood.
+
+    The demo script handles it as speech that the recognizer did not catch.
+    """
+
+
+class TranscriptionNotReady(Exception):
+    """Transcription-not-ready error.
+
+    NOTE(review): not raised in the visible code; presumably used by
+    recognizers that poll for a result - confirm.
+    """
+
+
+class TranscriptionFailed(Exception):
+    """Transcription-failed error.
+
+    NOTE(review): not raised in the visible code; presumably reports a
+    transcription job that ended in failure - confirm.
+    """
--- a/custom_speech_recognition/flac-linux-x86
+++ b/custom_speech_recognition/flac-linux-x86
--- a/custom_speech_recognition/flac-linux-x86_64
+++ b/custom_speech_recognition/flac-linux-x86_64
--- a/custom_speech_recognition/flac-mac
+++ b/custom_speech_recognition/flac-mac
--- a/custom_speech_recognition/flac-win32.exe
+++ b/custom_speech_recognition/flac-win32.exe
--- a/custom_speech_recognition/pocketsphinx-data/en-US/LICENSE.txt
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/LICENSE.txt
@@ -0,0 +1,31 @@
+Copyright (c) 1999-2015 Carnegie Mellon University.  All rights
+reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer. 
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+This work was supported in part by funding from the Defense Advanced 
+Research Projects Agency and the National Science Foundation of the 
+United States of America, and the CMU Sphinx Speech Consortium.
+
+THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
+ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
+NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ 
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/README
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/README
@@ -0,0 +1,34 @@
+/* ====================================================================
+ * Copyright (c) 2015 Alpha Cephei Inc. All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
+ * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ALPHA CEPHEI INC.
+ * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ====================================================================
+ *
+ */
+
+This directory contains a generic US English acoustic model trained with
+the latest sphinxtrain.
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/feat.params
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/feat.params
@@ -0,0 +1,12 @@
+-lowerf 130
+-upperf 6800
+-nfilt 25
+-transform dct
+-lifter 22
+-feat 1s_c_d_dd
+-svspec 0-12/13-25/26-38
+-agc none
+-cmn current
+-varnorm no
+-model ptm
+-cmninit 40,3,-1
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/means
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/means
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/noisedict
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/noisedict
@@ -0,0 +1,5 @@
+<s> SIL
+</s> SIL
+<sil> SIL
+[NOISE] +NSN+
+[SPEECH] +SPN+
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/transition_matrices
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/transition_matrices
--- a/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/variances
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/acoustic-model/variances
--- a/custom_speech_recognition/pocketsphinx-data/en-US/language-model.lm.bin
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/language-model.lm.bin
--- a/custom_speech_recognition/pocketsphinx-data/en-US/pronounciation-dictionary.dict
+++ b/custom_speech_recognition/pocketsphinx-data/en-US/pronounciation-dictionary.dict
--- a/custom_speech_recognition/recognizers/init.py
+++ b/custom_speech_recognition/recognizers/init.py
--- a/custom_speech_recognition/recognizers/whisper.py
+++ b/custom_speech_recognition/recognizers/whisper.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+import os
+from io import BytesIO
+
+from custom_speech_recognition.audio import AudioData
+from custom_speech_recognition.exceptions import SetupError
+
+
+def recognize_whisper_api(
+    recognizer,
+    audio_data: "AudioData",
+    *,
+    model: str = "whisper-1",
+    api_key: str | None = None,
+):
+    """
+    Transcribes ``audio_data`` (an ``AudioData`` instance) with the OpenAI Whisper API and returns the recognized text.
+
+    An OpenAI API key is required: pass it as ``api_key`` or set the ``OPENAI_API_KEY`` environment variable. Keys are generated in `User settings <https://platform.openai.com/account/api-keys>`__ after signing up at https://platform.openai.com/signup.
+
+    Detail: https://platform.openai.com/docs/guides/speech-to-text
+
+    Raises ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance, and ``SetupError`` if no API key is available or the ``openai`` module cannot be imported.
+    """
+    if not isinstance(audio_data, AudioData):
+        raise ValueError("``audio_data`` must be an ``AudioData`` instance")
+
+    key_available = (
+        api_key is not None or os.environ.get("OPENAI_API_KEY") is not None
+    )
+    if not key_available:
+        raise SetupError("Set environment variable ``OPENAI_API_KEY``")
+
+    # imported lazily so the package works without ``openai`` installed
+    try:
+        import openai
+    except ImportError:
+        raise SetupError(
+            "missing openai module: ensure that openai is set up correctly."
+        )
+
+    # the stream gets a ``name`` attribute with a ``.wav`` file name before upload
+    wav_stream = BytesIO(audio_data.get_wav_data())
+    wav_stream.name = "SpeechRecognition_audio.wav"
+
+    response = openai.Audio.transcribe(model, wav_stream, api_key=api_key)
+    return response["text"]
--- a/test_main.py
+++ b/test_main.py
@@ -0,0 +1,38 @@
+import time
+import threading
+import queue
+import AudioTranscriber
+import AudioRecorder
+import audio_utils
+
+# Manual test script: record the default microphone and the default speaker
+# loopback in the background, transcribe both, and print microphone phrases.
+
+# --- microphone pipeline ---
+mic_audio_queue = queue.Queue()
+
+mic_device = audio_utils.get_default_input_device()
+mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
+mic_audio_recorder.record_into_queue(mic_audio_queue)
+
+mic_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
+mic_transcribe = threading.Thread(target=mic_transcriber.transcribe_audio_queue, args=(mic_audio_queue,))
+mic_transcribe.daemon = True
+mic_transcribe.start()
+
+time.sleep(2)
+
+# --- speaker (loopback) pipeline ---
+spk_audio_queue = queue.Queue()
+spk_device = audio_utils.get_default_output_device()
+spk_audio_recorder = AudioRecorder.SelectedSpeakerRecorder(spk_device)
+spk_audio_recorder.record_into_queue(spk_audio_queue)
+
+# The speaker transcriber must read the audio format from the speaker
+# recorder's own source; using the microphone source here was a copy-paste bug.
+spk_transcriber = AudioTranscriber.AudioTranscriber(source=spk_audio_recorder.source, language="ja-JP")
+spk_transcribe = threading.Thread(target=spk_transcriber.transcribe_audio_queue, args=(spk_audio_queue,))
+spk_transcribe.daemon = True
+spk_transcribe.start()
+
+# Poll for finished phrases until the process is interrupted.
+while True:
+    text = mic_transcriber.get_transcript()
+    if len(text) > 0:
+        print("mic:", text)
+    # Speaker output is currently disabled:
+    # text = spk_transcriber.get_transcript()
+    # if len(text) > 0:
+    #     print("spk:", text)
+    time.sleep(0.1)