test code custom speech recognition

This commit is contained in:
misyaguziya
2023-06-29 10:10:00 +09:00
parent bc300eae4c
commit 868c84a9eb
25 changed files with 135723 additions and 0 deletions

49
AudioRecorder.py Normal file
View File

@@ -0,0 +1,49 @@
import custom_speech_recognition as sr
import pyaudiowpatch as pyaudio
from datetime import datetime
# Passed as ``phrase_time_limit`` when listening in the background.
RECORD_TIMEOUT = 3
# Initial value assigned to the recognizer's ``energy_threshold``.
ENERGY_THRESHOLD = 1000
# Value assigned to the recognizer's ``dynamic_energy_threshold``.
DYNAMIC_ENERGY_THRESHOLD = False


class BaseRecorder:
    """Pair a ``Recognizer`` with an audio source and feed recordings into a queue.

    Args:
        source: The audio source object to record from. Must not be ``None``.

    Raises:
        ValueError: If ``source`` is ``None``.
    """

    def __init__(self, source):
        # Validate first so that no recognizer is built for an unusable source.
        if source is None:
            raise ValueError("audio source can't be None")
        self.recorder = sr.Recognizer()
        self.recorder.energy_threshold = ENERGY_THRESHOLD
        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
        self.source = source

    def adjust_for_noise(self):
        """Open the source and let the recognizer calibrate against ambient noise."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)

    def record_into_queue(self, audio_queue):
        """Start background listening and enqueue every recorded phrase.

        Each queue item is a ``(raw_bytes, timestamp)`` tuple, where
        ``timestamp`` is ``datetime.now()`` at the moment the callback ran.

        Args:
            audio_queue: Any object with a ``put`` method (e.g. ``queue.Queue``).

        Returns:
            Whatever ``listen_in_background`` returns, so that the caller can
            keep a handle on the background listener instead of losing it.
        """

        # The annotation is a string so that it is never evaluated at run time.
        def record_callback(_, audio: "sr.AudioData") -> None:
            audio_queue.put((audio.get_raw_data(), datetime.now()))

        return self.recorder.listen_in_background(
            self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT
        )
class SelectedMicRecorder(BaseRecorder):
    """Recorder bound to one microphone described by a device-info mapping.

    ``device`` must provide the keys ``"index"`` and ``"defaultSampleRate"``.
    Ambient-noise calibration runs as part of construction.
    """

    def __init__(self, device):
        mic_index = device['index']
        mic_rate = int(device["defaultSampleRate"])
        microphone = sr.Microphone(device_index=mic_index, sample_rate=mic_rate)
        super().__init__(source=microphone)
        self.adjust_for_noise()
class SelectedSpeakerRecorder(BaseRecorder):
    """Recorder bound to a speaker device, opened via ``sr.Microphone(speaker=True)``.

    ``device`` must provide the keys ``"index"``, ``"defaultSampleRate"`` and
    ``"maxInputChannels"``. Ambient-noise calibration runs as part of
    construction.
    """

    def __init__(self, device):
        # NOTE(review): chunk_size is set to the byte size of one 16-bit sample
        # rather than an explicit frame count -- confirm this is intended.
        microphone_options = {
            "speaker": True,
            "device_index": device["index"],
            "sample_rate": int(device["defaultSampleRate"]),
            "chunk_size": pyaudio.get_sample_size(pyaudio.paInt16),
            "channels": device["maxInputChannels"],
        }
        speaker_source = sr.Microphone(**microphone_options)
        super().__init__(source=speaker_source)
        self.adjust_for_noise()

79
AudioTranscriber.py Normal file
View File

@@ -0,0 +1,79 @@
import threading
import custom_speech_recognition as sr
from datetime import timedelta
from heapq import merge
# Gap (seconds) between two chunks after which the next chunk starts a new phrase.
PHRASE_TIMEOUT = 3.05
# Upper bound on the number of phrases kept in ``transcript_data``.
MAX_PHRASES = 10


class AudioTranscriber:
    """Accumulate queued audio chunks into phrases and transcribe them.

    ``transcript_data`` is ordered newest-first: index 0 is the phrase that is
    currently being extended, the last element is the oldest phrase.

    Args:
        source: Object exposing ``SAMPLE_RATE``, ``SAMPLE_WIDTH`` and
            ``channels``; these describe the raw audio that will be queued.
        language: Language code forwarded to ``recognize_google``.
    """

    def __init__(self, source, language):
        self.language = language
        self.transcript_data = []
        self.transcript_changed_event = threading.Event()
        self.audio_recognizer = sr.Recognizer()
        self.audio_sources = {
            "sample_rate": source.SAMPLE_RATE,
            "sample_width": source.SAMPLE_WIDTH,
            "channels": source.channels,
            "last_sample": bytes(),
            "last_spoken": None,
            "new_phrase": True,
        }

    def transcribe_audio_queue(self, audio_queue):
        """Consume ``(raw_bytes, timestamp)`` items forever and transcribe them.

        Intended to run in its own thread; this method never returns.
        Recognition failures are not fatal: the loop moves on to the next item.
        """
        import logging

        logger = logging.getLogger(__name__)
        while True:
            audio, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(audio, time_spoken)
            text = ''
            try:
                audio_data = self.process_data()
                text = self.audio_recognizer.recognize_google(audio_data, language=self.language)
            except sr.UnknownValueError:
                # Nothing intelligible in this chunk; an expected outcome.
                pass
            except Exception:
                # Keep the worker alive, but do not hide the failure.
                logger.exception("speech recognition failed")
            if text != '':
                self.update_transcript(text)

    def update_last_sample_and_phrase_status(self, data, time_spoken):
        """Append ``data`` to the current phrase, or start a new phrase.

        A new phrase starts when more than ``PHRASE_TIMEOUT`` seconds elapsed
        since the previous chunk; the accumulated sample is reset in that case.
        """
        source_info = self.audio_sources
        if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=PHRASE_TIMEOUT):
            source_info["last_sample"] = bytes()
            source_info["new_phrase"] = True
        else:
            source_info["new_phrase"] = False
        source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken

    def process_data(self):
        """Wrap the accumulated raw sample in an ``sr.AudioData`` object."""
        source_info = self.audio_sources
        return sr.AudioData(
            source_info["last_sample"],
            source_info["sample_rate"],
            source_info["sample_width"],
        )

    def update_transcript(self, text):
        """Store ``text`` as a new phrase, or replace the phrase in progress.

        The transcript never holds more than ``MAX_PHRASES`` entries; the
        oldest entries are dropped to make room for a new phrase.
        """
        source_info = self.audio_sources
        transcript = self.transcript_data
        if source_info["new_phrase"] or len(transcript) == 0:
            # Trim before inserting so the list ends at MAX_PHRASES at most.
            if len(transcript) >= MAX_PHRASES:
                del transcript[MAX_PHRASES - 1:]
            transcript.insert(0, text)
        else:
            transcript[0] = text

    def get_transcript(self):
        """Remove and return the oldest phrase, or ``""`` when there is none."""
        if len(self.transcript_data) > 0:
            text = self.transcript_data.pop(-1)
        else:
            text = ""
        return text

    def clear_transcript_data(self):
        """Drop all phrases and the accumulated sample; the next chunk starts fresh."""
        self.transcript_data.clear()
        self.audio_sources["last_sample"] = bytes()
        self.audio_sources["new_phrase"] = True

49
audio_utils.py Normal file
View File

@@ -0,0 +1,49 @@
import pyaudiowpatch as pyaudio
def get_input_device_list():
    """Return the device-info dicts of all WASAPI capture devices.

    A device qualifies when it belongs to the WASAPI host API, reports at
    least one input channel and is not flagged as a loopback device.
    """
    found = []
    with pyaudio.PyAudio() as audio:
        wasapi_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["index"]
        for host_index in range(audio.get_host_api_count()):
            device_count = audio.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                info = audio.get_device_info_by_host_api_device_index(host_index, device_index)
                if info["hostApi"] != wasapi_index:
                    continue
                if not info["maxInputChannels"] > 0:
                    continue
                if info["isLoopbackDevice"] is False:
                    found.append(info)
    return found
def get_output_device_list():
    """Return the device-info dicts of all WASAPI loopback devices."""
    with pyaudio.PyAudio() as audio:
        wasapi_index = audio.get_host_api_info_by_type(pyaudio.paWASAPI)["index"]
        loopbacks = [
            info
            for info in audio.get_loopback_device_info_generator()
            if info["hostApi"] == wasapi_index and info["isLoopbackDevice"] is True
        ]
    return loopbacks
def get_default_input_device():
    """Return the device-info dict of the default WASAPI input device.

    Returns:
        The info dict whose ``"index"`` equals the WASAPI host API's
        ``"defaultInputDevice"``.

    Raises:
        LookupError: If no enumerated device has that index (previously this
            surfaced as an ``UnboundLocalError``).
    """
    with pyaudio.PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_input_index = wasapi_info["defaultInputDevice"]
        for host_index in range(p.get_host_api_count()):
            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["index"] == default_input_index:
                    # Stop at the match instead of scanning the remaining devices.
                    return device
    raise LookupError(
        "default WASAPI input device (index {}) not found".format(default_input_index)
    )
def get_default_output_device():
    """Return the loopback device-info dict for the default WASAPI output.

    If the default output device is itself a loopback device it is returned
    directly (the original code left the result unassigned in that case).
    Otherwise the loopback devices are searched for one whose name contains
    the default output device's name; when several match, the last one is
    used, as before.

    Raises:
        LookupError: If the default output device, or a loopback device
            matching it, cannot be found.
    """
    with pyaudio.PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_output_index = wasapi_info["defaultOutputDevice"]

        default_speakers = None
        for host_index in range(p.get_host_api_count()):
            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["index"] == default_output_index:
                    default_speakers = device
        if default_speakers is None:
            raise LookupError(
                "default WASAPI output device (index {}) not found".format(default_output_index)
            )

        if default_speakers["isLoopbackDevice"]:
            return default_speakers

        default_device = None
        for loopback in p.get_loopback_device_info_generator():
            if default_speakers["name"] in loopback["name"]:
                default_device = loopback
        if default_device is None:
            raise LookupError(
                "no loopback device found for {!r}".format(default_speakers["name"])
            )
        return default_device

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,24 @@
import custom_speech_recognition as sr
# Interactive demo: calibrate against ambient noise, then repeatedly listen on
# the default microphone and print what Google Speech Recognition heard.
# Ctrl+C ends the demo quietly.
recognizer = sr.Recognizer()
microphone = sr.Microphone()
try:
    print("A moment of silence, please...")
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
    print("Set minimum energy threshold to {}".format(recognizer.energy_threshold))
    while True:
        print("Say something!")
        with microphone as source:
            captured = recognizer.listen(source)
        print("Got it! Now to recognize it...")
        try:
            # recognize speech using Google Speech Recognition
            heard = recognizer.recognize_google(captured)
            print("You said {}".format(heard))
        except sr.UnknownValueError:
            print("Oops! Didn't catch that")
        except sr.RequestError as request_error:
            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(request_error))
except KeyboardInterrupt:
    pass

View File

@@ -0,0 +1,317 @@
import aifc
import audioop
import io
import os
import platform
import stat
import subprocess
import sys
import wave
class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.

    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.

    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.

    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).

    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """

    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert (
            sample_width % 1 == 0 and 1 <= sample_width <= 4
        ), "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)

    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.

        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.
        """
        assert (
            start_ms is None or start_ms >= 0
        ), "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (
            0 if start_ms is None else start_ms
        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        if start_ms is None:
            start_byte = 0
        else:
            start_byte = int(
                (start_ms * self.sample_rate * self.sample_width) // 1000
            )
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = int(
                (end_ms * self.sample_rate * self.sample_width) // 1000
            )
        return AudioData(
            self.frame_data[start_byte:end_byte],
            self.sample_rate,
            self.sample_width,
        )

    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert (
            convert_rate is None or convert_rate > 0
        ), "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 4
        ), "Sample width to convert to must be between 1 and 4 inclusive"
        raw_data = self.frame_data

        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        if self.sample_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, -128
            )  # subtract 128 from every sample to make them act like signed samples

        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(
                raw_data,
                self.sample_width,
                1,
                self.sample_rate,
                convert_rate,
                None,
            )

        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if (
                convert_width == 3
            ):  # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, 4
                )  # convert audio into 32-bit first, which is always supported
                try:
                    audioop.bias(
                        b"", 3, 0
                    )  # test whether 24-bit audio is supported (for example, ``audioop`` in Python 3.3 and below don't support sample width 3, while Python 3.4+ do)
                except (
                    audioop.error
                ):  # this version of audioop doesn't support 24-bit audio (probably Python 3.3 or less)
                    raw_data = b"".join(
                        raw_data[i + 1 : i + 4]
                        for i in range(0, len(raw_data), 4)
                    )  # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                else:  # 24-bit audio fully supported, we don't need to shim anything
                    # ``raw_data`` was widened to 32-bit above, so the source
                    # width here is 4, not ``self.sample_width``.
                    raw_data = audioop.lin2lin(raw_data, 4, convert_width)
            else:
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, convert_width
                )

        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        if convert_width == 1:
            raw_data = audioop.bias(
                raw_data, 1, 128
            )  # add 128 to every sample to make them act like unsigned samples again
        return raw_data

    def get_wav_data(self, convert_rate=None, convert_width=None, nchannels = 1):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        ``nchannels`` is the channel count written into the WAV header (default 1).

        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(nchannels)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data

    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(
            audioop, "byteswap"
        ):  # ``audioop.byteswap`` was only added in Python 3.4
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:  # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1 :: -1] + b"".join(
                raw_data[i + sample_width : i : -1]
                for i in range(sample_width - 1, len(raw_data), sample_width)
            )

        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:  # note that we can't use context manager, since that was only added in Python 3.4
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data

    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.

        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 3
        ), "Sample width to convert to must be between 1 and 3 inclusive"

        if (
            self.sample_width > 3 and convert_width is None
        ):  # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder
            convert_width = 3  # the largest supported sample width is 24-bit, so we'll limit the sample width to that

        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if (
            os.name == "nt"
        ):  # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            startup_info.dwFlags |= (
                subprocess.STARTF_USESHOWWINDOW
            )  # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.wShowWindow = (
                subprocess.SW_HIDE
            )  # specify that the console window should be hidden
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen(
            [
                flac_converter,
                "--stdout",
                "--totally-silent",  # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
                "--best",  # highest level of compression available
                "-",  # the input FLAC file contents will be given in stdin
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            startupinfo=startup_info,
        )
        flac_data, stderr = process.communicate(wav_data)
        return flac_data
def get_flac_converter():
    """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.

    An installed ``flac`` found via ``shutil_which`` is preferred; otherwise a
    binary bundled next to this module is selected by operating system and
    machine type. The chosen file is marked executable on a best-effort basis.
    """
    converter_path = shutil_which("flac")  # an installed flac takes priority
    if converter_path is None:
        # the bundled FLAC binaries live in the same directory as this module
        module_dir = os.path.dirname(os.path.abspath(__file__))
        system, machine = platform.system(), platform.machine()
        x86_32 = {"i686", "i786", "x86"}
        x86_64 = {"x86_64", "AMD64"}
        bundled_binaries = (
            ("Windows", x86_32 | x86_64, "flac-win32.exe"),
            ("Darwin", x86_32 | x86_64, "flac-mac"),
            ("Linux", x86_32, "flac-linux-x86"),
            ("Linux", x86_64, "flac-linux-x86_64"),
        )
        for target_system, target_machines, binary_name in bundled_binaries:
            if system == target_system and machine in target_machines:
                converter_path = os.path.join(module_dir, binary_name)
                break
        else:  # no bundled binary for this platform
            raise OSError(
                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
            )

    # Best effort: make sure the converter carries the executable bit.
    try:
        # Known issue on docker: running an executable right after chmod() may
        # fail with OSError "Text file busy"; flushing the FS with sync avoids it.
        if not os.access(converter_path, os.X_OK):
            current_mode = os.stat(converter_path).st_mode
            os.chmod(converter_path, current_mode | stat.S_IEXEC)
            if "Linux" in platform.system():
                if sys.version_info >= (3, 3):
                    os.sync()
                else:
                    os.system("sync")
    except OSError:
        pass
    return converter_path
def shutil_which(pgm):
    """Python 2 compatibility: backport of ``shutil.which()`` from Python 3

    Searches each directory listed in the ``PATH`` environment variable for an
    executable regular file named ``pgm``.

    Returns the first matching path, or ``None`` when nothing matches or
    ``PATH`` is not set.
    """
    search_path = os.getenv("PATH")
    if search_path is None:  # PATH unset: nothing to search
        return None
    for directory in search_path.split(os.path.pathsep):
        candidate = os.path.join(directory, pgm)
        # ``isfile`` rejects directories, which also pass the X_OK check.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None

View File

@@ -0,0 +1,22 @@
class SetupError(Exception):
    """Raised when a recognizer backend is not set up correctly (e.g. a missing module or API key)."""
class WaitTimeoutError(Exception):
    """Signals a wait that timed out.

    NOTE(review): the raising site is not visible here; presumably raised
    while waiting for audio -- confirm against the recognizer code.
    """
class RequestError(Exception):
    """Signals that results could not be requested from a speech recognition service."""
class UnknownValueError(Exception):
    """Signals that the recognizer could not make out any speech in the audio."""
class TranscriptionNotReady(Exception):
    """Signals that a transcription is not available yet.

    NOTE(review): the raising site is not visible here -- confirm usage.
    """
class TranscriptionFailed(Exception):
    """Signals that a transcription attempt failed.

    NOTE(review): the raising site is not visible here -- confirm usage.
    """

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,31 @@
Copyright (c) 1999-2015 Carnegie Mellon University. All rights
reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
This work was supported in part by funding from the Defense Advanced
Research Projects Agency and the National Science Foundation of the
United States of America, and the CMU Sphinx Speech Consortium.
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@@ -0,0 +1,34 @@
/* ====================================================================
* Copyright (c) 2015 Alpha Cephei Inc. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC.
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ====================================================================
*
*/
This directory contains a generic US English acoustic model trained with the
latest sphinxtrain.

View File

@@ -0,0 +1,12 @@
-lowerf 130
-upperf 6800
-nfilt 25
-transform dct
-lifter 22
-feat 1s_c_d_dd
-svspec 0-12/13-25/26-38
-agc none
-cmn current
-varnorm no
-model ptm
-cmninit 40,3,-1

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 MiB

View File

@@ -0,0 +1,5 @@
<s> SIL
</s> SIL
<sil> SIL
[NOISE] +NSN+
[SPEECH] +SPN+

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,42 @@
from __future__ import annotations
import os
from io import BytesIO
from custom_speech_recognition.audio import AudioData
from custom_speech_recognition.exceptions import SetupError
def recognize_whisper_api(
    recognizer,
    audio_data: "AudioData",
    *,
    model: str = "whisper-1",
    api_key: str | None = None,
):
    """
    Transcribes ``audio_data`` (an ``AudioData`` instance) with the OpenAI Whisper API and returns the recognized text.

    An OpenAI account is required; sign up at https://platform.openai.com/signup, then generate an API key in `User settings <https://platform.openai.com/account/api-keys>`__.

    The key is taken from ``api_key`` or, when that is ``None``, must be present in the ``OPENAI_API_KEY`` environment variable.

    Detail: https://platform.openai.com/docs/guides/speech-to-text

    Raises ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance.

    Raises a ``speech_recognition.exceptions.SetupError`` exception if there are any issues with the openai installation, or the environment variable is missing.
    """
    if not isinstance(audio_data, AudioData):
        raise ValueError("``audio_data`` must be an ``AudioData`` instance")

    no_key_available = (
        api_key is None and os.environ.get("OPENAI_API_KEY") is None
    )
    if no_key_available:
        raise SetupError("Set environment variable ``OPENAI_API_KEY``")

    # Imported lazily so that a missing package is reported as a SetupError.
    try:
        import openai
    except ImportError:
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        )

    # The in-memory WAV payload is given a file name before it is uploaded.
    wav_buffer = BytesIO(audio_data.get_wav_data())
    wav_buffer.name = "SpeechRecognition_audio.wav"
    response = openai.Audio.transcribe(model, wav_buffer, api_key=api_key)
    return response["text"]

38
test_main.py Normal file
View File

@@ -0,0 +1,38 @@
import time
import threading
import queue
import AudioTranscriber
import AudioRecorder
import audio_utils
# Manual test script: records from the default microphone and the default
# speaker loopback device, transcribes both in background threads and prints
# the microphone transcript. Stop it with Ctrl+C.

# Microphone pipeline: recorder -> queue -> transcriber thread.
mic_audio_queue = queue.Queue()
mic_device = audio_utils.get_default_input_device()
mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
mic_audio_recorder.record_into_queue(mic_audio_queue)
mic_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
mic_transcribe = threading.Thread(target=mic_transcriber.transcribe_audio_queue, args=(mic_audio_queue,))
mic_transcribe.daemon = True
mic_transcribe.start()

time.sleep(2)

# Speaker pipeline: same layout, fed by the loopback device.
spk_audio_queue = queue.Queue()
spk_device = audio_utils.get_default_output_device()
spk_audio_recorder = AudioRecorder.SelectedSpeakerRecorder(spk_device)
spk_audio_recorder.record_into_queue(spk_audio_queue)
# The transcriber must take its audio format from the speaker source; the
# microphone source was passed here before, which was a copy-paste slip.
spk_transcriber = AudioTranscriber.AudioTranscriber(source=spk_audio_recorder.source, language="ja-JP")
spk_transcribe = threading.Thread(target=spk_transcriber.transcribe_audio_queue, args=(spk_audio_queue,))
spk_transcribe.daemon = True
spk_transcribe.start()

try:
    while True:
        text = mic_transcriber.get_transcript()
        if len(text) > 0:
            print("mic:", text)

        # Speaker output stays disabled, as in the original script:
        # text = spk_transcriber.get_transcript()
        # if len(text) > 0:
        #     print("spk:", text)
        time.sleep(0.1)
except KeyboardInterrupt:
    # Ctrl+C is the normal way to end this script; exit without a traceback.
    pass