test code custom speech recognition
This commit is contained in:
49
AudioRecorder.py
Normal file
49
AudioRecorder.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import custom_speech_recognition as sr
|
||||||
|
import pyaudiowpatch as pyaudio
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# Passed as ``phrase_time_limit`` when background listening is started
# (presumably seconds -- confirm against the recognizer implementation).
RECORD_TIMEOUT = 3
# Value assigned to the recognizer's ``energy_threshold``.
ENERGY_THRESHOLD = 1000
# Value assigned to the recognizer's ``dynamic_energy_threshold``.
DYNAMIC_ENERGY_THRESHOLD = False
|
||||||
|
|
||||||
|
class BaseRecorder:
    """Capture audio from a speech-recognition source and queue it for later use.

    Holds an ``sr.Recognizer`` configured from the module-level
    ``ENERGY_THRESHOLD`` and ``DYNAMIC_ENERGY_THRESHOLD`` settings, together
    with the audio source it listens to.
    """

    def __init__(self, source):
        """Store ``source`` and create the recognizer.

        Args:
            source: Audio source handed to the recognizer; must not be ``None``.

        Raises:
            ValueError: If ``source`` is ``None``.
        """
        # Validate first so that a bad argument fails before anything is built.
        if source is None:
            raise ValueError("audio source can't be None")

        self.source = source
        self.recorder = sr.Recognizer()
        self.recorder.energy_threshold = ENERGY_THRESHOLD
        self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD

    def adjust_for_noise(self):
        """Let the recognizer calibrate itself against the source's ambient noise."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)

    def record_into_queue(self, audio_queue):
        """Start background listening and push every captured phrase into ``audio_queue``.

        Each queued item is a ``(raw_bytes, captured_at)`` tuple, where
        ``raw_bytes`` comes from ``audio.get_raw_data()`` and ``captured_at``
        is ``datetime.now()`` at the moment the callback runs.

        Args:
            audio_queue: Object with a ``put`` method (e.g. ``queue.Queue``).

        Returns:
            Whatever ``listen_in_background`` returns -- presumably a callable
            that stops the listener (confirm against the recognizer
            implementation). It used to be discarded, which left callers with
            no way to stop recording.
        """
        def record_callback(_, audio: "sr.AudioData") -> None:
            audio_queue.put((audio.get_raw_data(), datetime.now()))

        return self.recorder.listen_in_background(
            self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT
        )
|
||||||
|
|
||||||
|
class SelectedMicRecorder(BaseRecorder):
    """Recorder bound to a single input device described by a device-info mapping."""

    def __init__(self, device):
        """Build a microphone source for ``device`` and calibrate it for noise.

        Args:
            device: Mapping providing ``"index"`` and ``"defaultSampleRate"``.
        """
        device_index = device['index']
        sample_rate = int(device["defaultSampleRate"])
        microphone = sr.Microphone(
            device_index=device_index,
            sample_rate=sample_rate,
        )
        super().__init__(source=microphone)
        self.adjust_for_noise()
|
||||||
|
|
||||||
|
class SelectedSpeakerRecorder(BaseRecorder):
    """Recorder bound to a speaker (output) device described by a device-info mapping."""

    def __init__(self, device):
        """Build a speaker-capturing source for ``device`` and calibrate it for noise.

        Args:
            device: Mapping providing ``"index"``, ``"defaultSampleRate"`` and
                ``"maxInputChannels"``.
        """
        device_index = device["index"]
        sample_rate = int(device["defaultSampleRate"])
        # NOTE(review): the chunk size is taken from the sample size of the
        # 16-bit format, which looks unusually small for a buffer -- confirm
        # this is intentional.
        chunk_size = pyaudio.get_sample_size(pyaudio.paInt16)
        channels = device["maxInputChannels"]

        speaker_source = sr.Microphone(
            speaker=True,
            device_index=device_index,
            sample_rate=sample_rate,
            chunk_size=chunk_size,
            channels=channels,
        )
        super().__init__(source=speaker_source)
        self.adjust_for_noise()
|
||||||
79
AudioTranscriber.py
Normal file
79
AudioTranscriber.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
|
||||||
|
import threading
|
||||||
|
import custom_speech_recognition as sr
|
||||||
|
from datetime import timedelta
|
||||||
|
from heapq import merge
|
||||||
|
|
||||||
|
# Two chunks further apart than this many seconds belong to different phrases
# (compared as ``timedelta(seconds=PHRASE_TIMEOUT)``).
PHRASE_TIMEOUT = 3.05
# Maximum number of phrases kept in ``AudioTranscriber.transcript_data``.
MAX_PHRASES = 10


class AudioTranscriber:
    """Turn queued raw audio chunks into a rolling list of recognised phrases.

    ``transcript_data`` holds the phrases newest-first: index 0 is the phrase
    currently being extended and the last element is the oldest one.

    NOTE(review): ``transcript_data`` is touched both by the thread running
    ``transcribe_audio_queue`` and by whoever calls ``get_transcript``; no
    locking is done here.
    """

    def __init__(self, source, language):
        """Remember the audio format of ``source`` and the recognition language.

        Args:
            source: Audio source exposing ``SAMPLE_RATE``, ``SAMPLE_WIDTH``
                and ``channels``.
            language: Language value forwarded to ``recognize_google``.
        """
        self.language = language
        self.transcript_data = []
        self.transcript_changed_event = threading.Event()
        self.audio_recognizer = sr.Recognizer()
        self.audio_sources = {
            "sample_rate": source.SAMPLE_RATE,
            "sample_width": source.SAMPLE_WIDTH,
            "channels": source.channels,
            "last_sample": bytes(),
            "last_spoken": None,
            "new_phrase": True,
        }

    def transcribe_audio_queue(self, audio_queue):
        """Consume ``(raw_bytes, time_spoken)`` items forever and transcribe them.

        This loop never returns; it is meant to be the target of a worker
        thread.

        Args:
            audio_queue: Object whose blocking ``get`` yields
                ``(raw_bytes, time_spoken)`` tuples.
        """
        while True:
            audio, time_spoken = audio_queue.get()
            self.update_last_sample_and_phrase_status(audio, time_spoken)

            try:
                audio_data = self.process_data()
                text = self.audio_recognizer.recognize_google(
                    audio_data, language=self.language
                )
            except Exception:
                # Deliberate best effort: a failed recognition must not kill
                # the worker loop; simply wait for the next chunk.
                continue

            # Skip empty (or otherwise falsy) recognition results.
            if text:
                self.update_transcript(text)

    def update_last_sample_and_phrase_status(self, data, time_spoken):
        """Append ``data`` to the current phrase buffer, or start a new phrase.

        A new phrase starts when more than ``PHRASE_TIMEOUT`` seconds passed
        since the previous chunk; the buffer is emptied first in that case.

        Args:
            data: Raw audio bytes of the chunk.
            time_spoken: ``datetime`` at which the chunk was captured.
        """
        source_info = self.audio_sources
        last_spoken = source_info["last_spoken"]
        if last_spoken and time_spoken - last_spoken > timedelta(seconds=PHRASE_TIMEOUT):
            source_info["last_sample"] = bytes()
            source_info["new_phrase"] = True
        else:
            source_info["new_phrase"] = False

        source_info["last_sample"] += data
        source_info["last_spoken"] = time_spoken

    def process_data(self):
        """Wrap the buffered bytes of the current phrase in an ``sr.AudioData``."""
        source_info = self.audio_sources
        return sr.AudioData(
            source_info["last_sample"],
            source_info["sample_rate"],
            source_info["sample_width"],
        )

    def update_transcript(self, text):
        """Record ``text`` as the newest phrase or as an update of the current one.

        A new entry is inserted at the front when a new phrase started (or the
        transcript is empty); otherwise the front entry is overwritten. The
        list never holds more than ``MAX_PHRASES`` entries; the oldest ones
        are dropped. ``transcript_changed_event`` is set after every change;
        nothing in this class clears it.

        Args:
            text: Recognised text for the current phrase.
        """
        transcript = self.transcript_data

        if self.audio_sources["new_phrase"] or not transcript:
            transcript.insert(0, text)
            # Trim after inserting so the cap is exactly MAX_PHRASES.
            del transcript[MAX_PHRASES:]
        else:
            transcript[0] = text

        self.transcript_changed_event.set()

    def get_transcript(self):
        """Remove and return the oldest stored phrase, or ``""`` if there is none."""
        if self.transcript_data:
            return self.transcript_data.pop(-1)
        return ""

    def clear_transcript_data(self):
        """Forget all phrases and the buffered audio; the next chunk starts a new phrase."""
        self.transcript_data.clear()
        self.audio_sources["last_sample"] = bytes()
        self.audio_sources["new_phrase"] = True
|
||||||
49
audio_utils.py
Normal file
49
audio_utils.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
import pyaudiowpatch as pyaudio
|
||||||
|
|
||||||
|
def get_input_device_list():
    """Collect the WASAPI input devices that are not loopback devices.

    Returns:
        A list of device-info mappings whose ``"hostApi"`` is the WASAPI host
        API, whose ``"maxInputChannels"`` is positive and whose
        ``"isLoopbackDevice"`` is ``False``.
    """
    found = []
    with pyaudio.PyAudio() as audio:
        wasapi = audio.get_host_api_info_by_type(pyaudio.paWASAPI)
        for host in range(audio.get_host_api_count()):
            device_count = audio.get_host_api_info_by_index(host)['deviceCount']
            for position in range(device_count):
                info = audio.get_device_info_by_host_api_device_index(host, position)
                if info["hostApi"] != wasapi["index"]:
                    continue
                if not info["maxInputChannels"] > 0:
                    continue
                if info["isLoopbackDevice"] is not False:
                    continue
                found.append(info)
    return found
|
||||||
|
|
||||||
|
def get_output_device_list():
    """Collect the WASAPI loopback devices.

    Returns:
        A list of device-info mappings yielded by the loopback generator whose
        ``"hostApi"`` is the WASAPI host API and whose ``"isLoopbackDevice"``
        is ``True``.
    """
    with pyaudio.PyAudio() as audio:
        wasapi = audio.get_host_api_info_by_type(pyaudio.paWASAPI)
        return [
            info
            for info in audio.get_loopback_device_info_generator()
            if info["hostApi"] == wasapi["index"] and info["isLoopbackDevice"] is True
        ]
|
||||||
|
|
||||||
|
def get_default_input_device():
    """Return the device-info mapping of the default WASAPI input device.

    The WASAPI host API reports the index of its default input device; every
    device of every host API is scanned for that index.

    Returns:
        The matching device-info mapping, or ``None`` when no enumerated
        device has that index. The miss used to end in either an unbound
        local or an implicit ``None``.
    """
    with pyaudio.PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_index = wasapi_info["defaultInputDevice"]

        for host_index in range(p.get_host_api_count()):
            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["index"] == default_index:
                    return device
    return None
|
||||||
|
|
||||||
|
def get_default_output_device():
    """Return the loopback device that mirrors the default WASAPI output device.

    The default output device is looked up by the index the WASAPI host API
    reports. If that device is not itself a loopback device, the first
    loopback device whose name contains the default device's name is used
    instead.

    Returns:
        A device-info mapping, or ``None`` when the default output device
        cannot be found or has no matching loopback device. Both misses used
        to leave the result undefined, as did a default device that is
        already a loopback device; that device is now returned as is.
    """
    with pyaudio.PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
        default_index = wasapi_info["defaultOutputDevice"]

        all_devices = (
            p.get_device_info_by_host_api_device_index(host_index, device_index)
            for host_index in range(p.get_host_api_count())
            for device_index in range(p.get_host_api_info_by_index(host_index)['deviceCount'])
        )
        default_speakers = next(
            (device for device in all_devices if device["index"] == default_index),
            None,
        )
        if default_speakers is None:
            return None

        # Already a loopback device: nothing to translate.
        if default_speakers["isLoopbackDevice"]:
            return default_speakers

        for loopback in p.get_loopback_device_info_generator():
            if default_speakers["name"] in loopback["name"]:
                return loopback
    return None
|
||||||
1596
custom_speech_recognition/__init__.py
Normal file
1596
custom_speech_recognition/__init__.py
Normal file
File diff suppressed because it is too large
Load Diff
24
custom_speech_recognition/__main__.py
Normal file
24
custom_speech_recognition/__main__.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import custom_speech_recognition as sr
|
||||||
|
|
||||||
|
# Interactive demo: calibrate on the default microphone, then keep listening
# and printing what Google Speech Recognition makes of each utterance until
# the user presses Ctrl+C.
r = sr.Recognizer()
m = sr.Microphone()

try:
    print("A moment of silence, please...")
    with m as source:
        r.adjust_for_ambient_noise(source)
    print("Set minimum energy threshold to {}".format(r.energy_threshold))

    while True:
        print("Say something!")
        with m as source:
            audio = r.listen(source)
        print("Got it! Now to recognize it...")

        try:
            # Ask Google Speech Recognition for a transcription.
            value = r.recognize_google(audio)
        except sr.UnknownValueError:
            print("Oops! Didn't catch that")
        except sr.RequestError as e:
            print("Uh oh! Couldn't request results from Google Speech Recognition service; {0}".format(e))
        else:
            print("You said {}".format(value))
except KeyboardInterrupt:
    # Ctrl+C is the normal way to leave the demo.
    pass
|
||||||
317
custom_speech_recognition/audio.py
Normal file
317
custom_speech_recognition/audio.py
Normal file
@@ -0,0 +1,317 @@
|
|||||||
|
import aifc
|
||||||
|
import audioop
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import stat
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import wave
|
||||||
|
|
||||||
|
|
||||||
|
class AudioData(object):
    """
    Creates a new ``AudioData`` instance, which represents mono audio data.

    The raw audio data is specified by ``frame_data``, which is a sequence of bytes representing audio samples. This is the frame data structure used by the PCM WAV format.

    The width of each sample, in bytes, is specified by ``sample_width``. Each group of ``sample_width`` bytes represents a single audio sample.

    The audio data is assumed to have a sample rate of ``sample_rate`` samples per second (Hertz).

    Usually, instances of this class are obtained from ``recognizer_instance.record`` or ``recognizer_instance.listen``, or in the callback for ``recognizer_instance.listen_in_background``, rather than instantiating them directly.
    """

    def __init__(self, frame_data, sample_rate, sample_width):
        assert sample_rate > 0, "Sample rate must be a positive integer"
        assert (
            sample_width % 1 == 0 and 1 <= sample_width <= 4
        ), "Sample width must be between 1 and 4 inclusive"
        self.frame_data = frame_data
        self.sample_rate = sample_rate
        self.sample_width = int(sample_width)

    def _ms_to_byte_offset(self, position_ms):
        """Convert a position in milliseconds to a byte offset on a sample boundary."""
        # Round down to whole samples *before* scaling by the sample width;
        # scaling first can produce an offset in the middle of a sample.
        whole_samples = int(position_ms * self.sample_rate // 1000)
        return whole_samples * self.sample_width

    def get_segment(self, start_ms=None, end_ms=None):
        """
        Returns a new ``AudioData`` instance, trimmed to a given time interval. In other words, an ``AudioData`` instance with the same audio data except starting at ``start_ms`` milliseconds in and ending ``end_ms`` milliseconds in.

        If not specified, ``start_ms`` defaults to the beginning of the audio, and ``end_ms`` defaults to the end.

        Both positions are rounded down to a whole sample, so the returned data always starts and ends on a sample boundary.
        """
        assert (
            start_ms is None or start_ms >= 0
        ), "``start_ms`` must be a non-negative number"
        assert end_ms is None or end_ms >= (
            0 if start_ms is None else start_ms
        ), "``end_ms`` must be a non-negative number greater or equal to ``start_ms``"
        if start_ms is None:
            start_byte = 0
        else:
            start_byte = self._ms_to_byte_offset(start_ms)
        if end_ms is None:
            end_byte = len(self.frame_data)
        else:
            end_byte = self._ms_to_byte_offset(end_ms)
        return AudioData(
            self.frame_data[start_byte:end_byte],
            self.sample_rate,
            self.sample_width,
        )

    def get_raw_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the raw frame data for the audio represented by the ``AudioData`` instance.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `RAW/PCM audio file <https://en.wikipedia.org/wiki/Raw_audio_format>`__.
        """
        assert (
            convert_rate is None or convert_rate > 0
        ), "Sample rate to convert to must be a positive integer"
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 4
        ), "Sample width to convert to must be between 1 and 4 inclusive"

        raw_data = self.frame_data

        # make sure unsigned 8-bit audio (which uses unsigned samples) is handled like higher sample width audio (which uses signed samples)
        # NOTE(review): when the source is 8-bit and ``convert_width`` is None the bias is never undone below -- confirm whether that is intended.
        if self.sample_width == 1:
            # subtract 128 from every sample to make them act like signed samples
            raw_data = audioop.bias(raw_data, 1, -128)

        # resample audio at the desired rate if specified
        if convert_rate is not None and self.sample_rate != convert_rate:
            raw_data, _ = audioop.ratecv(
                raw_data,
                self.sample_width,
                1,
                self.sample_rate,
                convert_rate,
                None,
            )

        # convert samples to desired sample width if specified
        if convert_width is not None and self.sample_width != convert_width:
            if convert_width == 3:
                # we're converting the audio into 24-bit (workaround for https://bugs.python.org/issue12866)
                # convert audio into 32-bit first, which is always supported
                raw_data = audioop.lin2lin(raw_data, self.sample_width, 4)
                try:
                    # test whether 24-bit audio is supported by this ``audioop``
                    audioop.bias(b"", 3, 0)
                except audioop.error:
                    # this version of audioop doesn't support 24-bit audio:
                    # since we're in little endian, we discard the first byte from each 32-bit sample to get a 24-bit sample
                    raw_data = b"".join(
                        raw_data[i + 1 : i + 4]
                        for i in range(0, len(raw_data), 4)
                    )
                else:
                    # 24-bit audio fully supported. The data is 32-bit at this
                    # point, so 4 (not the original sample width) is the
                    # source width of this conversion.
                    raw_data = audioop.lin2lin(raw_data, 4, convert_width)
            else:
                raw_data = audioop.lin2lin(
                    raw_data, self.sample_width, convert_width
                )

        # if the output is 8-bit audio with unsigned samples, convert the samples we've been treating as signed to unsigned again
        if convert_width == 1:
            # add 128 to every sample to make them act like unsigned samples again
            raw_data = audioop.bias(raw_data, 1, 128)

        return raw_data

    def get_wav_data(self, convert_rate=None, convert_width=None, nchannels = 1):
        """
        Returns a byte string representing the contents of a WAV file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        ``nchannels`` is the channel count written to the WAV header; the frame data itself is written unchanged.

        Writing these bytes directly to a file results in a valid `WAV file <https://en.wikipedia.org/wiki/WAV>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # generate the WAV file contents
        with io.BytesIO() as wav_file:
            wav_writer = wave.open(wav_file, "wb")
            try:
                wav_writer.setframerate(sample_rate)
                wav_writer.setsampwidth(sample_width)
                wav_writer.setnchannels(nchannels)
                wav_writer.writeframes(raw_data)
                wav_data = wav_file.getvalue()
            finally:  # make sure resources are cleaned up
                wav_writer.close()
        return wav_data

    def get_aiff_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of an AIFF-C file containing the audio represented by the ``AudioData`` instance.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        Writing these bytes directly to a file results in a valid `AIFF-C file <https://en.wikipedia.org/wiki/Audio_Interchange_File_Format>`__.
        """
        raw_data = self.get_raw_data(convert_rate, convert_width)
        sample_rate = (
            self.sample_rate if convert_rate is None else convert_rate
        )
        sample_width = (
            self.sample_width if convert_width is None else convert_width
        )

        # the AIFF format is big-endian, so we need to convert the little-endian raw data to big-endian
        if hasattr(audioop, "byteswap"):
            raw_data = audioop.byteswap(raw_data, sample_width)
        else:
            # manually reverse the bytes of each sample, which is slower but works well enough as a fallback
            raw_data = raw_data[sample_width - 1 :: -1] + b"".join(
                raw_data[i + sample_width : i : -1]
                for i in range(sample_width - 1, len(raw_data), sample_width)
            )

        # generate the AIFF-C file contents
        with io.BytesIO() as aiff_file:
            aiff_writer = aifc.open(aiff_file, "wb")
            try:
                aiff_writer.setframerate(sample_rate)
                aiff_writer.setsampwidth(sample_width)
                aiff_writer.setnchannels(1)
                aiff_writer.writeframes(raw_data)
                # read the buffer before the writer is closed
                aiff_data = aiff_file.getvalue()
            finally:  # make sure resources are cleaned up
                aiff_writer.close()
        return aiff_data

    def get_flac_data(self, convert_rate=None, convert_width=None):
        """
        Returns a byte string representing the contents of a FLAC file containing the audio represented by the ``AudioData`` instance.

        Note that 32-bit FLAC is not supported. If the audio data is 32-bit and ``convert_width`` is not specified, then the resulting FLAC will be a 24-bit FLAC.

        If ``convert_rate`` is specified and the audio sample rate is not ``convert_rate`` Hz, the resulting audio is resampled to match.

        If ``convert_width`` is specified and the audio samples are not ``convert_width`` bytes each, the resulting audio is converted to match.

        Writing these bytes directly to a file results in a valid `FLAC file <https://en.wikipedia.org/wiki/FLAC>`__.
        """
        assert convert_width is None or (
            convert_width % 1 == 0 and 1 <= convert_width <= 3
        ), "Sample width to convert to must be between 1 and 3 inclusive"

        if self.sample_width > 3 and convert_width is None:
            # resulting WAV data would be 32-bit, which is not convertable to FLAC using our encoder;
            # the largest supported sample width is 24-bit, so we'll limit the sample width to that
            convert_width = 3

        # run the FLAC converter with the WAV data to get the FLAC data
        wav_data = self.get_wav_data(convert_rate, convert_width)
        flac_converter = get_flac_converter()
        if os.name == "nt":
            # on Windows, specify that the process is to be started without showing a console window
            startup_info = subprocess.STARTUPINFO()
            # specify that the wShowWindow field of `startup_info` contains a value
            startup_info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
            # specify that the console window should be hidden
            startup_info.wShowWindow = subprocess.SW_HIDE
        else:
            startup_info = None  # default startupinfo
        process = subprocess.Popen(
            [
                flac_converter,
                # put the resulting FLAC file in stdout, and make sure it's not mixed with any program output
                "--stdout",
                "--totally-silent",
                "--best",  # highest level of compression available
                "-",  # the input WAV file contents will be given in stdin
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            startupinfo=startup_info,
        )
        flac_data, stderr = process.communicate(wav_data)
        return flac_data
|
||||||
|
|
||||||
|
|
||||||
|
def get_flac_converter():
    """Return the path of a FLAC converter executable.

    A ``flac`` program found on ``PATH`` wins. Otherwise the binary bundled
    next to this module is selected from the operating system and machine
    type, and an attempt is made to mark it executable.

    Raises:
        OSError: If no ``flac`` is on ``PATH`` and no bundled binary fits the
            current platform.
    """
    flac_converter = shutil_which("flac")
    if flac_converter is None:
        # No installed flac: fall back to the binaries stored beside this file.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        system = platform.system()
        machine = platform.machine()
        x86_32 = {"i686", "i786", "x86"}
        x86_64 = {"x86_64", "AMD64"}

        if system == "Windows" and machine in x86_32 | x86_64:
            bundled_name = "flac-win32.exe"
        elif system == "Darwin" and machine in x86_32 | x86_64:
            bundled_name = "flac-mac"
        elif system == "Linux" and machine in x86_32:
            bundled_name = "flac-linux-x86"
        elif system == "Linux" and machine in x86_64:
            bundled_name = "flac-linux-x86_64"
        else:
            raise OSError(
                "FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent"
            )
        flac_converter = os.path.join(module_dir, bundled_name)

    # Best effort: make the converter executable. Running a binary right after
    # chmod() can fail with "Text file busy" on docker, hence the sync.
    try:
        if not os.access(flac_converter, os.X_OK):
            current_mode = os.stat(flac_converter).st_mode
            os.chmod(flac_converter, current_mode | stat.S_IEXEC)
            if "Linux" in platform.system():
                if sys.version_info >= (3, 3):
                    os.sync()
                else:
                    os.system("sync")
    except OSError:
        pass

    return flac_converter
|
||||||
|
|
||||||
|
|
||||||
|
def shutil_which(pgm):
    """Locate the program ``pgm`` in the directories listed in ``PATH``.

    Args:
        pgm: File name of the program to look for.

    Returns:
        The path of the first executable regular file named ``pgm`` found in
        a ``PATH`` directory, or ``None`` when there is none or ``PATH`` is
        not set.
    """
    path = os.getenv("PATH")
    if path is None:
        # Nothing to search; previously this crashed on ``None.split``.
        return None
    for directory in path.split(os.path.pathsep):
        candidate = os.path.join(directory, pgm)
        # ``isfile`` keeps directories (which also pass the X_OK check) out.
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
|
||||||
22
custom_speech_recognition/exceptions.py
Normal file
22
custom_speech_recognition/exceptions.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
class SetupError(Exception):
    """Raised by ``recognize_whisper_api`` when ``OPENAI_API_KEY`` is not set or ``openai`` cannot be imported."""
|
||||||
|
|
||||||
|
|
||||||
|
class WaitTimeoutError(Exception):
    """Presumably signals that waiting for audio timed out (raise sites are not in this module -- confirm)."""
|
||||||
|
|
||||||
|
|
||||||
|
class RequestError(Exception):
    """Caught by the package's demo script when results cannot be requested from the recognition service."""
|
||||||
|
|
||||||
|
|
||||||
|
class UnknownValueError(Exception):
    """Caught by the package's demo script when the recognizer could not make out the speech."""
|
||||||
|
|
||||||
|
|
||||||
|
class TranscriptionNotReady(Exception):
    """Presumably signals that a requested transcription is not finished yet (raise sites are not in this module -- confirm)."""
|
||||||
|
|
||||||
|
|
||||||
|
class TranscriptionFailed(Exception):
    """Presumably signals that a requested transcription failed (raise sites are not in this module -- confirm)."""
|
||||||
BIN
custom_speech_recognition/flac-linux-x86
Normal file
BIN
custom_speech_recognition/flac-linux-x86
Normal file
Binary file not shown.
BIN
custom_speech_recognition/flac-linux-x86_64
Normal file
BIN
custom_speech_recognition/flac-linux-x86_64
Normal file
Binary file not shown.
BIN
custom_speech_recognition/flac-mac
Normal file
BIN
custom_speech_recognition/flac-mac
Normal file
Binary file not shown.
BIN
custom_speech_recognition/flac-win32.exe
Normal file
BIN
custom_speech_recognition/flac-win32.exe
Normal file
Binary file not shown.
@@ -0,0 +1,31 @@
|
|||||||
|
Copyright (c) 1999-2015 Carnegie Mellon University. All rights
|
||||||
|
reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions
|
||||||
|
are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
|
||||||
|
This work was supported in part by funding from the Defense Advanced
|
||||||
|
Research Projects Agency and the National Science Foundation of the
|
||||||
|
United States of America, and the CMU Sphinx Speech Consortium.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
|
||||||
|
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
|
||||||
|
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
@@ -0,0 +1,34 @@
|
|||||||
|
/* ====================================================================
|
||||||
|
* Copyright (c) 2015 Alpha Cephei Inc. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY ALPHA CEPHEI INC. ``AS IS'' AND.
|
||||||
|
* ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,.
|
||||||
|
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ALPHA CEPHEI INC.
|
||||||
|
* NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT.
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,.
|
||||||
|
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY.
|
||||||
|
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT.
|
||||||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE.
|
||||||
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
This directory contains generic US english acoustic model trained with
|
||||||
|
latest sphinxtrain.
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
-lowerf 130
|
||||||
|
-upperf 6800
|
||||||
|
-nfilt 25
|
||||||
|
-transform dct
|
||||||
|
-lifter 22
|
||||||
|
-feat 1s_c_d_dd
|
||||||
|
-svspec 0-12/13-25/26-38
|
||||||
|
-agc none
|
||||||
|
-cmn current
|
||||||
|
-varnorm no
|
||||||
|
-model ptm
|
||||||
|
-cmninit 40,3,-1
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 2.8 MiB |
Binary file not shown.
@@ -0,0 +1,5 @@
|
|||||||
|
<s> SIL
|
||||||
|
</s> SIL
|
||||||
|
<sil> SIL
|
||||||
|
[NOISE] +NSN+
|
||||||
|
[SPEECH] +SPN+
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
0
custom_speech_recognition/recognizers/__init__.py
Normal file
0
custom_speech_recognition/recognizers/__init__.py
Normal file
42
custom_speech_recognition/recognizers/whisper.py
Normal file
42
custom_speech_recognition/recognizers/whisper.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
|
from custom_speech_recognition.audio import AudioData
|
||||||
|
from custom_speech_recognition.exceptions import SetupError
|
||||||
|
|
||||||
|
|
||||||
|
def recognize_whisper_api(
    recognizer,
    audio_data: "AudioData",
    *,
    model: str = "whisper-1",
    api_key: str | None = None,
):
    """
    Transcribes ``audio_data`` (an ``AudioData`` instance) with the OpenAI Whisper API and returns the recognised text.

    An OpenAI account is required; sign up at https://platform.openai.com/signup and create an API key in `User settings <https://platform.openai.com/account/api-keys>`__. The key is taken from ``api_key`` or, when that is ``None``, must be present in the ``OPENAI_API_KEY`` environment variable.

    Detail: https://platform.openai.com/docs/guides/speech-to-text

    ``recognizer`` is accepted but not used by this function.

    Raises ``ValueError`` if ``audio_data`` is not an ``AudioData`` instance, and ``SetupError`` if no API key is available or the ``openai`` module cannot be imported.
    """
    if not isinstance(audio_data, AudioData):
        raise ValueError("``audio_data`` must be an ``AudioData`` instance")

    key_available = (
        api_key is not None or os.environ.get("OPENAI_API_KEY") is not None
    )
    if not key_available:
        raise SetupError("Set environment variable ``OPENAI_API_KEY``")

    try:
        import openai
    except ImportError:
        raise SetupError(
            "missing openai module: ensure that openai is set up correctly."
        )

    # The buffer gets a ``name`` so that it looks like a WAV file upload.
    upload = BytesIO(audio_data.get_wav_data())
    upload.name = "SpeechRecognition_audio.wav"

    response = openai.Audio.transcribe(model, upload, api_key=api_key)
    return response["text"]
|
||||||
38
test_main.py
Normal file
38
test_main.py
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
import time
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import AudioTranscriber
|
||||||
|
import AudioRecorder
|
||||||
|
import audio_utils
|
||||||
|
|
||||||
|
# Manual smoke test: record the default microphone and the default speaker
# output into separate queues, transcribe each on its own daemon thread and
# print what the microphone transcriber produces.

# --- microphone pipeline ---
mic_audio_queue = queue.Queue()

mic_device = audio_utils.get_default_input_device()
mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
mic_audio_recorder.record_into_queue(mic_audio_queue)

mic_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
mic_transcribe = threading.Thread(
    target=mic_transcriber.transcribe_audio_queue,
    args=(mic_audio_queue,),
    daemon=True,
)
mic_transcribe.start()

time.sleep(2)

# --- speaker (loopback) pipeline ---
spk_audio_queue = queue.Queue()
spk_device = audio_utils.get_default_output_device()
spk_audio_recorder = AudioRecorder.SelectedSpeakerRecorder(spk_device)
spk_audio_recorder.record_into_queue(spk_audio_queue)

# The transcriber reads sample rate, sample width and channels from its source,
# so it must be given the speaker source (it used to get the microphone one).
spk_transcriber = AudioTranscriber.AudioTranscriber(source=spk_audio_recorder.source, language="ja-JP")
spk_transcribe = threading.Thread(
    target=spk_transcriber.transcribe_audio_queue,
    args=(spk_audio_queue,),
    daemon=True,
)
spk_transcribe.start()

while True:
    text = mic_transcriber.get_transcript()
    if len(text) > 0:
        print("mic:", text)
    # Speaker output is currently disabled:
    # text = spk_transcriber.get_transcript()
    # if len(text) > 0:
    #     print("spk:", text)
    time.sleep(0.1)
|
||||||
Reference in New Issue
Block a user