From 92e6ed92059b6dc48944fe1ee6cc9b264d57c655 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Tue, 20 Jun 2023 09:01:31 +0900 Subject: [PATCH] update spk process --- VRCT.py | 50 ++++++++------ transcription.py | 171 +++++++++++++++++++++++++++-------------------- window_config.py | 2 +- 3 files changed, 131 insertions(+), 92 deletions(-) diff --git a/VRCT.py b/VRCT.py index b6918330..aa591c87 100644 --- a/VRCT.py +++ b/VRCT.py @@ -1,6 +1,7 @@ import os import json -import threading +import queue +import time import customtkinter from PIL import Image @@ -41,7 +42,7 @@ class App(customtkinter.CTk): self.CHOICE_SPEAKER_DEVICE = None self.INPUT_SPEAKER_VOICE_LANGUAGE = "en-US" self.INPUT_SPEAKER_SAMPLING_RATE = 16000 - self.INPUT_SPEAKER_INTERVAL = 3 + self.INPUT_SPEAKER_INTERVAL = 4 self.INPUT_SPEAKER_BUFFER_SIZE = 4096 ## Parameter @@ -302,7 +303,9 @@ class App(customtkinter.CTk): utils.print_textbox(self.textbox_message_system_log, "Auth Key or language setting is incorrect", "ERROR") ## set transcription instance - self.vr = transcription.VoiceRecognizer() + self.mic_queue = queue.Queue() + self.spk_queue = queue.Queue() + self.vr = transcription.VoiceRecognizer(self.mic_queue, self.spk_queue) self.CHOICE_MIC_DEVICE = self.CHOICE_MIC_DEVICE if self.CHOICE_MIC_DEVICE is not None else self.vr.search_default_device()[0] self.CHOICE_SPEAKER_DEVICE = self.CHOICE_SPEAKER_DEVICE if self.CHOICE_SPEAKER_DEVICE is not None else self.vr.search_default_device()[1] @@ -383,6 +386,7 @@ class App(customtkinter.CTk): device_name=self.CHOICE_MIC_DEVICE, threshold=int(self.INPUT_MIC_THRESHOLD), is_dynamic=self.INPUT_MIC_IS_DYNAMIC, + language=self.INPUT_MIC_VOICE_LANGUAGE, ) self.vr.init_mic() self.th_vr_listen_mic = utils.thread_fnc(self.vr_listen_mic) @@ -404,21 +408,18 @@ class App(customtkinter.CTk): if self.ENABLE_TRANSCRIPTION_RECEIVE is True: utils.print_textbox(self.textbox_message_log, "Start speaker2log", "INFO") utils.print_textbox(self.textbox_message_system_log, "Start speaker2log", "INFO") - # start threading + self.vr.set_spk( device_name=self.CHOICE_SPEAKER_DEVICE, - sample_rate=int(self.INPUT_SPEAKER_SAMPLING_RATE), interval=int(self.INPUT_SPEAKER_INTERVAL), - buffer_size=int(self.INPUT_SPEAKER_BUFFER_SIZE), + language=self.INPUT_SPEAKER_VOICE_LANGUAGE, ) - self.vr.init_spk() - self.th_vr_listen_spk = utils.thread_fnc(self.vr_listen_spk) + self.vr.start_spk_recording() self.th_vr_recognize_spk = utils.thread_fnc(self.vr_recognize_spk) - self.th_vr_listen_spk.start() self.th_vr_recognize_spk.start() else: - if isinstance(self.th_vr_listen_spk, utils.thread_fnc): - self.th_vr_listen_spk.stop() + if self.vr.spk_stream is not None: + self.vr.close_spk_stream() if isinstance(self.th_vr_recognize_spk, utils.thread_fnc): self.th_vr_recognize_spk.stop() @@ -430,7 +431,7 @@ class App(customtkinter.CTk): self.vr.listen_mic() def vr_recognize_mic(self): - message = self.vr.recognize_mic(language=self.INPUT_MIC_VOICE_LANGUAGE) + message = self.vr.recognize_mic() if len(message) > 0: # translate if self.checkbox_translation.get() is False: @@ -457,7 +458,7 @@ class App(customtkinter.CTk): self.vr.listen_spk() def vr_recognize_spk(self): - message = self.vr.recognize_spk(language=self.INPUT_SPEAKER_VOICE_LANGUAGE) + message = self.vr.recognize_spk() if len(message) > 0: # translate if self.checkbox_translation.get() is False: @@ -540,12 +541,23 @@ class App(customtkinter.CTk): self.attributes("-topmost", True) def delete_window(self): - thread_list = threading.enumerate() - thread_list.remove(threading.main_thread()) - for thread in thread_list: - thread.stop() + if isinstance(self.th_vr_listen_mic, utils.thread_fnc): + while not self.th_vr_listen_mic.stopped(): + self.th_vr_listen_mic.stop() + if isinstance(self.th_vr_recognize_mic, utils.thread_fnc): + while not self.th_vr_recognize_mic.stopped(): + self.th_vr_recognize_mic.stop() + if self.vr.spk_stream is not None: + self.vr.close_spk_stream() + if isinstance(self.th_vr_recognize_spk, utils.thread_fnc): + while not self.th_vr_recognize_spk.stopped(): + self.th_vr_recognize_spk.stop() self.destroy() + if __name__ == "__main__": - app = App() - app.mainloop() \ No newline at end of file + try: + app = App() + app.mainloop() + except Exception as e: + print(e) \ No newline at end of file diff --git a/transcription.py b/transcription.py index 07c8f9ce..991126a5 100644 --- a/transcription.py +++ b/transcription.py @@ -5,11 +5,14 @@ import soundcard as sc import soundfile as sf import sounddevice as sd import speech_recognition as sr +import pyaudiowpatch as pyaudio # VoiceRecognizer class VoiceRecognizer(): - def __init__(self): + def __init__(self, mic_queue, spk_queue): self.r = sr.Recognizer() + self.p = pyaudio.PyAudio() + self.languages = [ "ja-JP","en-US","en-GB","af-ZA","ar-DZ","ar-BH","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA", "ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-AE","eu-ES","bg-BG","ca-ES","cmn-Hans-CN","cmn-Hans-HK", @@ -23,47 +26,55 @@ class VoiceRecognizer(): self.mic_device_name = None self.mic_threshold = 50 self.mic_is_dynamic = False - self.mic_queue = queue.Queue() + self.mic_language = "ja-JP" + self.mic_queue = mic_queue - self.spk_device_name = None - self.spk_sample_rate = 16000 + self.spk_device = None self.spk_interval = 3 - self.spk_buffer_size = 4096 - self.spk_audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) - self.n = 0 - self.spk_queue = queue.Queue() + self.spk_language = "ja-JP" + self.spk_stream = None + self.spk_queue = spk_queue def search_input_device(self): + devices = [] device_list = sd.query_devices() - input_device_list = [] - for device in device_list: if device["max_input_channels"] > 0: - input_device_list.append({"name": device["name"], "index": device["index"]}) - - return input_device_list + devices.append(device) + return devices def search_output_device(self): - device_list = sc.all_speakers() - output_device_list = [] - - for device in device_list: - output_device_list.append(str(device.name)) - - return output_device_list + devices =[] + with pyaudio.PyAudio() as p: + wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) + for host_index in range(0, p.get_host_api_count()): + for device_index in range(0, p. get_host_api_info_by_index(host_index)['deviceCount']): + device = p.get_device_info_by_host_api_device_index(host_index, device_index) + if device["hostApi"] == wasapi_info["index"] and device["isLoopbackDevice"] is True: + devices.append(device) + return devices def search_default_device(self): device_list = sd.query_devices() mic_index = sd.default.device[0] name_mic = device_list[mic_index]["name"] - name_spk = str(sc.default_speaker().name) + with pyaudio.PyAudio() as p: + wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI) + default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"]) + + if not default_speakers["isLoopbackDevice"]: + for loopback in p.get_loopback_device_info_generator(): + if default_speakers["name"] in loopback["name"]: + name_spk = loopback["name"] + break return name_mic, name_spk - def set_mic(self, device_name, threshold=50, is_dynamic=False): + def set_mic(self, device_name, threshold=50, is_dynamic=False, language="ja-JP"): input_device_list = self.search_input_device() self.mic_device_name = [device["index"] for device in input_device_list if device["name"] == device_name][0] self.mic_threshold = threshold self.mic_is_dynamic = is_dynamic + self.mic_language = language def init_mic(self): self.r.energy_threshold = self.mic_threshold @@ -76,79 +87,95 @@ class VoiceRecognizer(): audio = self.r.listen(source) self.mic_queue.put(audio) - def recognize_mic(self, language): + def recognize_mic(self): try: audio = self.mic_queue.get() - text = self.r.recognize_google(audio, language=language) + text = self.r.recognize_google(audio, language=self.mic_language) except: text = "" return text - def set_spk(self, device_name=str(sc.default_speaker().name), sample_rate=16000, interval=3, buffer_size=4096): - self.spk_device_name = device_name - self.spk_sample_rate = sample_rate + def set_spk(self, device_name, interval, language): + output_device_list = self.search_output_device() + self.spk_device = [device for device in output_device_list if device["name"] == device_name][0] self.spk_interval = interval - self.spk_buffer_size = buffer_size + self.spk_language = language - def init_spk(self): - self.spk_audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) - self.n = 0 + def spk_record_callback(self, in_data, frame_count, time_info, status): + self.spk_queue.put(in_data) + return (in_data, pyaudio.paContinue) - def listen_spk(self): - audio = self.spk_audio - n = self.n - with sc.get_microphone(id=self.spk_device_name, include_loopback=True).recorder(samplerate=self.spk_sample_rate, channels=1) as source: - while n < self.spk_sample_rate * self.spk_interval: - data = source.record(self.spk_buffer_size) - audio[n:n+len(data)] = data.reshape(-1) - n += len(data) - m = n * 4 // 5 - vol = np.convolve(audio[m:n] ** 2, np.ones(100) / 100, 'same') - m += vol.argmin() - audio_prev = audio.copy() - self.spk_queue.put(audio[:m]) - audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) - audio[:n-m] = audio_prev[m:n] - n = n-m - self.spk_audio = audio - self.n = n + def start_spk_recording(self): + self.close_spk_stream() + self.spk_stream = self.p.open(format=pyaudio.paInt16, + channels=self.spk_device["maxInputChannels"], + rate=int(self.spk_device["defaultSampleRate"]), + frames_per_buffer=int(self.spk_device["defaultSampleRate"])*self.spk_interval, + input=True, + input_device_index=self.spk_device["index"], + stream_callback=self.spk_record_callback + ) - def recognize_spk(self, language): + def stop_spk_stream(self): + self.spk_stream.stop_stream() + + def start_spk_stream(self): + self.spk_stream.start_stream() + + def close_spk_stream(self): + if self.spk_stream is not None: + self.spk_stream.stop_stream() + self.spk_stream.close() + self.spk_stream = None + + def recognize_spk(self): try: - audio = self.spk_queue.get() - with io.BytesIO() as memory_file: - sf.write(file=memory_file, data=audio, format="WAV", samplerate=self.spk_sample_rate) - memory_file.seek(0) - with sr.AudioFile(memory_file) as source: - audio = self.r.record(source) - text = self.r.recognize_google(audio, language=language) + in_data = self.spk_queue.get() + audio_data = sr.AudioData(in_data, int(self.spk_device["defaultSampleRate"]), self.spk_interval) + text = self.r.recognize_google(audio_data, language=self.spk_language) except: text = "" return text if __name__ == "__main__": - import time import threading - vr = VoiceRecognizer() - mic_name, spk_name = vr.search_default_device() - vr.spk_enable_recognize = True - vr.set_spk(language="ja-JP") - vr.init_spk() + mic_queue = queue.Queue() + spk_queue = queue.Queue() + vr = VoiceRecognizer(mic_queue, spk_queue) - def vr_listen_spk(): + mic_name, spk_name = vr.search_default_device() + print("mic_name", mic_name) + print("spk_name", spk_name) + + ############################################################### + vr.set_mic(device_name=mic_name, threshold=300, is_dynamic=False, language="ja-JP") + vr.init_mic() + + def vr_listen_mic(): while True: - vr.listen_spk() + vr.listen_mic() + + def vr_recognize_mic(): + while True: + text = vr.recognize_mic() + if len(text) > 0: + print(text) + th_vr_listen_mic = threading.Thread(target=vr_listen_mic) + th_vr_listen_mic.start() + th_vr_recognize_mic = threading.Thread(target=vr_recognize_mic) + th_vr_recognize_mic.start() + ############################################################### + + ############################################################### + vr.set_spk(device_name=spk_name, interval=4, language="ja-JP") + vr.start_spk_recording() def vr_recognize_spk(): while True: text = vr.recognize_spk() - print(text) - - th_vr_listen_spk = threading.Thread(target=vr_listen_spk) + if len(text) > 0: + print(text) th_vr_recognize_spk = threading.Thread(target=vr_recognize_spk) - th_vr_listen_spk.start() th_vr_recognize_spk.start() - - while True: - time.sleep(60) \ No newline at end of file + ############################################################### \ No newline at end of file diff --git a/window_config.py b/window_config.py index e2c2ebbb..5efa71fb 100644 --- a/window_config.py +++ b/window_config.py @@ -273,7 +273,7 @@ class ToplevelWindowConfig(customtkinter.CTkToplevel): self.label_input_speaker_device.grid(row=4, column=0, columnspan=1, padx=5, pady=5, sticky="nsw") self.optionmenu_input_speaker_device = customtkinter.CTkOptionMenu( self.tabview_config.tab("Transcription"), - values=self.parent.vr.search_output_device(), + values=[device["name"] for device in self.parent.vr.search_output_device()], command=self.optionmenu_input_speaker_device_callback, font=customtkinter.CTkFont(family=self.parent.FONT_FAMILY), variable=customtkinter.StringVar(value=self.parent.CHOICE_SPEAKER_DEVICE),