diff --git a/VRCT.py b/VRCT.py index 2fbe8467..a6a36757 100644 --- a/VRCT.py +++ b/VRCT.py @@ -259,8 +259,8 @@ class App(customtkinter.CTk): ## set transcription instance self.vr = transcription.VoiceRecognizer() - self.CHOICE_MIC_DEVICE = self.CHOICE_MIC_DEVICE if self.CHOICE_MIC_DEVICE is not None else self.vr.search_default_device_index()[0]["name"] - self.CHOICE_SPEAKER_DEVICE = self.CHOICE_SPEAKER_DEVICE if self.CHOICE_SPEAKER_DEVICE is not None else self.vr.search_default_device_index()[1]["name"] + self.CHOICE_MIC_DEVICE = self.CHOICE_MIC_DEVICE if self.CHOICE_MIC_DEVICE is not None else self.vr.search_default_device()[0] + self.CHOICE_SPEAKER_DEVICE = self.CHOICE_SPEAKER_DEVICE if self.CHOICE_SPEAKER_DEVICE is not None else self.vr.search_default_device()[1] ## set checkbox enable translation if self.ENABLE_TRANSLATION: @@ -324,33 +324,44 @@ class App(customtkinter.CTk): self.ENABLE_TRANSCRIPTION = self.checkbox_transcription.get() if self.ENABLE_TRANSCRIPTION is True: # start threading - th = threading.Thread(target = self.voice_input) - th.start() + self.vr.set_mic(self.CHOICE_MIC_DEVICE) + self.vr.init_mic(threshold=self.MIC_THRESHOLD, is_dynamic=self.ENABLE_MIC_IS_DYNAMIC) + th_vr_listen_mic = threading.Thread(target = self.vr_listen_mic) + th_vr_recognize_mic = threading.Thread(target = self.vr_recognize_mic) + th_vr_listen_mic.start() + th_vr_recognize_mic.start() + + self.vr.set_spk(self.CHOICE_SPEAKER_DEVICE) + self.vr.init_spk() + th_vr_listen_spk = threading.Thread(target = self.vr_listen_spk) + th_vr_recognize_spk = threading.Thread(target = self.vr_recognize_spk) + th_vr_listen_spk.start() + th_vr_recognize_spk.start() + utils.save_json(self.PATH_CONFIG, "ENABLE_TRANSCRIPTION", self.ENABLE_TRANSCRIPTION) - def voice_input(self): - self.vr.set_mic(self.CHOICE_MIC_DEVICE) - self.vr.init_mic(threshold=self.MIC_THRESHOLD, is_dynamic=self.ENABLE_MIC_IS_DYNAMIC) + def vr_listen_mic(self): + while self.checkbox_transcription.get() is True: + self.vr.listen_mic() - # start voice_input - if self.checkbox_transcription.get() is True: - self.textbox_message_log.configure(state='normal') - self.textbox_message_log.insert("end", f"[INFO] start transcription\n") - self.textbox_message_log.configure(state='disabled') - self.textbox_message_log.see("end") + def vr_recognize_mic(self): + self.textbox_message_log.configure(state='normal') + self.textbox_message_log.insert("end", f"[INFO] start transcription\n") + self.textbox_message_log.configure(state='disabled') + self.textbox_message_log.see("end") while self.checkbox_transcription.get() is True: - message = self.vr.listen_voice(language=self.INPUT_MIC_VOICE_LANGUAGE) + message = self.vr.recognize_mic(language=self.INPUT_MIC_VOICE_LANGUAGE) if len(message) > 0: # translate if self.checkbox_translation.get() is False: - chat_message = f"{message}" - elif (self.translator.translator_status[self.CHOICE_TRANSLATOR] is False) or (self.INPUT_SOURCE_LANG == "None") or (self.INPUT_TARGET_LANG == "None"): + voice_message = f"{message}" + elif self.translator.translator_status[self.CHOICE_TRANSLATOR] is False: self.textbox_message_log.configure(state='normal') self.textbox_message_log.insert("end", f"[ERROR] Auth Keyもしくは言語の設定が間違っています\n") self.textbox_message_log.configure(state='disabled') self.textbox_message_log.see("end") - chat_message = f"{message}" + voice_message = f"{message}" else: result = self.translator.translate( translator_name=self.CHOICE_TRANSLATOR, @@ -358,20 +369,51 @@ class App(customtkinter.CTk): target_language=self.INPUT_TARGET_LANG, message=message ) - chat_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result) + voice_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result) # send OSC message - osc_tools.send_message(chat_message, self.OSC_IP_ADDRESS, self.OSC_PORT) + osc_tools.send_message(voice_message, self.OSC_IP_ADDRESS, self.OSC_PORT) # update textbox message log self.textbox_message_log.configure(state='normal') - self.textbox_message_log.insert("end", f"[VOICE] {chat_message}\n") + self.textbox_message_log.insert("end", f"[VOICE] {voice_message}\n") + self.textbox_message_log.configure(state='disabled') + self.textbox_message_log.see("end") + + def vr_listen_spk(self): + while self.checkbox_transcription.get() is True: + self.vr.listen_spk() + + def vr_recognize_spk(self): + while self.checkbox_transcription.get() is True: + message = self.vr.recognize_spk(language=self.INPUT_SPEAKER_VOICE_LANGUAGE) + if len(message) > 0: + # translate + if self.checkbox_translation.get() is False: + voice_message = f"{message}" + elif self.translator.translator_status[self.CHOICE_TRANSLATOR] is False: + self.textbox_message_log.configure(state='normal') + self.textbox_message_log.insert("end", f"[ERROR] Auth Keyもしくは言語の設定が間違っています\n") + self.textbox_message_log.configure(state='disabled') + self.textbox_message_log.see("end") + voice_message = f"{message}" + else: + result = self.translator.translate( + translator_name=self.CHOICE_TRANSLATOR, + source_language=self.INPUT_SOURCE_LANG, + target_language=self.INPUT_TARGET_LANG, + message=message + ) + voice_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result) + + # send OSC message + osc_tools.send_message(voice_message, self.OSC_IP_ADDRESS, self.OSC_PORT) + + # update textbox message log + self.textbox_message_log.configure(state='normal') + self.textbox_message_log.insert("end", f"[VOICE] {voice_message}\n") self.textbox_message_log.configure(state='disabled') self.textbox_message_log.see("end") - self.textbox_message_log.configure(state='normal') - self.textbox_message_log.insert("end", f"[INFO] stop transcription\n") - self.textbox_message_log.configure(state='disabled') - self.textbox_message_log.see("end") def checkbox_foreground_callback(self): self.ENABLE_FOREGROUND = self.checkbox_foreground.get() diff --git a/transcription.py b/transcription.py index ceddfb5c..ba358d08 100644 --- a/transcription.py +++ b/transcription.py @@ -1,3 +1,8 @@ +import io +import queue +import numpy as np +import soundcard as sc +import soundfile as sf import sounddevice as sd import speech_recognition as sr @@ -5,7 +10,6 @@ import speech_recognition as sr class VoiceRecognizer(): def __init__(self): self.r = sr.Recognizer() - self.mic = None self.languages = [ "ja-JP","en-US","en-GB","af-ZA","ar-DZ","ar-BH","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA", "ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-AE","eu-ES","bg-BG","ca-ES","cmn-Hans-CN","cmn-Hans-HK", @@ -16,6 +20,18 @@ class VoiceRecognizer(): "es-NI","es-PA","es-PY","es-PE","es-PR","es-ES","es-UY","es-US","es-VE","sv-SE","th-TH","tr-TR","uk-UA", "vi-VN","zu-ZA" ] + self.mic = None + self.enable_mic_recognize = False + self.queue_mic = queue.Queue() + + self.spk_device_name = None + self.spk_sample_rate = 16000 + self.spk_interval = 3 + self.spk_buffer_size = 4096 + self.spk_language = "en-US" + self.spk_audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) + self.n = 0 + self.queue_spk = queue.Queue() def search_input_device(self): device_list = sd.query_devices() @@ -28,21 +44,20 @@ class VoiceRecognizer(): return input_device_list def search_output_device(self): - device_list = sd.query_devices() + device_list = sc.all_speakers() output_device_list = [] for device in device_list: - if device["max_output_channels"] > 0: - output_device_list.append({"name": device["name"], "index": device["index"]}) + output_device_list.append(str(device.name)) return output_device_list - def search_default_device_index(self): + def search_default_device(self): device_list = sd.query_devices() - default_device_list = [] - for i in sd.default.device: - default_device_list.append({"name": device_list[i]["name"], "index": device_list[i]["index"]}) - return default_device_list + mic_index = sd.default.device[0] + name_mic = device_list[mic_index]["name"] + name_spk = str(sc.default_speaker().name) + return name_mic, name_spk def set_mic(self, device_name, threshold=50, is_dynamic=False): input_device_list = self.search_input_device() @@ -68,14 +83,86 @@ class VoiceRecognizer(): else: return False - def listen_voice(self, language): + def listen_mic(self): if self.mic != None: with self.mic as source: audio = self.r.listen(source) - try: - text = self.r.recognize_google(audio, language=language) - return text - except: - return "" - else: - return False \ No newline at end of file + self.queue_mic.put(audio) + + def recognize_mic(self, language): + try: + audio = self.queue_mic.get() + text = self.r.recognize_google(audio, language=language) + except: + text = "" + return text + + def set_spk(self, device_name=str(sc.default_speaker().name), sample_rate=16000, interval=3, buffer_size=4096, language="en-US"): + self.spk_device_name = device_name + self.spk_sample_rate = sample_rate + self.spk_interval = interval + self.spk_buffer_size = buffer_size + self.spk_language = language + + def init_spk(self): + self.spk_audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) + self.n = 0 + + def listen_spk(self): + audio = self.spk_audio + n = self.n + with sc.get_microphone(id=self.spk_device_name, include_loopback=True).recorder(samplerate=self.spk_sample_rate, channels=1) as source: + while n < self.spk_sample_rate * self.spk_interval: + data = source.record(self.spk_buffer_size) + audio[n:n+len(data)] = data.reshape(-1) + n += len(data) + m = n * 4 // 5 + vol = np.convolve(audio[m:n] ** 2, np.ones(100) / 100, 'same') + m += vol.argmin() + audio_prev = audio.copy() + self.queue_spk.put(audio[:m]) + audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32) + audio[:n-m] = audio_prev[m:n] + n = n-m + self.spk_audio = audio + self.n = n + + def recognize_spk(self): + try: + audio = self.queue_spk.get() + with io.BytesIO() as memory_file: + sf.write(file=memory_file, data=audio, format="WAV", samplerate=self.spk_sample_rate) + memory_file.seek(0) + with sr.AudioFile(memory_file) as source: + audio = self.r.record(source) + text = self.r.recognize_google(audio, language=self.spk_language) + except Exception as e: + text = "" + return text + +if __name__ == "__main__": + import time + import threading + + vr = VoiceRecognizer() + mic_name, spk_name = vr.search_default_device() + vr.spk_enable_recognize = True + vr.set_spk(language="ja-JP") + vr.init_spk() + + def vr_listen_spk(): + while True: + vr.listen_spk() + + def vr_recognize_spk(): + while True: + text = vr.recognize_spk() + print(text) + + th_vr_listen_spk = threading.Thread(target=vr_listen_spk) + th_vr_recognize_spk = threading.Thread(target=vr_recognize_spk) + th_vr_listen_spk.start() + th_vr_recognize_spk.start() + + while True: + time.sleep(60) \ No newline at end of file diff --git a/window_config.py b/window_config.py index a5ef6daf..6930db1a 100644 --- a/window_config.py +++ b/window_config.py @@ -277,7 +277,7 @@ class ToplevelWindowConfig(customtkinter.CTkToplevel): self.label_input_speaker_device.grid(row=4, column=0, columnspan=1, padx=5, pady=5, sticky="nsw") self.optionmenu_input_speaker_device = customtkinter.CTkOptionMenu( self.tabview_config.tab("Transcription"), - values=[device["name"] for device in self.parent.vr.search_output_device()], + values=self.parent.vr.search_output_device(), # command=self.optionmenu_input_speaker_device_callback, font=customtkinter.CTkFont(family=self.parent.FONT_FAMILY), variable=customtkinter.StringVar(value=self.parent.CHOICE_SPEAKER_DEVICE),