# Patch summary (review notes placed ahead of the first "diff --git" header):
#   * AudioRecorder.py: BaseRecorder stores the value returned by
#     listen_in_background() in self.stop; VRCT.py later calls it to end
#     background capture.
#   * AudioTranscriber.py: drops the os/tempfile imports, comments out the
#     "while True" loop so transcribe_audio_queue() handles one queue item per
#     call, and removes a debug print from get_transcript().
#   * VRCT.py: default languages are taken from the languages tables; the
#     "transcription send" checkbox now starts/stops the recorder, the
#     transcriber thread and the chatbox thread.
#   * utils.py: thread_fnc.run() forwards the thread's args/kwargs to fnc.
# NOTE(review): line breaks, indentation and blank context lines were
#   reconstructed so that every hunk matches its "@@" line counts; the nesting
#   depth in the last VRCT.py hunk (time.sleep) is a best guess - verify the
#   whitespace against the repository before applying.
# NOTE(review): the only deliberate content change is "!= None" ->
#   "is not None" in VRCT.py, so that file's "index" blob id is no longer exact.
diff --git a/AudioRecorder.py b/AudioRecorder.py
index 2308d4cb..c402147d 100644
--- a/AudioRecorder.py
+++ b/AudioRecorder.py
@@ -11,6 +11,7 @@ class BaseRecorder:
         self.recorder = sr.Recognizer()
         self.recorder.energy_threshold = ENERGY_THRESHOLD
         self.recorder.dynamic_energy_threshold = DYNAMIC_ENERGY_THRESHOLD
+        self.stop = None
 
         if source is None:
             raise ValueError("audio source can't be None")
@@ -25,7 +26,7 @@ class BaseRecorder:
         def record_callback(_, audio:sr.AudioData) -> None:
             audio_queue.put((audio.get_raw_data(), datetime.now()))
 
-        self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
+        self.stop = self.recorder.listen_in_background(self.source, record_callback, phrase_time_limit=RECORD_TIMEOUT)
 
 class SelectedMicRecorder(BaseRecorder):
     def __init__(self, device):
diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index dfdffd8a..cefb6a25 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -1,6 +1,4 @@
 import io
-import os
-import tempfile
 import threading
 import wave
 import custom_speech_recognition as sr
@@ -28,24 +26,24 @@ class AudioTranscriber:
         }
 
     def transcribe_audio_queue(self, audio_queue):
-        while True:
-            audio, time_spoken = audio_queue.get()
-            self.update_last_sample_and_phrase_status(audio, time_spoken)
+        # while True:
+        audio, time_spoken = audio_queue.get()
+        self.update_last_sample_and_phrase_status(audio, time_spoken)
 
-            text = ''
-            try:
-                # fd, path = tempfile.mkstemp(suffix=".wav")
-                # os.close(fd)
-                audio_data = self.audio_sources["process_data_func"]()
-                text = self.audio_recognizer.recognize_google(audio_data, language=self.language)
-            except Exception as e:
-                pass
-            finally:
-                pass
-                # os.unlink(path)
+        text = ''
+        try:
+            # fd, path = tempfile.mkstemp(suffix=".wav")
+            # os.close(fd)
+            audio_data = self.audio_sources["process_data_func"]()
+            text = self.audio_recognizer.recognize_google(audio_data, language=self.language)
+        except Exception as e:
+            pass
+        finally:
+            pass
+            # os.unlink(path)
 
-            if text != '':
-                self.update_transcript(text)
+        if text != '':
+            self.update_transcript(text)
 
     def update_last_sample_and_phrase_status(self, data, time_spoken):
         source_info = self.audio_sources
@@ -87,7 +85,6 @@ class AudioTranscriber:
             transcript[0] = text
 
     def get_transcript(self):
-        print(self.transcript_data)
         if len(self.transcript_data) > 0:
             text = self.transcript_data.pop(-1)
         else:
diff --git a/VRCT.py b/VRCT.py
index 902f6534..4df4146d 100644
--- a/VRCT.py
+++ b/VRCT.py
@@ -1,5 +1,7 @@
 import os
 import json
+import queue
+import time
 import tkinter as tk
 import customtkinter
 from PIL import Image
@@ -10,6 +12,10 @@ import transcription
 import osc_tools
 import window_config
 import window_information
+import languages
+import audio_utils
+import AudioRecorder
+import AudioTranscriber
 
 class App(customtkinter.CTk):
     def __init__(self, *args, **kwargs):
@@ -33,17 +39,17 @@ class App(customtkinter.CTk):
         self.FONT_FAMILY = "Yu Gothic UI"
         ## Translation
         self.CHOICE_TRANSLATOR = "DeepL(web)"
-        self.INPUT_SOURCE_LANG = "Japanese"
-        self.INPUT_TARGET_LANG = "English"
-        self.OUTPUT_SOURCE_LANG = "English"
-        self.OUTPUT_TARGET_LANG = "Japanese"
+        self.INPUT_SOURCE_LANG = list(languages.deepl_translate_lang.keys())[0]
+        self.INPUT_TARGET_LANG = list(languages.deepl_translate_lang.keys())[1]
+        self.OUTPUT_SOURCE_LANG = list(languages.deepl_translate_lang.keys())[1]
+        self.OUTPUT_TARGET_LANG = list(languages.deepl_translate_lang.keys())[0]
         ## Transcription
         self.CHOICE_MIC_DEVICE = self.vr.search_default_device()[0]
-        self.INPUT_MIC_VOICE_LANGUAGE = "Japanese Japan"
+        self.INPUT_MIC_VOICE_LANGUAGE = list(languages.recognize_lang.keys())[0]
         self.INPUT_MIC_IS_DYNAMIC = False
         self.INPUT_MIC_THRESHOLD = 300
         self.CHOICE_SPEAKER_DEVICE = self.vr.search_default_device()[1]
-        self.INPUT_SPEAKER_VOICE_LANGUAGE = "English United States"
+        self.INPUT_SPEAKER_VOICE_LANGUAGE = list(languages.recognize_lang.keys())[1]
         self.INPUT_SPEAKER_INTERVAL = 4
 
         ## Parameter
@@ -395,25 +401,31 @@ class App(customtkinter.CTk):
     def checkbox_transcription_send_callback(self):
         self.ENABLE_TRANSCRIPTION_SEND = self.checkbox_transcription_send.get()
         if self.ENABLE_TRANSCRIPTION_SEND is True:
+            self.mic_audio_queue = queue.Queue()
+            mic_device = audio_utils.get_default_input_device()
+            self.mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
+            self.mic_audio_recorder.record_into_queue(self.mic_audio_queue)
+            self.mic_transcriber = AudioTranscriber.AudioTranscriber(
+                speaker=False,
+                source=self.mic_audio_recorder.source,
+                language=languages.recognize_lang[self.INPUT_MIC_VOICE_LANGUAGE]
+            )
+            self.mic_transcribe = utils.thread_fnc(self.mic_transcriber.transcribe_audio_queue, args=(self.mic_audio_queue,))
+            self.mic_transcribe.daemon = True
+            self.mic_transcribe.start()
+            self.print_transcript = utils.thread_fnc(self.mic_transcript_to_chatbox)
+            self.print_transcript.start()
+
             utils.print_textbox(self.textbox_message_log, "Start voice2chatbox", "INFO")
             utils.print_textbox(self.textbox_message_system_log, "Start voice2chatbox", "INFO")
-            # start threading
-            self.vr.set_mic(
-                device_name=self.CHOICE_MIC_DEVICE,
-                threshold=int(self.INPUT_MIC_THRESHOLD),
-                is_dynamic=self.INPUT_MIC_IS_DYNAMIC,
-                language=self.INPUT_MIC_VOICE_LANGUAGE,
-            )
-            self.vr.init_mic()
-            self.th_vr_listen_mic = utils.thread_fnc(self.vr_listen_mic)
-            self.th_vr_recognize_mic = utils.thread_fnc(self.vr_recognize_mic)
-            self.th_vr_listen_mic.start()
-            self.th_vr_recognize_mic.start()
         else:
-            if isinstance(self.th_vr_listen_mic, utils.thread_fnc):
-                self.th_vr_listen_mic.stop()
-            if isinstance(self.th_vr_recognize_mic, utils.thread_fnc):
-                self.th_vr_recognize_mic.stop()
+            if isinstance(self.print_transcript, utils.thread_fnc):
+                self.print_transcript.stop()
+            if isinstance(self.mic_transcribe, utils.thread_fnc):
+                self.mic_transcribe.stop()
+            if self.mic_audio_recorder.stop is not None:
+                self.mic_audio_recorder.stop()
+                self.mic_audio_recorder.stop = None
             utils.print_textbox(self.textbox_message_log, "Stop voice2chatbox", "INFO")
             utils.print_textbox(self.textbox_message_system_log, "Stop voice2chatbox", "INFO")
 
@@ -448,8 +460,8 @@ class App(customtkinter.CTk):
         if self.checkbox_transcription_send.get() is True:
             self.vr.listen_mic()
 
-    def vr_recognize_mic(self):
-        message = self.vr.recognize_mic()
+    def mic_transcript_to_chatbox(self):
+        message = self.mic_transcriber.get_transcript()
         if len(message) > 0:
             # translate
             if self.checkbox_translation.get() is False:
@@ -473,6 +485,7 @@ class App(customtkinter.CTk):
             # update textbox message log
             utils.print_textbox(self.textbox_message_log, f"{voice_message}", "SEND")
             utils.print_textbox(self.textbox_message_send_log, f"{voice_message}", "SEND")
+        time.sleep(1)
 
     def vr_listen_spk(self):
         if self.checkbox_transcription_receive.get() is True:
diff --git a/utils.py b/utils.py
index 234bacf7..473487fc 100644
--- a/utils.py
+++ b/utils.py
@@ -38,4 +38,4 @@ class thread_fnc(threading.Thread):
         while True:
             if self.stopped():
                 return
-            self.fnc()
\ No newline at end of file
+            self.fnc(*self._args, **self._kwargs)
\ No newline at end of file