diff --git a/config.py b/config.py index c59c0f17..6ce32035 100644 --- a/config.py +++ b/config.py @@ -210,6 +210,15 @@ class Config: if isinstance(value, bool): self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = value + @property + def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self): + return self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER + + @IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER.setter + def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self, value): + if isinstance(value, bool): + self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = value + # Save Json Data ## Main Window @property @@ -268,14 +277,14 @@ class Config: saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property - @json_serializable('SELECTED_RECOGNIZER') - def SELECTED_RECOGNIZER(self): - return self._SELECTED_RECOGNIZER + @json_serializable('SELECTED_TRANSCRIPTION_ENGINE') + def SELECTED_TRANSCRIPTION_ENGINE(self): + return self._SELECTED_TRANSCRIPTION_ENGINE - @SELECTED_RECOGNIZER.setter - def SELECTED_RECOGNIZER(self, value): + @SELECTED_TRANSCRIPTION_ENGINE.setter + def SELECTED_TRANSCRIPTION_ENGINE(self, value): if isinstance(value, str): - self._SELECTED_RECOGNIZER = value + self._SELECTED_TRANSCRIPTION_ENGINE = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property @@ -820,6 +829,7 @@ class Config: self._TARGET_LANGUAGE = "English" self._TARGET_COUNTRY = "United States" self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False + self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False # Save Json Data ## Main Window @@ -844,7 +854,7 @@ class Config: "2":"English\n(United States)", "3":"English\n(United States)", } - self._SELECTED_RECOGNIZER = "Google" + self._SELECTED_TRANSCRIPTION_ENGINE = "Google" self._IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE = False ## Config Window diff --git a/controller.py b/controller.py index 724d2cf3..e63101b2 100644 --- a/controller.py +++ b/controller.py @@ -773,29 +773,27 @@ def callbackSetUserWhisperFeature(value): config.USE_WHISPER_FEATURE = value if config.USE_WHISPER_FEATURE is True: view.openWhisperWeightTypeWidget() + if model.checkTranscriptionWhisperModelWeight() is True: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False + config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper" + else: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True + config.SELECTED_TRANSCRIPTION_ENGINE = "Google" else: view.closeWhisperWeightTypeWidget() + view.showRestartButtonIfRequired() def callbackSetWhisperWeightType(value): print("callbackSetWhisperWeightType", value) config.WHISPER_WEIGHT_TYPE = str(value) view.updateSelectedWhisperWeightType(config.WHISPER_WEIGHT_TYPE) - # view.setWidgetsStatus_changeWeightType_Pending() - # if model.checkCTranslatorCTranslate2ModelWeight(): - # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False - # def callback(): - # model.changeTranslatorCTranslate2Model() - # view.useTranslationFeatureProcess("Normal") - # view.setWidgetsStatus_changeWeightType_Done() - # th_callback = Thread(target=callback) - # th_callback.daemon = True - # th_callback.start() - # else: - # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = True - # view.useTranslationFeatureProcess("Restart") - # view.setWidgetsStatus_changeWeightType_Done() - # view.showRestartButtonIfRequired() - + if model.checkTranscriptionWhisperModelWeight() is True: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False + config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper" + else: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True + config.SELECTED_TRANSCRIPTION_ENGINE = "Google" + view.showRestartButtonIfRequired() # Others Tab def callbackSetEnableAutoClearMessageBox(value): diff --git a/model.py b/model.py index 2c29d4c7..228bc253 100644 --- a/model.py +++ b/model.py @@ -24,6 +24,7 @@ from models.xsoverlay.notification import xsoverlayForVRCT from models.translation.translation_languages import translation_lang from models.transcription.transcription_languages import transcription_lang from models.translation.translation_utils import checkCTranslate2Weight +from models.transcription.transcription_whisper import checkWhisperWeight from config import config class threadFnc(Thread): @@ -74,6 +75,9 @@ class Model: def changeTranslatorCTranslate2Model(self): self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE) + def checkTranscriptionWhisperModelWeight(self): + return checkWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE) + def resetKeywordProcessor(self): del self.keyword_processor self.keyword_processor = KeywordProcessor() @@ -335,12 +339,12 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, - whisper_enabled=config.USE_WHISPER_FEATURE, + transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) def sendMicTranscript(): - mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) message = mic_transcriber.getTranscript() try: fnc(message) @@ -419,12 +423,12 @@ class Model: source=self.speaker_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, - whisper_enabled=config.USE_WHISPER_FEATURE, + transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) def sendSpeakerTranscript(): - speaker_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) message = speaker_transcriber.getTranscript() try: fnc(message) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index 0f5b1790..b24d3163 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, root): + def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -30,38 +30,34 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } - if whisper_enabled is True: - self.whisper_model = getWhisperModel(root, whisper_weight_type) - else: - self.whisper_model = None + self.transcription_engine = transcription_engine + match self.transcription_engine: + case "Google": + self.audio_recognizer = Recognizer() + case "Whisper": + self.audio_recognizer = getWhisperModel(root, whisper_weight_type) - def transcribeAudioQueue(self, recognizer, audio_queue, language, country): - # while True: + def transcribeAudioQueue(self, audio_queue, language, country): audio, time_spoken = audio_queue.get() self.updateLastSampleAndPhraseStatus(audio, time_spoken) text = '' try: - # Whisperが使用できない場合はGoogle Speech-to-Textを使用する - if recognizer == "Whisper": - if self.whisper_model is None: - recognizer = "Google" - audio_data = self.audio_sources["process_data_func"]() - match recognizer: + match self.transcription_engine: case "Google": - text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer]) + text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine]) case "Whisper": audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 if isinstance(audio_data, torch.Tensor): audio_data = audio_data.detach().numpy() - segments, _ = self.whisper_model.transcribe( + segments, _ = self.audio_recognizer.transcribe( audio_data, beam_size=5, temperature=0.0, log_prob_threshold=-0.8, no_speech_threshold=0.6, - language=transcription_lang[language][country][recognizer], + language=transcription_lang[language][country][self.transcription_engine], word_timestamps=False, without_timestamps=True, task="transcribe", diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index e30fee2d..c6412d35 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -42,7 +42,8 @@ def downloadFile(url, path, func=None): except Exception as e: print("error:downloadFile()", e) -def checkWhisperWeight(path): +def checkWhisperWeight(root, weight_type): + path = os_path.join(root, "weights", "whisper", weight_type) result = False try: WhisperModel( @@ -62,7 +63,7 @@ def checkWhisperWeight(path): def downloadWhisperWeight(root, weight_type, callbackFunc): path = os_path.join(root, "weights", "whisper", weight_type) os_makedirs(path, exist_ok=True) - if checkWhisperWeight(path) is True: + if checkWhisperWeight(root, weight_type) is True: return for filename in _FILENAMES: diff --git a/view.py b/view.py index 6f7a6d7e..84ebd550 100644 --- a/view.py +++ b/view.py @@ -29,6 +29,7 @@ class View(): font_family=config.FONT_FAMILY, ui_language=config.UI_LANGUAGE, is_reset_button_displayed_for_translation=config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION, + is_reset_button_displayed_for_whisper=config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER, ) if config.ENABLE_SPEAKER2CHATBOX is False: @@ -1049,7 +1050,8 @@ class View(): self.restart_required_configs_pre_data.ui_scaling == config.UI_SCALING and self.restart_required_configs_pre_data.font_family == config.FONT_FAMILY and self.restart_required_configs_pre_data.ui_language == config.UI_LANGUAGE and - self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION + self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION and + self.restart_required_configs_pre_data.is_reset_button_displayed_for_whisper == config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER ) if locale is None: