From ba12e39bbc4187bad8831fa67d8e04350c3cd874 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Tue, 30 Jan 2024 02:15:05 +0900 Subject: [PATCH 01/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20faster-whisper?= =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../transcription_transcriber.py | 28 +++++++++++++++++++ requirements.txt | 3 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index bf78566e..fbea0e74 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -6,6 +6,10 @@ from datetime import timedelta from pyaudiowpatch import get_sample_size, paInt16 from .transcription_languages import transcription_lang +import torch +import numpy as np +from faster_whisper import WhisperModel + PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 @@ -26,6 +30,7 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } + self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1) def transcribeAudioQueue(self, audio_queue, language, country): # while True: @@ -38,6 +43,29 @@ class AudioTranscriber: # os.close(fd) audio_data = self.audio_sources["process_data_func"]() text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country]) + + audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 + if isinstance(audio_data, torch.Tensor): + audio_data = audio_data.detach().numpy() + segments, _ = self.whisper_model.transcribe( + audio_data, + beam_size=5, + temperature=0.0, + log_prob_threshold=-0.8, + no_speech_threshold=0.6, + language="ja", + word_timestamps=False, + without_timestamps=True, + task="transcribe", + vad_filter=False, + ) + _text = "" + for s in segments: + if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6: + continue + _text += s.text + print(_text) + except Exception: pass finally: diff --git a/requirements.txt b/requirements.txt index b6e14d85..68a6ce15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ CTkToolTip == 0.8 pyinstaller==6.2.0 transformers[torch] sentencepiece==0.1.99 -ctranslate2==3.21.0 \ No newline at end of file +ctranslate2==3.21.0 +faster-whisper==0.10.0 \ No newline at end of file From 9cd1831ecbb4f313347c90d6971eb3c7a075812b Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Tue, 30 Jan 2024 18:21:55 +0900 Subject: [PATCH 02/11] =?UTF-8?q?[WIP/TEST]=20faster-whisper=E3=81=8C?= =?UTF-8?q?=E6=9C=80=E4=BD=8E=E9=99=90=E5=8B=95=E3=81=8F=E5=BD=A2=E3=81=A7?= =?UTF-8?q?=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit config.jsonで設定変更で実行可能 --- config.py | 71 ++- controller.py | 4 +- main.py | 2 +- model.py | 14 +- .../transcription/transcription_languages.py | 443 ++++++++++++++---- .../transcription_transcriber.py | 70 +-- models/transcription/transcription_utils.py | 40 +- view.py | 4 +- 8 files changed, 511 insertions(+), 137 deletions(-) diff --git a/config.py b/config.py index 371ec121..6acf5e3f 100644 --- a/config.py +++ b/config.py @@ -98,6 +98,10 @@ class Config: def SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT(self): return self._SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT + @property + def SELECTABLE_WHISPER_WEIGHT_TYPE_DICT(self): + return self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT + @property def MAX_MIC_ENERGY_THRESHOLD(self): return self._MAX_MIC_ENERGY_THRESHOLD @@ -263,6 +267,17 @@ class Config: self._SELECTED_TAB_TARGET_LANGUAGES = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property + @json_serializable('SELECTED_RECOGNIZER') + def SELECTED_RECOGNIZER(self): + return self._SELECTED_RECOGNIZER + + @SELECTED_RECOGNIZER.setter + def SELECTED_RECOGNIZER(self, value): + if isinstance(value, str): + self._SELECTED_RECOGNIZER = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property @json_serializable('IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE') def IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE(self): @@ -569,15 +584,37 @@ class Config: saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property - @json_serializable('WEIGHT_TYPE') - def WEIGHT_TYPE(self): - return self._WEIGHT_TYPE + @json_serializable('USE_RECOGNIZER_FEATURE') + def USE_RECOGNIZER_FEATURE(self): + return self._USE_RECOGNIZER_FEATURE - @WEIGHT_TYPE.setter - def WEIGHT_TYPE(self, value): + @USE_RECOGNIZER_FEATURE.setter + def USE_RECOGNIZER_FEATURE(self, value): + if isinstance(value, bool): + self._USE_RECOGNIZER_FEATURE = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + + @property + @json_serializable('CTRANSLATE2_WEIGHT_TYPE') + def CTRANSLATE2_WEIGHT_TYPE(self): + return self._CTRANSLATE2_WEIGHT_TYPE + + @CTRANSLATE2_WEIGHT_TYPE.setter + def CTRANSLATE2_WEIGHT_TYPE(self, value): # if isinstance(value, str) and value in self.SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT: if isinstance(value, str): - self._WEIGHT_TYPE = value + self._CTRANSLATE2_WEIGHT_TYPE = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + + @property + @json_serializable('WHISPER_WEIGHT_TYPE') + def WHISPER_WEIGHT_TYPE(self): + return self._WHISPER_WEIGHT_TYPE + + @WHISPER_WEIGHT_TYPE.setter + def WHISPER_WEIGHT_TYPE(self, value): + if isinstance(value, str): + self._WHISPER_WEIGHT_TYPE = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property @@ -756,6 +793,23 @@ class Config: "Small": "Small", "Large": "Large", } + + self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT = { + # {Save json str}: {i18n_placeholder} pairs + "tiny": "tiny", + "tiny.en": "tiny.en", + "base": "base", + "base.en": "base.en", + "small": "small", + "small.en": "small.en", + "medium": "medium", + "medium.en": "medium.en", + "large-v1": "large-v1", + "large-v2": "large-v2", + "large-v3": "large-v3", + "large": "large", + } + self._MAX_MIC_ENERGY_THRESHOLD = 2000 self._MAX_SPEAKER_ENERGY_THRESHOLD = 4000 @@ -795,6 +849,7 @@ class Config: "2":"English\n(United States)", "3":"English\n(United States)", } + self._SELECTED_RECOGNIZER = "Google" self._IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE = False ## Config Window @@ -831,7 +886,9 @@ class Config: "DeepL_API": None, } self._USE_TRANSLATION_FEATURE = True - self._WEIGHT_TYPE = "Small" + self._CTRANSLATE2_WEIGHT_TYPE = "Small" + self._USE_RECOGNIZER_FEATURE = True + self._WHISPER_WEIGHT_TYPE = "base" self._SEND_MESSAGE_FORMAT = "[message]" self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])" self._RECEIVED_MESSAGE_FORMAT = "[message]" diff --git a/controller.py b/controller.py index f9e9a5b3..9d44b491 100644 --- a/controller.py +++ b/controller.py @@ -505,8 +505,8 @@ def callbackSetUseTranslationFeature(value): def callbackSetCtranslate2WeightType(value): print("callbackSetCtranslate2WeightType", value) - config.WEIGHT_TYPE = str(value) - view.updateSelectedCtranslate2WeightType(config.WEIGHT_TYPE) + config.CTRANSLATE2_WEIGHT_TYPE = str(value) + view.updateSelectedCtranslate2WeightType(config.CTRANSLATE2_WEIGHT_TYPE) view.setWidgetsStatus_changeWeightType_Pending() if model.checkCTranslatorCTranslate2ModelWeight(): config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False diff --git a/main.py b/main.py index 4810cbe5..cf80e289 100644 --- a/main.py +++ b/main.py @@ -10,7 +10,7 @@ if __name__ == "__main__": from config import config from models.translation.utils import downloadCTranslate2Weight if config.USE_TRANSLATION_FEATURE is True: - downloadCTranslate2Weight(config.PATH_LOCAL, config.WEIGHT_TYPE, splash.updateDownloadProgress) + downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress) splash.toProgress(0) import controller diff --git a/model.py b/model.py index 573659a7..61ff24d7 100644 --- a/model.py +++ b/model.py @@ -65,14 +65,14 @@ class Model: self.speaker_energy_plot_progressbar = None self.translator = Translator() if config.USE_TRANSLATION_FEATURE is True: - self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.WEIGHT_TYPE) + self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE) self.keyword_processor = KeywordProcessor() def checkCTranslatorCTranslate2ModelWeight(self): - return checkCTranslate2Weight(config.PATH_LOCAL, config.WEIGHT_TYPE) + return checkCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE) def changeTranslatorCTranslate2Model(self): - self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.WEIGHT_TYPE) + self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE) def resetKeywordProcessor(self): del self.keyword_processor @@ -335,9 +335,12 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, + whisper_enabled=config.USE_RECOGNIZER_FEATURE, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, + whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"), ) def sendMicTranscript(): - mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) message = mic_transcriber.getTranscript() try: fnc(message) @@ -416,6 +419,9 @@ class Model: source=self.speaker_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, + whisper_enabled=config.USE_RECOGNIZER_FEATURE, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, + whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"), ) def sendSpeakerTranscript(): speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) diff --git a/models/transcription/transcription_languages.py b/models/transcription/transcription_languages.py index 26f2c3f6..63d92568 100644 --- a/models/transcription/transcription_languages.py +++ b/models/transcription/transcription_languages.py @@ -1,177 +1,438 @@ transcription_lang = { "Afrikaans":{ - "South Africa":"af-ZA", + "South Africa":{ + "Google": "af-ZA", + "Whisper": "af", + }, }, "Arabic":{ - "Algeria":"ar-DZ", - "Bahrain":"ar-BH", - "Egypt":"ar-EG", - "Israel":"ar-IL", - "Iraq":"ar-IQ", - "Jordan":"ar-JO", - "Kuwait":"ar-KW", - "Lebanon":"ar-LB", - "Morocco":"ar-MA", - "Oman":"ar-OM", - "State of Palestine":"ar-PS", - "Qatar":"ar-QA", - "Saudi Arabia":"ar-SA", - "Tunisia":"ar-TN", - "United Arab Emirates":"ar-AE", + "Algeria":{ + "Google": "ar-DZ", + "Whisper": "ar", + }, + "Bahrain":{ + "Google": "ar-BH", + "Whisper": "ar", + }, + "Egypt":{ + "Google": "ar-EG", + "Whisper": "ar", + }, + "Israel":{ + "Google": "ar-IL", + "Whisper": "ar", + }, + "Iraq":{ + "Google": "ar-IQ", + "Whisper": "ar", + }, + "Jordan":{ + "Google": "ar-JO", + "Whisper": "ar", + }, + "Kuwait":{ + "Google": "ar-KW", + "Whisper": "ar", + }, + "Lebanon":{ + "Google": "ar-LB", + "Whisper": "ar", + }, + "Morocco":{ + "Google": "ar-MA", + "Whisper": "ar", + }, + "Oman":{ + "Google": "ar-OM", + "Whisper": "ar", + }, + "State of Palestine":{ + "Google": "ar-PS", + "Whisper": "ar", + }, + "Qatar":{ + "Google": "ar-QA", + "Whisper": "ar", + }, + "Saudi Arabia":{ + "Google": "ar-SA", + "Whisper": "ar", + }, + "Tunisia":{ + "Google": "ar-TN", + "Whisper": "ar", + }, + "United Arab Emirates":{ + "Google": "ar-AE", + "Whisper": "ar", + }, }, "Basque":{ - "Spain":"eu-ES", + "Spain":{ + "Google": "eu-ES", + "Whisper": "eu", + }, }, "Bulgarian":{ - "Bulgaria":"bg-BG", + "Bulgaria":{ + "Google": "bg-BG", + "Whisper": "bg", + }, }, "Catalan":{ - "Spain":"ca-ES", + "Spain":{ + "Google": "ca-ES", + "Whisper": "ca", + }, }, "Chinese":{ - "Mandarin (Simplified, China)":"cmn-Hans-CN", - "Mandarin (Simplified, Hong Kong)":"cmn-Hans-HK", - "Mandarin (Traditional, Taiwan)":"cmn-Hant-TW", - "Cantonese (Traditional Hong Kong)":"yue-Hant-HK", + "Mandarin (Simplified, China)":{ + "Google": "cmn-Hans-CN", + "Whisper": "zh", + }, + "Mandarin (Simplified, Hong Kong)":{ + "Google": "cmn-Hans-HK", + "Whisper": "zh", + }, + "Mandarin (Traditional, Taiwan)":{ + "Google": "cmn-Hant-TW", + "Whisper": "zh", + }, + "Cantonese (Traditional Hong Kong)":{ + "Google": "yue-Hant-HK", + "Whisper": "yue", + }, }, "Croatian":{ - "Croatia":"hr-HR", + "Croatia":{ + "Google": "hr-HR", + "Whisper": "hr", + }, }, "Czech":{ - "Czech Republic":"cs-CZ", + "Czech Republic":{ + "Google": "cs-CZ", + "Whisper": "cs", + }, }, "Danish":{ - "Denmark":"da-DK", + "Denmark":{ + "Google": "da-DK", + "Whisper": "da", + }, }, "Dutch":{ - "Netherlands":"nl-NL", + "Netherlands":{ + "Google": "nl-NL", + "Whisper": "nl", + }, }, "English": { - "United States":"en-US", - "United Kingdom":"en-GB", - "Australia":"en-AU", - "Canada":"en-CA", - "India":"en-IN", - "Ireland":"en-IE", - "New Zealand":"en-NZ", - "Philippines":"en-PH", - "South Africa":"en-ZA", + "United States":{ + "Google": "en-US", + "Whisper": "en", + }, + "United Kingdom":{ + "Google": "en-GB", + "Whisper": "en", + }, + "Australia":{ + "Google": "en-AU", + "Whisper": "en", + }, + "Canada":{ + "Google": "en-CA", + "Whisper": "en", + }, + "India":{ + "Google": "en-IN", + "Whisper": "en", + }, + "Ireland":{ + "Google": "en-IE", + "Whisper": "en", + }, + "New Zealand":{ + "Google": "en-NZ", + "Whisper": "en", + }, + "Philippines":{ + "Google": "en-PH", + "Whisper": "en", + }, + "South Africa":{ + "Google": "en-ZA", + "Whisper": "en", + }, }, "Filipino":{ - "Philippines":"fil-PH", + "Philippines":{ + "Google": "fil-PH", + "Whisper": "tl", + }, }, "Finnish":{ - "Finland":"fi-FI", + "Finland":{ + "Google": "fi-FI", + "Whisper": "fi", + }, }, "French":{ - "France":"fr-FR", + "France":{ + "Google": "fr-FR", + "Whisper": "fr", + }, }, "Galician":{ - "Spain":"gl-ES", + "Spain":{ + "Google": "gl-ES", + "Whisper": "gl", + }, }, "German":{ - "Germany":"de-DE", + "Germany":{ + "Google": "de-DE", + "Whisper": "de", + }, }, "Greek":{ - "Greece":"el-GR", + "Greece":{ + "Google": "el-GR", + "Whisper": "el", + }, }, "Hebrew":{ - "Israel":"he-IL", + "Israel":{ + "Google": "he-IL", + "Whisper": "he", + }, }, "Hindi": { - "India":"hi-IN", + "India":{ + "Google": "hi-IN", + "Whisper": "hi", + }, }, "Hungarian":{ - "Hungary":"hu-HU", + "Hungary":{ + "Google": "hu-HU", + "Whisper": "hu", + }, }, "Indonesian":{ - "Indonesia":"id-ID", + "Indonesia":{ + "Google": "id-ID", + "Whisper": "id", + }, }, "Icelandic":{ - "Iceland":"is-IS", + "Iceland":{ + "Google": "is-IS", + "Whisper": "is", + }, }, "Italian":{ - "Italy":"it-IT", - "Switzerland":"it-CH", + "Italy":{ + "Google": "it-IT", + "Whisper": "it", + }, + "Switzerland":{ + "Google": "it-CH", + "Whisper": "it", + }, }, "Japanese":{ - "Japan":"ja-JP", + "Japan":{ + "Google": "ja-JP", + "Whisper": "ja", + }, }, "Korean":{ - "South Korea":"ko-KR", + "South Korea":{ + "Google": "ko-KR", + "Whisper": "ko", + }, }, "Lithuanian":{ - "Lithuania":"lt-LT", + "Lithuania":{ + "Google": "lt-LT", + "Whisper": "lt", + }, }, "Malay":{ - "Malaysia":"ms-MY", + "Malaysia":{ + "Google": "ms-MY", + "Whisper": "ms", + }, }, "Norwegian":{ - "Norway":"nb-NO", + "Norway":{ + "Google": "nb-NO", + "Whisper": "no", + }, }, "Persian":{ - "Iran":"fa-IR", + "Iran":{ + "Google": "fa-IR", + "Whisper": "fa", + }, }, "Polish":{ - "Poland":"pl-PL", + "Poland":{ + "Google": "pl-PL", + "Whisper": "pl", + }, }, "Portuguese":{ - "Brazil":"pt-BR", - "Portugal":"pt-PT", + "Brazil":{ + "Google": "pt-BR", + "Whisper": "pt", + }, + "Portugal":{ + "Google": "pt-PT", + "Whisper": "pt", + }, }, "Romanian":{ - "Romania":"ro-RO", + "Romania":{ + "Google": "ro-RO", + "Whisper": "ro", + }, }, "Russian":{ - "Russia":"ru-RU", + "Russia":{ + "Google": "ru-RU", + "Whisper": "ru", + }, }, "Serbian":{ - "Serbia":"sr-RS", + "Serbia":{ + "Google": "sr-RS", + "Whisper": "sr", + }, }, "Slovak":{ - "Slovakia":"sk-SK", + "Slovakia":{ + "Google": "sk-SK", + "Whisper": "sk", + }, }, "Slovenian":{ - "Slovenia":"sl-SI", + "Slovenia":{ + "Google": "sl-SI", + "Whisper": "sl", + }, }, "Spanish":{ - "Argentina":"es-AR", - "Bolivia":"es-BO", - "Chile":"es-CL", - "Colombia":"es-CO", - "Costa Rica":"es-CR", - "Dominican Republic":"es-DO", - "Ecuador":"es-EC", - "El Salvador":"es-SV", - "Guatemala":"es-GT", - "Honduras":"es-HN", - "Mexico":"es-MX", - "Nicaragua":"es-NI", - "Panama":"es-PA", - "Paraguay":"es-PY", - "Peru":"es-PE", - "Puerto Rico":"es-PR", - "Spain":"es-ES", - "Uruguay":"es-UY", - "United States":"es-US", - "Venezuela":"es-VE", + "Argentina":{ + "Google": "es-AR", + "Whisper": "es", + }, + "Bolivia":{ + "Google": "es-BO", + "Whisper": "es", + }, + "Chile":{ + "Google": "es-CL", + "Whisper": "es", + }, + "Colombia":{ + "Google": "es-CO", + "Whisper": "es", + }, + "Costa Rica":{ + "Google": "es-CR", + "Whisper": "es", + }, + "Dominican Republic":{ + "Google": "es-DO", + "Whisper": "es", + }, + "Ecuador":{ + "Google": "es-EC", + "Whisper": "es", + }, + "El Salvador":{ + "Google": "es-SV", + "Whisper": "es", + }, + "Guatemala":{ + "Google": "es-GT", + "Whisper": "es", + }, + "Honduras":{ + "Google": "es-HN", + "Whisper": "es", + }, + "Mexico":{ + "Google": "es-MX", + "Whisper": "es", + }, + "Nicaragua":{ + "Google": "es-NI", + "Whisper": "es", + }, + "Panama":{ + "Google": "es-PA", + "Whisper": "es", + }, + "Paraguay":{ + "Google": "es-PY", + "Whisper": "es", + }, + "Peru":{ + "Google": "es-PE", + "Whisper": "es", + }, + "Puerto Rico":{ + "Google": "es-PR", + "Whisper": "es", + }, + "Spain":{ + "Google": "es-ES", + "Whisper": "es", + }, + "Uruguay":{ + "Google": "es-UY", + "Whisper": "es", + }, + "United States":{ + "Google": "es-US", + "Whisper": "es", + }, + "Venezuela":{ + "Google": "es-VE", + "Whisper": "es", + }, }, "Swedish":{ - "Sweden":"sv-SE", + "Sweden":{ + "Google": "sv-SE", + "Whisper": "sv", + }, }, "Thai":{ - "Thailand":"th-TH", + "Thailand":{ + "Google": "th-TH", + "Whisper": "th", + }, }, "Turkish":{ - "Turkey":"tr-TR", + "Turkey":{ + "Google": "tr-TR", + "Whisper": "tr", + }, }, "Ukrainian":{ - "Ukraine":"uk-UA", + "Ukraine":{ + "Google": "uk-UA", + "Whisper": "uk", + }, }, "Vietnamese":{ - "Vietnam":"vi-VN", - }, - "Zulu":{ - "South Africa":"zu-ZA" + "Vietnam":{ + "Google": "vi-VN", + "Whisper": "vi", + }, }, } \ No newline at end of file diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index fbea0e74..526c12dc 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases): + def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, whisper_weight_path): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -30,47 +30,59 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } - self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1) + if whisper_enabled is True: + self.whisper_model = WhisperModel( + model_size_or_path=whisper_weight_type, + device="cpu", + device_index=0, + compute_type="int8", + cpu_threads=4, + num_workers=1, + download_root=whisper_weight_path) + else: + self.whisper_model = None - def transcribeAudioQueue(self, audio_queue, language, country): + def transcribeAudioQueue(self, recognizer, audio_queue, language, country): # while True: audio, time_spoken = audio_queue.get() self.updateLastSampleAndPhraseStatus(audio, time_spoken) text = '' try: - # fd, path = tempfile.mkstemp(suffix=".wav") - # os.close(fd) - audio_data = self.audio_sources["process_data_func"]() - text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country]) + # Whisperが使用できない場合はGoogle Speech-to-Textを使用する + if recognizer == "Whisper": + if self.whisper_model is None: + recognizer = "Google" - audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 - if isinstance(audio_data, torch.Tensor): - audio_data = audio_data.detach().numpy() - segments, _ = self.whisper_model.transcribe( - audio_data, - beam_size=5, - temperature=0.0, - log_prob_threshold=-0.8, - no_speech_threshold=0.6, - language="ja", - word_timestamps=False, - without_timestamps=True, - task="transcribe", - vad_filter=False, - ) - _text = "" - for s in segments: - if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6: - continue - _text += s.text - print(_text) + audio_data = self.audio_sources["process_data_func"]() + match recognizer: + case "Google": + text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer]) + case "Whisper": + audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 + if isinstance(audio_data, torch.Tensor): + audio_data = audio_data.detach().numpy() + segments, _ = self.whisper_model.transcribe( + audio_data, + beam_size=5, + temperature=0.0, + log_prob_threshold=-0.8, + no_speech_threshold=0.6, + language=transcription_lang[language][country][recognizer], + word_timestamps=False, + without_timestamps=True, + task="transcribe", + vad_filter=False, + ) + for s in segments: + if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6: + continue + text += s.text except Exception: pass finally: pass - # os.unlink(path) if text != '': self.updateTranscript(text) diff --git a/models/transcription/transcription_utils.py b/models/transcription/transcription_utils.py index f40defeb..8de17e7e 100644 --- a/models/transcription/transcription_utils.py +++ b/models/transcription/transcription_utils.py @@ -1,4 +1,8 @@ from pyaudiowpatch import PyAudio, paWASAPI +from faster_whisper.utils import download_model +import logging +logger = logging.getLogger('faster_whisper') +logger.setLevel(logging.CRITICAL) def getInputDevices(): devices = {} @@ -44,4 +48,38 @@ def getDefaultOutputDevice(): if default_speakers["name"] in loopback["name"]: default_device = loopback return default_device - return {"name":"NoDevice"} \ No newline at end of file + return {"name":"NoDevice"} + +def downloadWhisperWeight(weight_type, path): + result = False + try: + download_model( + weight_type, + cache_dir=path) + result = True + except Exception: + pass + return result + +def checkWhisperWeight(weight_type, path): + result = False + try: + result = download_model( + weight_type, + local_files_only=True, + cache_dir=path) + result = True + except Exception: + pass + return result + +if __name__ == "__main__": + + + downloadWhisperWeight("base", "./weight/whisper/") + + from faster_whisper import WhisperModel + whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1, download_root="./weight/whisper/") + + print(checkWhisperWeight("base", "./weight/whisper/")) + print(checkWhisperWeight("tiny", "./weight/whisper/")) \ No newline at end of file diff --git a/view.py b/view.py index 34711688..cf90dcfa 100644 --- a/view.py +++ b/view.py @@ -280,7 +280,7 @@ class View(): VAR_DESC_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.ctranslate2_weight_type.desc")), DICT_CTRANSLATE2_WEIGHT_TYPE=self.getSelectableCtranslate2WeightTypeDict(), CALLBACK_SET_CTRANSLATE2_WEIGHT_TYPE=None, - VAR_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=self.getSelectableCtranslate2WeightTypeDict()[config.WEIGHT_TYPE]), + VAR_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=self.getSelectableCtranslate2WeightTypeDict()[config.CTRANSLATE2_WEIGHT_TYPE]), VAR_LABEL_DEEPL_AUTH_KEY=StringVar(value=i18n.t( "config_window.deepl_auth_key.label")), VAR_DESC_DEEPL_AUTH_KEY=StringVar( @@ -1069,7 +1069,7 @@ class View(): self.view_variable.VAR_CTRANSLATE2_WEIGHT_TYPE.set(self.getSelectableCtranslate2WeightTypeDict()[selected_weight_type]) def setLatestCTranslate2WeightType(self): - selected_weight_type = self.getSelectableCtranslate2WeightTypeDict()[config.WEIGHT_TYPE] + selected_weight_type = self.getSelectableCtranslate2WeightTypeDict()[config.CTRANSLATE2_WEIGHT_TYPE] self.view_variable.VAR_CTRANSLATE2_WEIGHT_TYPE.set(selected_weight_type) From 10b8d115a118f3cfeaf400af76186115c084950b Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Wed, 31 Jan 2024 22:50:31 +0900 Subject: [PATCH 03/11] =?UTF-8?q?[WIP/TEST]=20faster-whisper=20model=20wei?= =?UTF-8?q?ght=20=E3=81=AE=E3=83=80=E3=82=A6=E3=83=B3=E3=83=AD=E3=83=BC?= =?UTF-8?q?=E3=83=89/=E3=83=99=E3=83=AA=E3=83=95=E3=82=A1=E3=82=A4?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=82=92=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 8 +- model.py | 4 +- .../transcription_transcriber.py | 13 +-- models/transcription/transcription_utils.py | 40 +------- models/transcription/transcription_whisper.py | 98 +++++++++++++++++++ 5 files changed, 111 insertions(+), 52 deletions(-) create mode 100644 models/transcription/transcription_whisper.py diff --git a/main.py b/main.py index cf80e289..4aaa7232 100644 --- a/main.py +++ b/main.py @@ -11,8 +11,14 @@ if __name__ == "__main__": from models.translation.utils import downloadCTranslate2Weight if config.USE_TRANSLATION_FEATURE is True: downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress) - splash.toProgress(0) + + # whisperのダウンロードの説明に変更する必要あり + if config.USE_RECOGNIZER_FEATURE is True: + from models.transcription.transcription_whisper import downloadWhisperWeight + downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress) + splash.toProgress(0) + import controller controller.createMainWindow(splash) splash.destroySplash() diff --git a/model.py b/model.py index 61ff24d7..6b73bece 100644 --- a/model.py +++ b/model.py @@ -337,7 +337,7 @@ class Model: max_phrases=config.INPUT_MIC_MAX_PHRASES, whisper_enabled=config.USE_RECOGNIZER_FEATURE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, - whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"), + root=config.PATH_LOCAL, ) def sendMicTranscript(): mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) @@ -421,7 +421,7 @@ class Model: max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, whisper_enabled=config.USE_RECOGNIZER_FEATURE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, - whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"), + root=config.PATH_LOCAL, ) def sendSpeakerTranscript(): speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index 526c12dc..0f5b1790 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -5,16 +5,16 @@ from speech_recognition import Recognizer, AudioData, AudioFile from datetime import timedelta from pyaudiowpatch import get_sample_size, paInt16 from .transcription_languages import transcription_lang +from .transcription_whisper import getWhisperModel import torch import numpy as np -from faster_whisper import WhisperModel PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, whisper_weight_path): + def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, root): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -31,14 +31,7 @@ class AudioTranscriber: "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } if whisper_enabled is True: - self.whisper_model = WhisperModel( - model_size_or_path=whisper_weight_type, - device="cpu", - device_index=0, - compute_type="int8", - cpu_threads=4, - num_workers=1, - download_root=whisper_weight_path) + self.whisper_model = getWhisperModel(root, whisper_weight_type) else: self.whisper_model = None diff --git a/models/transcription/transcription_utils.py b/models/transcription/transcription_utils.py index 8de17e7e..f40defeb 100644 --- a/models/transcription/transcription_utils.py +++ b/models/transcription/transcription_utils.py @@ -1,8 +1,4 @@ from pyaudiowpatch import PyAudio, paWASAPI -from faster_whisper.utils import download_model -import logging -logger = logging.getLogger('faster_whisper') -logger.setLevel(logging.CRITICAL) def getInputDevices(): devices = {} @@ -48,38 +44,4 @@ def getDefaultOutputDevice(): if default_speakers["name"] in loopback["name"]: default_device = loopback return default_device - return {"name":"NoDevice"} - -def downloadWhisperWeight(weight_type, path): - result = False - try: - download_model( - weight_type, - cache_dir=path) - result = True - except Exception: - pass - return result - -def checkWhisperWeight(weight_type, path): - result = False - try: - result = download_model( - weight_type, - local_files_only=True, - cache_dir=path) - result = True - except Exception: - pass - return result - -if __name__ == "__main__": - - - downloadWhisperWeight("base", "./weight/whisper/") - - from faster_whisper import WhisperModel - whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1, download_root="./weight/whisper/") - - print(checkWhisperWeight("base", "./weight/whisper/")) - print(checkWhisperWeight("tiny", "./weight/whisper/")) \ No newline at end of file + return {"name":"NoDevice"} \ No newline at end of file diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py new file mode 100644 index 00000000..dc606cb7 --- /dev/null +++ b/models/transcription/transcription_whisper.py @@ -0,0 +1,98 @@ +from os import path as os_path, makedirs as os_makedirs +from requests import get as requests_get +from typing import Callable +import huggingface_hub +from faster_whisper import WhisperModel +import logging +logger = logging.getLogger('faster_whisper') +logger.setLevel(logging.CRITICAL) + +_MODELS = { + "tiny.en": "Systran/faster-whisper-tiny.en", + "tiny": "Systran/faster-whisper-tiny", + "base.en": "Systran/faster-whisper-base.en", + "base": "Systran/faster-whisper-base", + "small.en": "Systran/faster-whisper-small.en", + "small": "Systran/faster-whisper-small", + "medium.en": "Systran/faster-whisper-medium.en", + "medium": "Systran/faster-whisper-medium", + "large-v1": "Systran/faster-whisper-large-v1", + "large-v2": "Systran/faster-whisper-large-v2", + "large-v3": "Systran/faster-whisper-large-v3", + "large": "Systran/faster-whisper-large-v3", +} + +_FILENAMES = [ + "config.json", + "preprocessor_config.json", + "model.bin", + "tokenizer.json", + "vocabulary.txt", +] + +def downloadFile(url, path, func=None): + try: + res = requests_get(url, stream=True) + res.raise_for_status() + file_size = int(res.headers.get('content-length', 0)) + total_chunk = 0 + with open(os_path.join(path), 'wb') as file: + for chunk in res.iter_content(chunk_size=1024*5): + file.write(chunk) + if isinstance(func, Callable): + total_chunk += len(chunk) + func(total_chunk/file_size) + + except Exception as e: + print("error:downloadFile()", e) + +def checkWhisperWeight(path): + result = False + try: + WhisperModel( + path, + device="cpu", + device_index=0, + compute_type="int8", + cpu_threads=4, + num_workers=1, + local_files_only=True, + ) + result = True + except Exception: + pass + return result + +def downloadWhisperWeight(root, weight_type, callbackFunc): + path = os_path.join(root, "weight", "whisper", weight_type) + os_makedirs(path, exist_ok=True) + if checkWhisperWeight(path) is True: + return + + for filename in _FILENAMES: + print("Downloading", filename, "...") + file_path = os_path.join(path, filename) + url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename) + downloadFile(url, file_path, func=callbackFunc) + +def getWhisperModel(root, weight_type): + path = os_path.join(root, "weight", "whisper", weight_type) + return WhisperModel( + path, + device="cpu", + device_index=0, + compute_type="int8", + cpu_threads=4, + num_workers=1, + local_files_only=True, + ) + +if __name__ == "__main__": + def callback(value): + print(value) + + downloadWhisperWeight("./", "tiny", callback) + downloadWhisperWeight("./", "base", callback) + downloadWhisperWeight("./", "small", callback) + downloadWhisperWeight("./", "medium", callback) + downloadWhisperWeight("./", "large", callback) \ No newline at end of file From e4c685d3822bf8efd7636e3b302e3f1e729eb7e5 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Thu, 1 Feb 2024 13:40:24 +0900 Subject: [PATCH 04/11] [WIP/TEST] Config : USE_RECOGNIZER_FEATURE -> USE_WHISPER_FEATURE --- config.py | 19 +++++++------------ main.py | 2 +- model.py | 4 ++-- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/config.py b/config.py index 6acf5e3f..c59c0f17 100644 --- a/config.py +++ b/config.py @@ -584,14 +584,14 @@ class Config: saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property - @json_serializable('USE_RECOGNIZER_FEATURE') - def USE_RECOGNIZER_FEATURE(self): - return self._USE_RECOGNIZER_FEATURE + @json_serializable('USE_WHISPER_FEATURE') + def USE_WHISPER_FEATURE(self): + return self._USE_WHISPER_FEATURE - @USE_RECOGNIZER_FEATURE.setter - def USE_RECOGNIZER_FEATURE(self, value): + @USE_WHISPER_FEATURE.setter + def USE_WHISPER_FEATURE(self, value): if isinstance(value, bool): - self._USE_RECOGNIZER_FEATURE = value + self._USE_WHISPER_FEATURE = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property @@ -797,17 +797,12 @@ class Config: self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT = { # {Save json str}: {i18n_placeholder} pairs "tiny": "tiny", - "tiny.en": "tiny.en", "base": "base", - "base.en": "base.en", "small": "small", - "small.en": "small.en", "medium": "medium", - "medium.en": "medium.en", "large-v1": "large-v1", "large-v2": "large-v2", "large-v3": "large-v3", - "large": "large", } self._MAX_MIC_ENERGY_THRESHOLD = 2000 @@ -887,7 +882,7 @@ class Config: } self._USE_TRANSLATION_FEATURE = True self._CTRANSLATE2_WEIGHT_TYPE = "Small" - self._USE_RECOGNIZER_FEATURE = True + self._USE_WHISPER_FEATURE = True self._WHISPER_WEIGHT_TYPE = "base" self._SEND_MESSAGE_FORMAT = "[message]" self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])" diff --git a/main.py b/main.py index 4aaa7232..37bc53af 100644 --- a/main.py +++ b/main.py @@ -14,7 +14,7 @@ if __name__ == "__main__": splash.toProgress(0) # whisperのダウンロードの説明に変更する必要あり - if config.USE_RECOGNIZER_FEATURE is True: + if config.USE_WHISPER_FEATURE is True: from models.transcription.transcription_whisper import downloadWhisperWeight downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress) splash.toProgress(0) diff --git a/model.py b/model.py index 6b73bece..98d0a896 100644 --- a/model.py +++ b/model.py @@ -335,7 +335,7 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, - whisper_enabled=config.USE_RECOGNIZER_FEATURE, + whisper_enabled=config.USE_WHISPER_FEATURE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) @@ -419,7 +419,7 @@ class Model: source=self.speaker_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, - whisper_enabled=config.USE_RECOGNIZER_FEATURE, + whisper_enabled=config.USE_WHISPER_FEATURE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) From 7cb8c473d4adb8dc1377fa13eb8f14a29bee2afe Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Thu, 1 Feb 2024 13:41:31 +0900 Subject: [PATCH 05/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20large=E3=83=A2?= =?UTF-8?q?=E3=83=87=E3=83=AB=E3=82=92=E3=83=80=E3=82=A6=E3=83=B3=E3=83=AD?= =?UTF-8?q?=E3=83=BC=E3=83=89=E5=87=A6=E7=90=86=E3=82=92=E4=BF=AE=E6=AD=A3?= =?UTF-8?q?/=20en=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/transcription/transcription_whisper.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index dc606cb7..67ad61f0 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -8,18 +8,13 @@ logger = logging.getLogger('faster_whisper') logger.setLevel(logging.CRITICAL) _MODELS = { - "tiny.en": "Systran/faster-whisper-tiny.en", "tiny": "Systran/faster-whisper-tiny", - "base.en": "Systran/faster-whisper-base.en", "base": "Systran/faster-whisper-base", - "small.en": "Systran/faster-whisper-small.en", "small": "Systran/faster-whisper-small", - "medium.en": "Systran/faster-whisper-medium.en", "medium": "Systran/faster-whisper-medium", "large-v1": "Systran/faster-whisper-large-v1", "large-v2": "Systran/faster-whisper-large-v2", "large-v3": "Systran/faster-whisper-large-v3", - "large": "Systran/faster-whisper-large-v3", } _FILENAMES = [ @@ -28,6 +23,7 @@ _FILENAMES = [ "model.bin", "tokenizer.json", "vocabulary.txt", + "vocabulary.json", ] def downloadFile(url, path, func=None): @@ -67,6 +63,7 @@ def downloadWhisperWeight(root, weight_type, callbackFunc): path = os_path.join(root, "weight", "whisper", weight_type) os_makedirs(path, exist_ok=True) if checkWhisperWeight(path) is True: + print("weight_type:", weight_type, checkWhisperWeight(path)) return for filename in _FILENAMES: @@ -75,6 +72,8 @@ def downloadWhisperWeight(root, weight_type, callbackFunc): url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename) downloadFile(url, file_path, func=callbackFunc) + print("weight_type:", weight_type, checkWhisperWeight(path)) + def getWhisperModel(root, weight_type): path = os_path.join(root, "weight", "whisper", weight_type) return WhisperModel( @@ -90,9 +89,12 @@ def getWhisperModel(root, weight_type): if __name__ == "__main__": def callback(value): print(value) + pass downloadWhisperWeight("./", "tiny", callback) downloadWhisperWeight("./", "base", callback) downloadWhisperWeight("./", "small", callback) downloadWhisperWeight("./", "medium", callback) - downloadWhisperWeight("./", "large", callback) \ No newline at end of file + downloadWhisperWeight("./", "large-v1", callback) + downloadWhisperWeight("./", "large-v2", callback) + downloadWhisperWeight("./", "large-v3", callback) \ No newline at end of file From 1de239549f7dc3c00b55fefb1dc46da35aec2b24 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Thu, 1 Feb 2024 15:49:17 +0900 Subject: [PATCH 06/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20=E3=83=A2?= =?UTF-8?q?=E3=83=87=E3=83=AB=E3=81=AE=E4=BF=9D=E5=AD=98=E4=BD=8D=E7=BD=AE?= =?UTF-8?q?=E3=81=AE=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - speakerの文字起こし処理のバグを修正 --- .gitignore | 2 +- main.py | 5 ++-- model.py | 4 ++-- models/transcription/transcription_whisper.py | 7 ++---- models/translation/translation_translator.py | 8 +++---- .../{utils.py => translation_utils.py} | 24 +++++++++---------- 6 files changed, 23 insertions(+), 27 deletions(-) rename models/translation/{utils.py => translation_utils.py} (78%) diff --git a/.gitignore b/.gitignore index 75c28a41..52825c27 100644 --- a/.gitignore +++ b/.gitignore @@ -6,7 +6,7 @@ VRCT.spec *.pyc logs/ .venv/ -weight/ +weights/ .vscode error.log *.exe diff --git a/main.py b/main.py index 37bc53af..0df15326 100644 --- a/main.py +++ b/main.py @@ -8,14 +8,13 @@ if __name__ == "__main__": splash.showSplash() from config import config - from models.translation.utils import downloadCTranslate2Weight + from models.translation.translation_utils import downloadCTranslate2Weight if config.USE_TRANSLATION_FEATURE is True: downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress) - splash.toProgress(0) + from models.transcription.transcription_whisper import downloadWhisperWeight # whisperのダウンロードの説明に変更する必要あり if config.USE_WHISPER_FEATURE is True: - from models.transcription.transcription_whisper import downloadWhisperWeight downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress) splash.toProgress(0) diff --git a/model.py b/model.py index 98d0a896..2c29d4c7 100644 --- a/model.py +++ b/model.py @@ -23,7 +23,7 @@ from models.transcription.transcription_transcriber import AudioTranscriber from models.xsoverlay.notification import xsoverlayForVRCT from models.translation.translation_languages import translation_lang from models.transcription.transcription_languages import transcription_lang -from models.translation.utils import checkCTranslate2Weight +from models.translation.translation_utils import checkCTranslate2Weight from config import config class threadFnc(Thread): @@ -424,7 +424,7 @@ class Model: root=config.PATH_LOCAL, ) def sendSpeakerTranscript(): - speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + speaker_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) message = speaker_transcriber.getTranscript() try: fnc(message) diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index 67ad61f0..e30fee2d 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -60,10 +60,9 @@ def checkWhisperWeight(path): return result def downloadWhisperWeight(root, weight_type, callbackFunc): - path = os_path.join(root, "weight", "whisper", weight_type) + path = os_path.join(root, "weights", "whisper", weight_type) os_makedirs(path, exist_ok=True) if checkWhisperWeight(path) is True: - print("weight_type:", weight_type, checkWhisperWeight(path)) return for filename in _FILENAMES: @@ -72,10 +71,8 @@ def downloadWhisperWeight(root, weight_type, callbackFunc): url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename) downloadFile(url, file_path, func=callbackFunc) - print("weight_type:", weight_type, checkWhisperWeight(path)) - def getWhisperModel(root, weight_type): - path = os_path.join(root, "weight", "whisper", weight_type) + path = os_path.join(root, "weights", "whisper", weight_type) return WhisperModel( path, device="cpu", diff --git a/models/translation/translation_translator.py b/models/translation/translation_translator.py index ea02e490..c966c672 100644 --- a/models/translation/translation_translator.py +++ b/models/translation/translation_translator.py @@ -2,7 +2,7 @@ import os from deepl import Translator as deepl_Translator from translators import translate_text as other_web_Translator from .translation_languages import translation_lang -from .utils import ctranslate2_weights +from .translation_utils import ctranslate2_weights import ctranslate2 import transformers @@ -27,8 +27,8 @@ class Translator(): def changeCTranslate2Model(self, path, model_type): directory_name = ctranslate2_weights[model_type]["directory_name"] tokenizer = ctranslate2_weights[model_type]["tokenizer"] - weight_path = os.path.join(path, "weight", directory_name) - tokenizer_path = os.path.join(path, "weight", directory_name, "tokenizer") + weight_path = os.path.join(path, "weights", "ctranslate2", directory_name) + tokenizer_path = os.path.join(path, "weights", "ctranslate2", directory_name, "tokenizer") self.ctranslate2_translator = ctranslate2.Translator( weight_path, device="cpu", @@ -41,7 +41,7 @@ class Translator(): self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) except Exception as e: print("Error: changeCTranslate2Model()", e) - tokenizer_path = os.path.join("./weight", directory_name, "tokenizer") + tokenizer_path = os.path.join("./weights", "ctranslate2", directory_name, "tokenizer") self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) @staticmethod diff --git a/models/translation/utils.py b/models/translation/translation_utils.py similarity index 78% rename from models/translation/utils.py rename to models/translation/translation_utils.py index d47401cf..73805cdc 100644 --- a/models/translation/utils.py +++ b/models/translation/translation_utils.py @@ -39,36 +39,36 @@ def calculate_file_hash(file_path, block_size=65536): return hash_object.hexdigest() def checkCTranslate2Weight(path, weight_type="Small"): - directory_name = 'weight' - current_directory = path weight_directory_name = ctranslate2_weights[weight_type]["directory_name"] hash_data = ctranslate2_weights[weight_type]["hash"] - files = ["model.bin", "sentencepiece.model", "shared_vocabulary.txt"] + files = [ + "model.bin", + "sentencepiece.model", + "shared_vocabulary.txt" + ] # check already downloaded already_downloaded = False - if all(os_path.exists(os_path.join(current_directory, directory_name, weight_directory_name, file)) for file in files): + if all(os_path.exists(os_path.join(path, weight_directory_name, file)) for file in files): # check hash for file in files: original_hash = hash_data[file] - current_hash = calculate_file_hash(os_path.join(current_directory, directory_name, weight_directory_name, file)) + current_hash = calculate_file_hash(os_path.join(path, weight_directory_name, file)) if original_hash != current_hash: break already_downloaded = True return already_downloaded -def downloadCTranslate2Weight(path, weight_type="Small", func=None): +def downloadCTranslate2Weight(root, weight_type="Small", func=None): url = ctranslate2_weights[weight_type]["url"] - filename = 'weight.zip' - directory_name = 'weight' - current_directory = path + filename = "weight.zip" + path = os_path.join(root, "weights", "ctranslate2") + os_makedirs(path, exist_ok=True) if checkCTranslate2Weight(path, weight_type): return try: - os_makedirs(os_path.join(current_directory, directory_name), exist_ok=True) - print(os_path.join(current_directory, directory_name)) with tempfile.TemporaryDirectory() as tmp_path: res = requests_get(url, stream=True) file_size = int(res.headers.get('content-length', 0)) @@ -81,6 +81,6 @@ def downloadCTranslate2Weight(path, weight_type="Small", func=None): func(total_chunk/file_size) with ZipFile(os_path.join(tmp_path, filename)) as zf: - zf.extractall(os_path.join(current_directory, directory_name)) + zf.extractall(path) except Exception as e: print("error:downloadCTranslate2Weight()", e) \ No newline at end of file From 78b8cb590984a36b722801db0d3f1a63953cad93 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Fri, 2 Feb 2024 13:14:56 +0900 Subject: [PATCH 07/11] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20install.bat=20:=20?= =?UTF-8?q?package=20version=20fix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- install.bat | 4 +--- requirements.txt | 8 +++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/install.bat b/install.bat index 036f6a51..8d2a5d51 100644 --- a/install.bat +++ b/install.bat @@ -1,4 +1,2 @@ python.exe -m pip install --upgrade pip -pip install -r requirements.txt -pip install git+https://github.com/misyaguziya/translators -pip install git+https://github.com/misyaguziya/custom_speech_recognition \ No newline at end of file +pip install -r requirements.txt \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 68a6ce15..cedd1568 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,9 @@ pyyaml == 6.0.1 python-i18n == 0.3.9 CTkToolTip == 0.8 pyinstaller==6.2.0 -transformers[torch] +transformers[torch]==4.37.2 sentencepiece==0.1.99 -ctranslate2==3.21.0 -faster-whisper==0.10.0 \ No newline at end of file +ctranslate2==3.24.0 +faster-whisper==0.10.0 +translators @ git+https://github.com/misyaguziya/translators@master +SpeechRecognition @ git+https://github.com/misyaguziya/custom_speech_recognition@master \ No newline at end of file From ee5c4c05ce0c5c9aa9a5aa6279ba3605cd0eede1 Mon Sep 17 00:00:00 2001 From: Sakamoto Shiina <68018796+ShiinaSakamoto@users.noreply.github.com> Date: Fri, 2 Feb 2024 18:08:18 +0900 Subject: [PATCH 08/11] =?UTF-8?q?[WIP/TEST]=20UI:=20=E6=A9=9F=E8=83=BD?= =?UTF-8?q?=E3=81=A8=E8=A6=8B=E3=81=9F=E7=9B=AE=E3=82=92=E7=B9=8B=E3=81=8E?= =?UTF-8?q?=E3=81=BE=E3=81=97=E3=81=9F=E3=80=82=E8=A8=AD=E5=AE=9A=E7=94=BB?= =?UTF-8?q?=E9=9D=A2=E3=81=8B=E3=82=89=E3=81=84=E3=81=98=E3=82=8C=E3=81=BE?= =?UTF-8?q?=E3=81=99=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- controller.py | 33 ++++++++++++ locales/en.yml | 16 ++++++ view.py | 52 +++++++++++++++++++ .../createSideMenuAndSettingsBoxContainers.py | 6 ++- .../setting_box_transcription/__init__.py | 3 +- .../createSettingBox_InternalModel.py | 37 +++++++++++++ 6 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py diff --git a/controller.py b/controller.py index 9d44b491..724d2cf3 100644 --- a/controller.py +++ b/controller.py @@ -767,6 +767,35 @@ def callbackSetSpeakerMaxPhrases(value): except Exception: view.showErrorMessage_SpeakerMaxPhrases() +# Transcription (Internal AI Model) +def callbackSetUserWhisperFeature(value): + print("callbackSetUserWhisperFeature", value) + config.USE_WHISPER_FEATURE = value + if config.USE_WHISPER_FEATURE is True: + view.openWhisperWeightTypeWidget() + else: + view.closeWhisperWeightTypeWidget() + +def callbackSetWhisperWeightType(value): + print("callbackSetWhisperWeightType", value) + config.WHISPER_WEIGHT_TYPE = str(value) + view.updateSelectedWhisperWeightType(config.WHISPER_WEIGHT_TYPE) + # view.setWidgetsStatus_changeWeightType_Pending() + # if model.checkCTranslatorCTranslate2ModelWeight(): + # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False + # def callback(): + # model.changeTranslatorCTranslate2Model() + # view.useTranslationFeatureProcess("Normal") + # view.setWidgetsStatus_changeWeightType_Done() + # th_callback = Thread(target=callback) + # th_callback.daemon = True + # th_callback.start() + # else: + # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = True + # view.useTranslationFeatureProcess("Restart") + # view.setWidgetsStatus_changeWeightType_Done() + # view.showRestartButtonIfRequired() + # Others Tab def callbackSetEnableAutoClearMessageBox(value): @@ -993,6 +1022,10 @@ def createMainWindow(splash): "callback_set_speaker_phrase_timeout": callbackSetSpeakerPhraseTimeout, "callback_set_speaker_max_phrases": callbackSetSpeakerMaxPhrases, + # Transcription Tab (Internal AI Model) + "callback_set_use_whisper_feature": callbackSetUserWhisperFeature, + "callback_set_whisper_weight_type": callbackSetWhisperWeightType, + # Others Tab "callback_set_enable_auto_clear_chatbox": callbackSetEnableAutoClearMessageBox, "callback_set_send_only_translated_messages": callbackSetEnableSendOnlyTranslatedMessages, diff --git a/locales/en.yml b/locales/en.yml index 2806ea91..f68aa32c 100644 --- a/locales/en.yml +++ b/locales/en.yml @@ -79,6 +79,7 @@ config_window: transcription: Transcription transcription_mic: Mic transcription_speaker: Speaker + transcription_internal_model: Internal Model others: Others others_send_message_formats: Message Formats (Send) others_received_message_formats: Message Formats (Received) @@ -125,6 +126,21 @@ config_window: small: "Basic model (%{capacity})" large: "High accuracy model (%{capacity})" + use_whisper_feature: + label: Use Whisper Feature + desc: Description + + whisper_weight_type: + label: Select Whisper Model + desc: Description + tiny: "tiny model (%{capacity})" + base: "base model (%{capacity})" + small: "small model (%{capacity})" + medium: "medium model (%{capacity})" + large_v1: "large_v1 model (%{capacity})" + large_v2: "large_v2 model (%{capacity})" + large_v3: "large_v3 model (%{capacity})" + deepl_auth_key: label: DeepL Auth Key desc: Please select %{translator} on the main screen with DeepL_API when using. ※Some languages may not be supported. diff --git a/view.py b/view.py index cf90dcfa..6f7a6d7e 100644 --- a/view.py +++ b/view.py @@ -211,6 +211,7 @@ class View(): VAR_SIDE_MENU_LABEL_TRANSCRIPTION=StringVar(value=i18n.t("config_window.side_menu_labels.transcription")), VAR_SECOND_TITLE_TRANSCRIPTION_MIC=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_mic")), VAR_SECOND_TITLE_TRANSCRIPTION_SPEAKER=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_speaker")), + VAR_SECOND_TITLE_TRANSCRIPTION_INTERNAL_MODEL=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_internal_model")), VAR_SIDE_MENU_LABEL_OTHERS=StringVar(value=i18n.t("config_window.side_menu_labels.others")), VAR_SIDE_MENU_LABEL_ADVANCED_SETTINGS=StringVar(value=i18n.t("config_window.side_menu_labels.advanced_settings")), @@ -381,6 +382,19 @@ class View(): CALLBACK_FOCUS_OUT_SPEAKER_MAX_PHRASES=self.callbackBindFocusOut_SpeakerMaxPhrases, + # Transcription Tab (Whisper Internal AI Model) + VAR_LABEL_USE_WHISPER_FEATURE=StringVar(value=i18n.t("config_window.use_whisper_feature.label")), + VAR_DESC_USE_WHISPER_FEATURE=StringVar(value=i18n.t("config_window.use_whisper_feature.desc")), + CALLBACK_SET_USE_WHISPER_FEATURE=None, + VAR_USE_WHISPER_FEATURE=BooleanVar(value=config.USE_WHISPER_FEATURE), + + VAR_LABEL_WHISPER_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.whisper_weight_type.label")), + VAR_DESC_WHISPER_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.whisper_weight_type.desc")), + DICT_WHISPER_WEIGHT_TYPE=self.getSelectableWhisperWeightTypeDict(), + CALLBACK_SET_WHISPER_WEIGHT_TYPE=None, + VAR_WHISPER_WEIGHT_TYPE=StringVar(value=self.getSelectableWhisperWeightTypeDict()[config.WHISPER_WEIGHT_TYPE]), + + # Others Tab VAR_LABEL_ENABLE_AUTO_CLEAR_MESSAGE_BOX=StringVar(value=i18n.t("config_window.auto_clear_the_message_box.label")), VAR_DESC_ENABLE_AUTO_CLEAR_MESSAGE_BOX=None, @@ -624,6 +638,11 @@ class View(): self.view_variable.CALLBACK_SET_SPEAKER_PHRASE_TIMEOUT = config_window_registers.get("callback_set_speaker_phrase_timeout", None) self.view_variable.CALLBACK_SET_SPEAKER_MAX_PHRASES = config_window_registers.get("callback_set_speaker_max_phrases", None) + # Transcription Tab (Internal AI Model) + self.view_variable.CALLBACK_SET_USE_WHISPER_FEATURE = config_window_registers.get("callback_set_use_whisper_feature", None) + self.view_variable.CALLBACK_SET_WHISPER_WEIGHT_TYPE = config_window_registers.get("callback_set_whisper_weight_type", None) + + # Others Tab self.view_variable.CALLBACK_SET_ENABLE_AUTO_CLEAR_MESSAGE_BOX = config_window_registers.get("callback_set_enable_auto_clear_chatbox", None) self.view_variable.CALLBACK_SET_ENABLE_SEND_ONLY_TRANSLATED_MESSAGES = config_window_registers.get("callback_set_send_only_translated_messages", None) @@ -678,6 +697,11 @@ class View(): ) self.replaceMicThresholdCheckButton_Disabled() + if config.USE_WHISPER_FEATURE is True: + self.openWhisperWeightTypeWidget() + else: + self.closeWhisperWeightTypeWidget() + if config.ENABLE_SPEAKER2CHATBOX is False: vrct_gui._changeConfigWindowWidgetsStatus( status="disabled", @@ -919,6 +943,17 @@ class View(): vrct_gui.update() vrct_gui.config_window.lift() + @staticmethod + def getSelectableWhisperWeightTypeDict(): + return { + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="t"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="b"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="s"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="m"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="l_v1"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="l_v2"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="l_v3"), + } # Open Webpage Functions def openWebPage_Booth(self): @@ -1082,6 +1117,23 @@ class View(): vrct_gui.config_window.sb__ctranslate2_weight_type.grid_remove() + def openWhisperWeightTypeWidget(self): + vrct_gui.config_window.sb__use_whisper_feature.grid() + vrct_gui.config_window.sb__whisper_weight_type.grid() + + def closeWhisperWeightTypeWidget(self): + vrct_gui.config_window.sb__use_whisper_feature.grid() + vrct_gui.config_window.sb__whisper_weight_type.grid_remove() + + + def updateSelectedWhisperWeightType(self, selected_weight_type:str): + self.view_variable.VAR_WHISPER_WEIGHT_TYPE.set(self.getSelectableWhisperWeightTypeDict()[selected_weight_type]) + + def setLatestCTranslate2WeightType(self): + selected_weight_type = self.getSelectableWhisperWeightTypeDict()[config.WHISPER_WEIGHT_TYPE] + self.view_variable.VAR_WHISPER_WEIGHT_TYPE.set(selected_weight_type) + + def openMicEnergyThresholdWidget(self): self.view_variable.VAR_LABEL_MIC_DYNAMIC_ENERGY_THRESHOLD.set(i18n.t("config_window.mic_dynamic_energy_threshold.label_for_manual")) self.view_variable.VAR_DESC_MIC_DYNAMIC_ENERGY_THRESHOLD.set(i18n.t("config_window.mic_dynamic_energy_threshold.desc_for_manual")) diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py index 30af50de..49272afc 100644 --- a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py +++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py @@ -7,7 +7,7 @@ from ._createSettingBoxContainer import _createSettingBoxContainer from .setting_box_containers.setting_box_appearance import createSettingBox_Appearance -from .setting_box_containers.setting_box_transcription import createSettingBox_Mic, createSettingBox_Speaker +from .setting_box_containers.setting_box_transcription import createSettingBox_Mic, createSettingBox_Speaker, createSettingBox_InternalModel from .setting_box_containers.setting_box_others import createSettingBox_Others, createSettingBox_Others_SendMessageFormats, createSettingBox_Others_ReceivedMessageFormats, createSettingBox_Others_Additional from .setting_box_containers.setting_box_advanced_settings import createSettingBox_AdvancedSettings from .setting_box_containers.setting_box_translation import createSettingBox_Translation @@ -94,6 +94,10 @@ def createSideMenuAndSettingsBoxContainers(config_window, settings, view_variabl "var_section_title": view_variable.VAR_SECOND_TITLE_TRANSCRIPTION_SPEAKER, "setting_box": createSettingBox_Speaker }, + { + "var_section_title": view_variable.VAR_SECOND_TITLE_TRANSCRIPTION_INTERNAL_MODEL, + "setting_box": createSettingBox_InternalModel + }, ] }, }, diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py index 5383094e..b06ff822 100644 --- a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py +++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py @@ -1,2 +1,3 @@ from .createSettingBox_Mic import createSettingBox_Mic -from .createSettingBox_Speaker import createSettingBox_Speaker \ No newline at end of file +from .createSettingBox_Speaker import createSettingBox_Speaker +from .createSettingBox_InternalModel import createSettingBox_InternalModel \ No newline at end of file diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py new file mode 100644 index 00000000..0a6b3e69 --- /dev/null +++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py @@ -0,0 +1,37 @@ +from utils import callFunctionIfCallable + +from .._SettingBoxGenerator import _SettingBoxGenerator + +def createSettingBox_InternalModel(setting_box_wrapper, config_window, settings, view_variable): + sbg = _SettingBoxGenerator(setting_box_wrapper, config_window, settings, view_variable) + createSettingBoxSwitch = sbg.createSettingBoxSwitch + createSettingBoxDropdownMenu = sbg.createSettingBoxDropdownMenu + + def switchUseWhisperFeatureCallback(switch_widget): + callFunctionIfCallable(view_variable.CALLBACK_SET_USE_WHISPER_FEATURE, switch_widget.get()) + + def optionmenuWhisperWeightTypeCallback(value): + callFunctionIfCallable(view_variable.CALLBACK_SET_WHISPER_WEIGHT_TYPE, value) + + + row=0 + config_window.sb__use_whisper_feature = createSettingBoxSwitch( + for_var_label_text=view_variable.VAR_LABEL_USE_WHISPER_FEATURE, + for_var_desc_text=view_variable.VAR_DESC_USE_WHISPER_FEATURE, + switch_attr_name="sb__switch_use_whisper_feature", + command=lambda: switchUseWhisperFeatureCallback(config_window.sb__switch_use_whisper_feature), + variable=view_variable.VAR_USE_WHISPER_FEATURE + ) + config_window.sb__use_whisper_feature.grid(row=row, pady=0) + row+=1 + + config_window.sb__whisper_weight_type = createSettingBoxDropdownMenu( + for_var_label_text=view_variable.VAR_LABEL_WHISPER_WEIGHT_TYPE, + for_var_desc_text=view_variable.VAR_DESC_WHISPER_WEIGHT_TYPE, + optionmenu_attr_name="sb__optionmenu_whisper_weight_type", + dropdown_menu_values=view_variable.DICT_WHISPER_WEIGHT_TYPE, + command=lambda value: optionmenuWhisperWeightTypeCallback(value), + variable=view_variable.VAR_WHISPER_WEIGHT_TYPE, + ) + config_window.sb__whisper_weight_type.grid(row=row, pady=0) + row+=1 \ No newline at end of file From 801d948513b1dd0d891f3b85cf582f524f537063 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Sat, 3 Feb 2024 02:35:40 +0900 Subject: [PATCH 09/11] =?UTF-8?q?[WIP/TEST]=20Wisper=E3=81=AE=E5=87=A6?= =?UTF-8?q?=E7=90=86=E3=81=AB=E3=81=A4=E3=81=84=E3=81=A6UI=E3=81=A8?= =?UTF-8?q?=E5=86=85=E9=83=A8=E3=81=AE=E5=87=A6=E7=90=86=E3=82=92=E6=8E=A5?= =?UTF-8?q?=E7=B6=9A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 24 ++++++++++----- controller.py | 30 +++++++++---------- model.py | 12 +++++--- .../transcription_transcriber.py | 28 ++++++++--------- models/transcription/transcription_whisper.py | 5 ++-- view.py | 4 ++- 6 files changed, 57 insertions(+), 46 deletions(-) diff --git a/config.py b/config.py index c59c0f17..6ce32035 100644 --- a/config.py +++ b/config.py @@ -210,6 +210,15 @@ class Config: if isinstance(value, bool): self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = value + @property + def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self): + return self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER + + @IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER.setter + def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self, value): + if isinstance(value, bool): + self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = value + # Save Json Data ## Main Window @property @@ -268,14 +277,14 @@ class Config: saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property - @json_serializable('SELECTED_RECOGNIZER') - def SELECTED_RECOGNIZER(self): - return self._SELECTED_RECOGNIZER + @json_serializable('SELECTED_TRANSCRIPTION_ENGINE') + def SELECTED_TRANSCRIPTION_ENGINE(self): + return self._SELECTED_TRANSCRIPTION_ENGINE - @SELECTED_RECOGNIZER.setter - def SELECTED_RECOGNIZER(self, value): + @SELECTED_TRANSCRIPTION_ENGINE.setter + def SELECTED_TRANSCRIPTION_ENGINE(self, value): if isinstance(value, str): - self._SELECTED_RECOGNIZER = value + self._SELECTED_TRANSCRIPTION_ENGINE = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) @property @@ -820,6 +829,7 @@ class Config: self._TARGET_LANGUAGE = "English" self._TARGET_COUNTRY = "United States" self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False + self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False # Save Json Data ## Main Window @@ -844,7 +854,7 @@ class Config: "2":"English\n(United States)", "3":"English\n(United States)", } - self._SELECTED_RECOGNIZER = "Google" + self._SELECTED_TRANSCRIPTION_ENGINE = "Google" self._IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE = False ## Config Window diff --git a/controller.py b/controller.py index 724d2cf3..e63101b2 100644 --- a/controller.py +++ b/controller.py @@ -773,29 +773,27 @@ def callbackSetUserWhisperFeature(value): config.USE_WHISPER_FEATURE = value if config.USE_WHISPER_FEATURE is True: view.openWhisperWeightTypeWidget() + if model.checkTranscriptionWhisperModelWeight() is True: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False + config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper" + else: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True + config.SELECTED_TRANSCRIPTION_ENGINE = "Google" else: view.closeWhisperWeightTypeWidget() + view.showRestartButtonIfRequired() def callbackSetWhisperWeightType(value): print("callbackSetWhisperWeightType", value) config.WHISPER_WEIGHT_TYPE = str(value) view.updateSelectedWhisperWeightType(config.WHISPER_WEIGHT_TYPE) - # view.setWidgetsStatus_changeWeightType_Pending() - # if model.checkCTranslatorCTranslate2ModelWeight(): - # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False - # def callback(): - # model.changeTranslatorCTranslate2Model() - # view.useTranslationFeatureProcess("Normal") - # view.setWidgetsStatus_changeWeightType_Done() - # th_callback = Thread(target=callback) - # th_callback.daemon = True - # th_callback.start() - # else: - # config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = True - # view.useTranslationFeatureProcess("Restart") - # view.setWidgetsStatus_changeWeightType_Done() - # view.showRestartButtonIfRequired() - + if model.checkTranscriptionWhisperModelWeight() is True: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False + config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper" + else: + config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True + config.SELECTED_TRANSCRIPTION_ENGINE = "Google" + view.showRestartButtonIfRequired() # Others Tab def callbackSetEnableAutoClearMessageBox(value): diff --git a/model.py b/model.py index 2c29d4c7..228bc253 100644 --- a/model.py +++ b/model.py @@ -24,6 +24,7 @@ from models.xsoverlay.notification import xsoverlayForVRCT from models.translation.translation_languages import translation_lang from models.transcription.transcription_languages import transcription_lang from models.translation.translation_utils import checkCTranslate2Weight +from models.transcription.transcription_whisper import checkWhisperWeight from config import config class threadFnc(Thread): @@ -74,6 +75,9 @@ class Model: def changeTranslatorCTranslate2Model(self): self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE) + def checkTranscriptionWhisperModelWeight(self): + return checkWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE) + def resetKeywordProcessor(self): del self.keyword_processor self.keyword_processor = KeywordProcessor() @@ -335,12 +339,12 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, - whisper_enabled=config.USE_WHISPER_FEATURE, + transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) def sendMicTranscript(): - mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) message = mic_transcriber.getTranscript() try: fnc(message) @@ -419,12 +423,12 @@ class Model: source=self.speaker_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, - whisper_enabled=config.USE_WHISPER_FEATURE, + transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, ) def sendSpeakerTranscript(): - speaker_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) message = speaker_transcriber.getTranscript() try: fnc(message) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index 0f5b1790..b24d3163 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, root): + def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -30,38 +30,34 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } - if whisper_enabled is True: - self.whisper_model = getWhisperModel(root, whisper_weight_type) - else: - self.whisper_model = None + self.transcription_engine = transcription_engine + match self.transcription_engine: + case "Google": + self.audio_recognizer = Recognizer() + case "Whisper": + self.audio_recognizer = getWhisperModel(root, whisper_weight_type) - def transcribeAudioQueue(self, recognizer, audio_queue, language, country): - # while True: + def transcribeAudioQueue(self, audio_queue, language, country): audio, time_spoken = audio_queue.get() self.updateLastSampleAndPhraseStatus(audio, time_spoken) text = '' try: - # Whisperが使用できない場合はGoogle Speech-to-Textを使用する - if recognizer == "Whisper": - if self.whisper_model is None: - recognizer = "Google" - audio_data = self.audio_sources["process_data_func"]() - match recognizer: + match self.transcription_engine: case "Google": - text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer]) + text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine]) case "Whisper": audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 if isinstance(audio_data, torch.Tensor): audio_data = audio_data.detach().numpy() - segments, _ = self.whisper_model.transcribe( + segments, _ = self.audio_recognizer.transcribe( audio_data, beam_size=5, temperature=0.0, log_prob_threshold=-0.8, no_speech_threshold=0.6, - language=transcription_lang[language][country][recognizer], + language=transcription_lang[language][country][self.transcription_engine], word_timestamps=False, without_timestamps=True, task="transcribe", diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index e30fee2d..c6412d35 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -42,7 +42,8 @@ def downloadFile(url, path, func=None): except Exception as e: print("error:downloadFile()", e) -def checkWhisperWeight(path): +def checkWhisperWeight(root, weight_type): + path = os_path.join(root, "weights", "whisper", weight_type) result = False try: WhisperModel( @@ -62,7 +63,7 @@ def checkWhisperWeight(path): def downloadWhisperWeight(root, weight_type, callbackFunc): path = os_path.join(root, "weights", "whisper", weight_type) os_makedirs(path, exist_ok=True) - if checkWhisperWeight(path) is True: + if checkWhisperWeight(root, weight_type) is True: return for filename in _FILENAMES: diff --git a/view.py b/view.py index 6f7a6d7e..84ebd550 100644 --- a/view.py +++ b/view.py @@ -29,6 +29,7 @@ class View(): font_family=config.FONT_FAMILY, ui_language=config.UI_LANGUAGE, is_reset_button_displayed_for_translation=config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION, + is_reset_button_displayed_for_whisper=config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER, ) if config.ENABLE_SPEAKER2CHATBOX is False: @@ -1049,7 +1050,8 @@ class View(): self.restart_required_configs_pre_data.ui_scaling == config.UI_SCALING and self.restart_required_configs_pre_data.font_family == config.FONT_FAMILY and self.restart_required_configs_pre_data.ui_language == config.UI_LANGUAGE and - self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION + self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION and + self.restart_required_configs_pre_data.is_reset_button_displayed_for_whisper == config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER ) if locale is None: From 7aafce6e2e78187086db7602f9de3f48e270847b Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Sun, 4 Feb 2024 01:03:38 +0900 Subject: [PATCH 10/11] =?UTF-8?q?[WIP/TEST]=20distil-wisper=E3=81=AE?= =?UTF-8?q?=E5=87=A6=E7=90=86=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 3 +++ controller.py | 6 ++++++ locales/en.yml | 3 +++ main.py | 2 +- models/transcription/transcription_whisper.py | 3 +++ view.py | 17 ++++++++++------- 6 files changed, 26 insertions(+), 8 deletions(-) diff --git a/config.py b/config.py index 6ce32035..ff1f7263 100644 --- a/config.py +++ b/config.py @@ -812,6 +812,9 @@ class Config: "large-v1": "large-v1", "large-v2": "large-v2", "large-v3": "large-v3", + "distil-small": "distil-small", + "distil-medium": "distil-medium", + "distil-large-v2": "distil-large-v2", } self._MAX_MIC_ENERGY_THRESHOLD = 2000 diff --git a/controller.py b/controller.py index e63101b2..e5b747d4 100644 --- a/controller.py +++ b/controller.py @@ -925,6 +925,12 @@ def createMainWindow(splash): # set Translation Engine updateTranslationEngineAndEngineList() + # set Transcription Engine + if config.USE_WHISPER_FEATURE is True: + config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper" + else: + config.SELECTED_TRANSCRIPTION_ENGINE = "Google" + # set word filter model.addKeywords() diff --git a/locales/en.yml b/locales/en.yml index f68aa32c..c799c9d0 100644 --- a/locales/en.yml +++ b/locales/en.yml @@ -140,6 +140,9 @@ config_window: large_v1: "large_v1 model (%{capacity})" large_v2: "large_v2 model (%{capacity})" large_v3: "large_v3 model (%{capacity})" + distil_small: "distil-small model (%{capacity})" + distil_medium: "distil-medium model (%{capacity})" + distil_large_v2: "distil-large-v2 model (%{capacity})" deepl_auth_key: label: DeepL Auth Key diff --git a/main.py b/main.py index 0df15326..6b6c0e3e 100644 --- a/main.py +++ b/main.py @@ -13,9 +13,9 @@ if __name__ == "__main__": downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress) from models.transcription.transcription_whisper import downloadWhisperWeight - # whisperのダウンロードの説明に変更する必要あり if config.USE_WHISPER_FEATURE is True: downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress) + splash.toProgress(0) import controller diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index c6412d35..148b2edb 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -15,6 +15,9 @@ _MODELS = { "large-v1": "Systran/faster-whisper-large-v1", "large-v2": "Systran/faster-whisper-large-v2", "large-v3": "Systran/faster-whisper-large-v3", + "distil-small": "Systran/faster-distil-whisper-small.en", + "distil-medium": "Systran/faster-distil-whisper-medium.en", + "distil-large-v2": "Systran/faster-distil-whisper-large-v2" } _FILENAMES = [ diff --git a/view.py b/view.py index 84ebd550..1efb3f22 100644 --- a/view.py +++ b/view.py @@ -947,13 +947,16 @@ class View(): @staticmethod def getSelectableWhisperWeightTypeDict(): return { - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="t"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="b"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="s"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="m"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="l_v1"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="l_v2"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="l_v3"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="74.5MB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="141MB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="463MB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="1.42GB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="2.87GB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="2.87GB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="2.87GB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-small"]: i18n.t("config_window.whisper_weight_type.distil_small", capacity="319MB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-medium"]: i18n.t("config_window.whisper_weight_type.distil_medium", capacity="755MB"), + config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-large-v2"]: i18n.t("config_window.whisper_weight_type.distil_large_v2", capacity="1.41GB"), } # Open Webpage Functions From 61a6eb792b2a8c3aad67f3ef94b67f898561636d Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Sun, 4 Feb 2024 02:42:08 +0900 Subject: [PATCH 11/11] =?UTF-8?q?[WIP/TEST]=20distil-wisper=E3=82=92?= =?UTF-8?q?=E5=89=8A=E9=99=A4/faster-wisper=E3=81=AE=E5=87=A6=E7=90=86?= =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 5 +--- locales/en.yml | 3 -- model.py | 10 +++---- .../transcription_transcriber.py | 29 ++++++++++--------- models/transcription/transcription_whisper.py | 3 -- view.py | 3 -- 6 files changed, 21 insertions(+), 32 deletions(-) diff --git a/config.py b/config.py index ff1f7263..55ba2d40 100644 --- a/config.py +++ b/config.py @@ -812,9 +812,6 @@ class Config: "large-v1": "large-v1", "large-v2": "large-v2", "large-v3": "large-v3", - "distil-small": "distil-small", - "distil-medium": "distil-medium", - "distil-large-v2": "distil-large-v2", } self._MAX_MIC_ENERGY_THRESHOLD = 2000 @@ -895,7 +892,7 @@ class Config: } self._USE_TRANSLATION_FEATURE = True self._CTRANSLATE2_WEIGHT_TYPE = "Small" - self._USE_WHISPER_FEATURE = True + self._USE_WHISPER_FEATURE = False self._WHISPER_WEIGHT_TYPE = "base" self._SEND_MESSAGE_FORMAT = "[message]" self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])" diff --git a/locales/en.yml b/locales/en.yml index c799c9d0..f68aa32c 100644 --- a/locales/en.yml +++ b/locales/en.yml @@ -140,9 +140,6 @@ config_window: large_v1: "large_v1 model (%{capacity})" large_v2: "large_v2 model (%{capacity})" large_v3: "large_v3 model (%{capacity})" - distil_small: "distil-small model (%{capacity})" - distil_medium: "distil-medium model (%{capacity})" - distil_large_v2: "distil-large-v2 model (%{capacity})" deepl_auth_key: label: DeepL Auth Key diff --git a/model.py b/model.py index 228bc253..5b17e167 100644 --- a/model.py +++ b/model.py @@ -339,12 +339,11 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, - transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, - whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, ) def sendMicTranscript(): - mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE) message = mic_transcriber.getTranscript() try: fnc(message) @@ -423,12 +422,11 @@ class Model: source=self.speaker_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, - transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, - whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, ) def sendSpeakerTranscript(): - speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE) message = speaker_transcriber.getTranscript() try: fnc(message) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index b24d3163..08cc6a1a 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -5,7 +5,7 @@ from speech_recognition import Recognizer, AudioData, AudioFile from datetime import timedelta from pyaudiowpatch import get_sample_size, paInt16 from .transcription_languages import transcription_lang -from .transcription_whisper import getWhisperModel +from .transcription_whisper import getWhisperModel, checkWhisperWeight import torch import numpy as np @@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None): + def __init__(self, speaker, source, phrase_timeout, max_phrases, root=None, whisper_weight_type=None, ): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -30,34 +30,37 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } - self.transcription_engine = transcription_engine - match self.transcription_engine: - case "Google": - self.audio_recognizer = Recognizer() - case "Whisper": - self.audio_recognizer = getWhisperModel(root, whisper_weight_type) + if whisper_weight_type is not None and root is not None and checkWhisperWeight(root, whisper_weight_type) is True: + self.whisper_model = getWhisperModel(root, whisper_weight_type) + else: + self.whisper_model = None - def transcribeAudioQueue(self, audio_queue, language, country): + def transcribeAudioQueue(self, audio_queue, language, country, transcription_engine): audio, time_spoken = audio_queue.get() self.updateLastSampleAndPhraseStatus(audio, time_spoken) text = '' try: + # Whisperが使用できない場合はGoogle Speech-to-Textを使用する + if transcription_engine == "Whisper": + if self.whisper_model is None: + transcription_engine = "Google" + audio_data = self.audio_sources["process_data_func"]() - match self.transcription_engine: + match transcription_engine: case "Google": - text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine]) + text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][transcription_engine]) case "Whisper": audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 if isinstance(audio_data, torch.Tensor): audio_data = audio_data.detach().numpy() - segments, _ = self.audio_recognizer.transcribe( + segments, _ = self.whisper_model.transcribe( audio_data, beam_size=5, temperature=0.0, log_prob_threshold=-0.8, no_speech_threshold=0.6, - language=transcription_lang[language][country][self.transcription_engine], + language=transcription_lang[language][country][transcription_engine], word_timestamps=False, without_timestamps=True, task="transcribe", diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index 148b2edb..c6412d35 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -15,9 +15,6 @@ _MODELS = { "large-v1": "Systran/faster-whisper-large-v1", "large-v2": "Systran/faster-whisper-large-v2", "large-v3": "Systran/faster-whisper-large-v3", - "distil-small": "Systran/faster-distil-whisper-small.en", - "distil-medium": "Systran/faster-distil-whisper-medium.en", - "distil-large-v2": "Systran/faster-distil-whisper-large-v2" } _FILENAMES = [ diff --git a/view.py b/view.py index 1efb3f22..94a4af8c 100644 --- a/view.py +++ b/view.py @@ -954,9 +954,6 @@ class View(): config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="2.87GB"), config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="2.87GB"), config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="2.87GB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-small"]: i18n.t("config_window.whisper_weight_type.distil_small", capacity="319MB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-medium"]: i18n.t("config_window.whisper_weight_type.distil_medium", capacity="755MB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-large-v2"]: i18n.t("config_window.whisper_weight_type.distil_large_v2", capacity="1.41GB"), } # Open Webpage Functions