From ba12e39bbc4187bad8831fa67d8e04350c3cd874 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Tue, 30 Jan 2024 02:15:05 +0900
Subject: [PATCH 01/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20faster-whisper?=
 =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../transcription_transcriber.py              | 28 +++++++++++++++++++
 requirements.txt                              |  3 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index bf78566e..fbea0e74 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -6,6 +6,10 @@ from datetime import timedelta
 from pyaudiowpatch import get_sample_size, paInt16
 from .transcription_languages import transcription_lang
 
+import torch
+import numpy as np
+from faster_whisper import WhisperModel
+
 PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
@@ -26,6 +30,7 @@ class AudioTranscriber:
                 "new_phrase": True,
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
+        self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1)
 
     def transcribeAudioQueue(self, audio_queue, language, country):
         # while True:
@@ -38,6 +43,29 @@ class AudioTranscriber:
             # os.close(fd)
             audio_data = self.audio_sources["process_data_func"]()
             text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country])
+
+            audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
+            if isinstance(audio_data, torch.Tensor):
+                audio_data = audio_data.detach().numpy()
+            segments, _ = self.whisper_model.transcribe(
+                audio_data,
+                beam_size=5,
+                temperature=0.0,
+                log_prob_threshold=-0.8,
+                no_speech_threshold=0.6,
+                language="ja",
+                word_timestamps=False,
+                without_timestamps=True,
+                task="transcribe",
+                vad_filter=False,
+                )
+            _text = ""
+            for s in segments:
+                if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
+                    continue
+                _text += s.text
+            print(_text)
+
         except Exception:
             pass
         finally:
diff --git a/requirements.txt b/requirements.txt
index b6e14d85..68a6ce15 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@ CTkToolTip == 0.8
 pyinstaller==6.2.0
 transformers[torch]
 sentencepiece==0.1.99
-ctranslate2==3.21.0
\ No newline at end of file
+ctranslate2==3.21.0
+faster-whisper==0.10.0
\ No newline at end of file

From 9cd1831ecbb4f313347c90d6971eb3c7a075812b Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Tue, 30 Jan 2024 18:21:55 +0900
Subject: [PATCH 02/11] =?UTF-8?q?[WIP/TEST]=20faster-whisper=E3=81=8C?=
 =?UTF-8?q?=E6=9C=80=E4=BD=8E=E9=99=90=E5=8B=95=E3=81=8F=E5=BD=A2=E3=81=A7?=
 =?UTF-8?q?=E5=AE=9F=E8=A3=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

config.jsonで設定変更で実行可能
---
 config.py                                     |  71 ++-
 controller.py                                 |   4 +-
 main.py                                       |   2 +-
 model.py                                      |  14 +-
 .../transcription/transcription_languages.py  | 443 ++++++++++++++----
 .../transcription_transcriber.py              |  70 +--
 models/transcription/transcription_utils.py   |  40 +-
 view.py                                       |   4 +-
 8 files changed, 511 insertions(+), 137 deletions(-)

diff --git a/config.py b/config.py
index 371ec121..6acf5e3f 100644
--- a/config.py
+++ b/config.py
@@ -98,6 +98,10 @@ class Config:
     def SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT(self):
         return self._SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT
 
+    @property
+    def SELECTABLE_WHISPER_WEIGHT_TYPE_DICT(self):
+        return self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT
+
     @property
     def MAX_MIC_ENERGY_THRESHOLD(self):
         return self._MAX_MIC_ENERGY_THRESHOLD
@@ -263,6 +267,17 @@ class Config:
             self._SELECTED_TAB_TARGET_LANGUAGES = value
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
+    @property
+    @json_serializable('SELECTED_RECOGNIZER')
+    def SELECTED_RECOGNIZER(self):
+        return self._SELECTED_RECOGNIZER
+
+    @SELECTED_RECOGNIZER.setter
+    def SELECTED_RECOGNIZER(self, value):
+        if isinstance(value, str):
+            self._SELECTED_RECOGNIZER = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
+
     @property
     @json_serializable('IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE')
     def IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE(self):
@@ -569,15 +584,37 @@ class Config:
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
-    @json_serializable('WEIGHT_TYPE')
-    def WEIGHT_TYPE(self):
-        return self._WEIGHT_TYPE
+    @json_serializable('USE_RECOGNIZER_FEATURE')
+    def USE_RECOGNIZER_FEATURE(self):
+        return self._USE_RECOGNIZER_FEATURE
 
-    @WEIGHT_TYPE.setter
-    def WEIGHT_TYPE(self, value):
+    @USE_RECOGNIZER_FEATURE.setter
+    def USE_RECOGNIZER_FEATURE(self, value):
+        if isinstance(value, bool):
+            self._USE_RECOGNIZER_FEATURE = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
+
+    @property
+    @json_serializable('CTRANSLATE2_WEIGHT_TYPE')
+    def CTRANSLATE2_WEIGHT_TYPE(self):
+        return self._CTRANSLATE2_WEIGHT_TYPE
+
+    @CTRANSLATE2_WEIGHT_TYPE.setter
+    def CTRANSLATE2_WEIGHT_TYPE(self, value):
         # if isinstance(value, str) and value in self.SELECTABLE_CTRANSLATE2_WEIGHT_TYPE_DICT:
         if isinstance(value, str):
-            self._WEIGHT_TYPE = value
+            self._CTRANSLATE2_WEIGHT_TYPE = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
+
+    @property
+    @json_serializable('WHISPER_WEIGHT_TYPE')
+    def WHISPER_WEIGHT_TYPE(self):
+        return self._WHISPER_WEIGHT_TYPE
+
+    @WHISPER_WEIGHT_TYPE.setter
+    def WHISPER_WEIGHT_TYPE(self, value):
+        if isinstance(value, str):
+            self._WHISPER_WEIGHT_TYPE = value
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
@@ -756,6 +793,23 @@ class Config:
             "Small": "Small",
             "Large": "Large",
         }
+
+        self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT = {
+            # {Save json str}: {i18n_placeholder} pairs
+            "tiny": "tiny",
+            "tiny.en": "tiny.en",
+            "base": "base",
+            "base.en": "base.en",
+            "small": "small",
+            "small.en": "small.en",
+            "medium": "medium",
+            "medium.en": "medium.en",
+            "large-v1": "large-v1",
+            "large-v2": "large-v2",
+            "large-v3": "large-v3",
+            "large": "large",
+        }
+
         self._MAX_MIC_ENERGY_THRESHOLD = 2000
         self._MAX_SPEAKER_ENERGY_THRESHOLD = 4000
 
@@ -795,6 +849,7 @@ class Config:
             "2":"English\n(United States)",
             "3":"English\n(United States)",
         }
+        self._SELECTED_RECOGNIZER = "Google"
         self._IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE = False
 
         ## Config Window
@@ -831,7 +886,9 @@ class Config:
             "DeepL_API": None,
         }
         self._USE_TRANSLATION_FEATURE = True
-        self._WEIGHT_TYPE = "Small"
+        self._CTRANSLATE2_WEIGHT_TYPE = "Small"
+        self._USE_RECOGNIZER_FEATURE = True
+        self._WHISPER_WEIGHT_TYPE = "base"
         self._SEND_MESSAGE_FORMAT = "[message]"
         self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])"
         self._RECEIVED_MESSAGE_FORMAT = "[message]"
diff --git a/controller.py b/controller.py
index f9e9a5b3..9d44b491 100644
--- a/controller.py
+++ b/controller.py
@@ -505,8 +505,8 @@ def callbackSetUseTranslationFeature(value):
 
 def callbackSetCtranslate2WeightType(value):
     print("callbackSetCtranslate2WeightType", value)
-    config.WEIGHT_TYPE = str(value)
-    view.updateSelectedCtranslate2WeightType(config.WEIGHT_TYPE)
+    config.CTRANSLATE2_WEIGHT_TYPE = str(value)
+    view.updateSelectedCtranslate2WeightType(config.CTRANSLATE2_WEIGHT_TYPE)
     view.setWidgetsStatus_changeWeightType_Pending()
     if model.checkCTranslatorCTranslate2ModelWeight():
         config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False
diff --git a/main.py b/main.py
index 4810cbe5..cf80e289 100644
--- a/main.py
+++ b/main.py
@@ -10,7 +10,7 @@ if __name__ == "__main__":
         from config import config
         from models.translation.utils import downloadCTranslate2Weight
         if config.USE_TRANSLATION_FEATURE is True:
-            downloadCTranslate2Weight(config.PATH_LOCAL, config.WEIGHT_TYPE, splash.updateDownloadProgress)
+            downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress)
 
         splash.toProgress(0)
         import controller
diff --git a/model.py b/model.py
index 573659a7..61ff24d7 100644
--- a/model.py
+++ b/model.py
@@ -65,14 +65,14 @@ class Model:
         self.speaker_energy_plot_progressbar = None
         self.translator = Translator()
         if config.USE_TRANSLATION_FEATURE is True:
-            self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.WEIGHT_TYPE)
+            self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE)
         self.keyword_processor = KeywordProcessor()
 
     def checkCTranslatorCTranslate2ModelWeight(self):
-        return checkCTranslate2Weight(config.PATH_LOCAL, config.WEIGHT_TYPE)
+        return checkCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE)
 
     def changeTranslatorCTranslate2Model(self):
-        self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.WEIGHT_TYPE)
+        self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE)
 
     def resetKeywordProcessor(self):
         del self.keyword_processor
@@ -335,9 +335,12 @@ class Model:
             source=self.mic_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_MIC_MAX_PHRASES,
+            whisper_enabled=config.USE_RECOGNIZER_FEATURE,
+            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
+            whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"),
         )
         def sendMicTranscript():
-            mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
+            mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
             message = mic_transcriber.getTranscript()
             try:
                 fnc(message)
@@ -416,6 +419,9 @@ class Model:
             source=self.speaker_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_SPEAKER_MAX_PHRASES,
+            whisper_enabled=config.USE_RECOGNIZER_FEATURE,
+            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
+            whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"),
         )
         def sendSpeakerTranscript():
             speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
diff --git a/models/transcription/transcription_languages.py b/models/transcription/transcription_languages.py
index 26f2c3f6..63d92568 100644
--- a/models/transcription/transcription_languages.py
+++ b/models/transcription/transcription_languages.py
@@ -1,177 +1,438 @@
 transcription_lang = {
     "Afrikaans":{
-        "South Africa":"af-ZA",
+        "South Africa":{
+            "Google": "af-ZA",
+            "Whisper": "af",
+        },
     },
     "Arabic":{
-        "Algeria":"ar-DZ",
-        "Bahrain":"ar-BH",
-        "Egypt":"ar-EG",
-        "Israel":"ar-IL",
-        "Iraq":"ar-IQ",
-        "Jordan":"ar-JO",
-        "Kuwait":"ar-KW",
-        "Lebanon":"ar-LB",
-        "Morocco":"ar-MA",
-        "Oman":"ar-OM",
-        "State of Palestine":"ar-PS",
-        "Qatar":"ar-QA",
-        "Saudi Arabia":"ar-SA",
-        "Tunisia":"ar-TN",
-        "United Arab Emirates":"ar-AE",
+        "Algeria":{
+            "Google": "ar-DZ",
+            "Whisper": "ar",
+        },
+        "Bahrain":{
+            "Google": "ar-BH",
+            "Whisper": "ar",
+        },
+        "Egypt":{
+            "Google": "ar-EG",
+            "Whisper": "ar",
+        },
+        "Israel":{
+            "Google": "ar-IL",
+            "Whisper": "ar",
+        },
+        "Iraq":{
+            "Google": "ar-IQ",
+            "Whisper": "ar",
+        },
+        "Jordan":{
+            "Google": "ar-JO",
+            "Whisper": "ar",
+        },
+        "Kuwait":{
+            "Google": "ar-KW",
+            "Whisper": "ar",
+        },
+        "Lebanon":{
+            "Google": "ar-LB",
+            "Whisper": "ar",
+        },
+        "Morocco":{
+            "Google": "ar-MA",
+            "Whisper": "ar",
+        },
+        "Oman":{
+            "Google": "ar-OM",
+            "Whisper": "ar",
+        },
+        "State of Palestine":{
+            "Google": "ar-PS",
+            "Whisper": "ar",
+        },
+        "Qatar":{
+            "Google": "ar-QA",
+            "Whisper": "ar",
+        },
+        "Saudi Arabia":{
+            "Google": "ar-SA",
+            "Whisper": "ar",
+        },
+        "Tunisia":{
+            "Google": "ar-TN",
+            "Whisper": "ar",
+        },
+        "United Arab Emirates":{
+            "Google": "ar-AE",
+            "Whisper": "ar",
+        },
     },
     "Basque":{
-        "Spain":"eu-ES",
+        "Spain":{
+            "Google": "eu-ES",
+            "Whisper": "eu",
+        },
     },
     "Bulgarian":{
-        "Bulgaria":"bg-BG",
+        "Bulgaria":{
+            "Google": "bg-BG",
+            "Whisper": "bg",
+        },
     },
     "Catalan":{
-        "Spain":"ca-ES",
+        "Spain":{
+            "Google": "ca-ES",
+            "Whisper": "ca",
+        },
     },
     "Chinese":{
-        "Mandarin (Simplified, China)":"cmn-Hans-CN",
-        "Mandarin (Simplified, Hong Kong)":"cmn-Hans-HK",
-        "Mandarin (Traditional, Taiwan)":"cmn-Hant-TW",
-        "Cantonese (Traditional Hong Kong)":"yue-Hant-HK",
+        "Mandarin (Simplified, China)":{
+            "Google": "cmn-Hans-CN",
+            "Whisper": "zh",
+        },
+        "Mandarin (Simplified, Hong Kong)":{
+            "Google": "cmn-Hans-HK",
+            "Whisper": "zh",
+        },
+        "Mandarin (Traditional, Taiwan)":{
+            "Google": "cmn-Hant-TW",
+            "Whisper": "zh",
+        },
+        "Cantonese (Traditional Hong Kong)":{
+            "Google": "yue-Hant-HK",
+            "Whisper": "yue",
+        },
     },
     "Croatian":{
-        "Croatia":"hr-HR",
+        "Croatia":{
+            "Google": "hr-HR",
+            "Whisper": "hr",
+        },
     },
     "Czech":{
-        "Czech Republic":"cs-CZ",
+        "Czech Republic":{
+            "Google": "cs-CZ",
+            "Whisper": "cs",
+        },
     },
     "Danish":{
-        "Denmark":"da-DK",
+        "Denmark":{
+            "Google": "da-DK",
+            "Whisper": "da",
+        },
     },
     "Dutch":{
-        "Netherlands":"nl-NL",
+        "Netherlands":{
+            "Google": "nl-NL",
+            "Whisper": "nl",
+        },
     },
     "English": {
-        "United States":"en-US",
-        "United Kingdom":"en-GB",
-        "Australia":"en-AU",
-        "Canada":"en-CA",
-        "India":"en-IN",
-        "Ireland":"en-IE",
-        "New Zealand":"en-NZ",
-        "Philippines":"en-PH",
-        "South Africa":"en-ZA",
+        "United States":{
+            "Google": "en-US",
+            "Whisper": "en",
+        },
+        "United Kingdom":{
+            "Google": "en-GB",
+            "Whisper": "en",
+        },
+        "Australia":{
+            "Google": "en-AU",
+            "Whisper": "en",
+        },
+        "Canada":{
+            "Google": "en-CA",
+            "Whisper": "en",
+        },
+        "India":{
+            "Google": "en-IN",
+            "Whisper": "en",
+        },
+        "Ireland":{
+            "Google": "en-IE",
+            "Whisper": "en",
+        },
+        "New Zealand":{
+            "Google": "en-NZ",
+            "Whisper": "en",
+        },
+        "Philippines":{
+            "Google": "en-PH",
+            "Whisper": "en",
+        },
+        "South Africa":{
+            "Google": "en-ZA",
+            "Whisper": "en",
+        },
     },
     "Filipino":{
-        "Philippines":"fil-PH",
+        "Philippines":{
+            "Google": "fil-PH",
+            "Whisper": "tl",
+        },
     },
     "Finnish":{
-        "Finland":"fi-FI",
+        "Finland":{
+            "Google": "fi-FI",
+            "Whisper": "fi",
+        },
     },
     "French":{
-        "France":"fr-FR",
+        "France":{
+            "Google": "fr-FR",
+            "Whisper": "fr",
+        },
     },
     "Galician":{
-        "Spain":"gl-ES",
+        "Spain":{
+            "Google": "gl-ES",
+            "Whisper": "gl",
+        },
     },
     "German":{
-        "Germany":"de-DE",
+        "Germany":{
+            "Google": "de-DE",
+            "Whisper": "de",
+        },
     },
     "Greek":{
-        "Greece":"el-GR",
+        "Greece":{
+            "Google": "el-GR",
+            "Whisper": "el",
+        },
     },
     "Hebrew":{
-        "Israel":"he-IL",
+        "Israel":{
+            "Google": "he-IL",
+            "Whisper": "he",
+        },
     },
     "Hindi": {
-        "India":"hi-IN",
+        "India":{
+            "Google": "hi-IN",
+            "Whisper": "hi",
+        },
     },
     "Hungarian":{
-        "Hungary":"hu-HU",
+        "Hungary":{
+            "Google": "hu-HU",
+            "Whisper": "hu",
+        },
     },
     "Indonesian":{
-        "Indonesia":"id-ID",
+        "Indonesia":{
+            "Google": "id-ID",
+            "Whisper": "id",
+        },
     },
     "Icelandic":{
-        "Iceland":"is-IS",
+        "Iceland":{
+            "Google": "is-IS",
+            "Whisper": "is",
+        },
     },
     "Italian":{
-        "Italy":"it-IT",
-        "Switzerland":"it-CH",
+        "Italy":{
+            "Google": "it-IT",
+            "Whisper": "it",
+        },
+        "Switzerland":{
+            "Google": "it-CH",
+            "Whisper": "it",
+        },
     },
     "Japanese":{
-        "Japan":"ja-JP",
+        "Japan":{
+            "Google": "ja-JP",
+            "Whisper": "ja",
+        },
     },
     "Korean":{
-        "South Korea":"ko-KR",
+        "South Korea":{
+            "Google": "ko-KR",
+            "Whisper": "ko",
+        },
     },
     "Lithuanian":{
-        "Lithuania":"lt-LT",
+        "Lithuania":{
+            "Google": "lt-LT",
+            "Whisper": "lt",
+        },
     },
     "Malay":{
-        "Malaysia":"ms-MY",
+        "Malaysia":{
+            "Google": "ms-MY",
+            "Whisper": "ms",
+        },
     },
     "Norwegian":{
-        "Norway":"nb-NO",
+        "Norway":{
+            "Google": "nb-NO",
+            "Whisper": "no",
+        },
     },
     "Persian":{
-        "Iran":"fa-IR",
+        "Iran":{
+            "Google": "fa-IR",
+            "Whisper": "fa",
+        },
     },
     "Polish":{
-        "Poland":"pl-PL",
+        "Poland":{
+            "Google": "pl-PL",
+            "Whisper": "pl",
+        },
     },
     "Portuguese":{
-        "Brazil":"pt-BR",
-        "Portugal":"pt-PT",
+        "Brazil":{
+            "Google": "pt-BR",
+            "Whisper": "pt",
+        },
+        "Portugal":{
+            "Google": "pt-PT",
+            "Whisper": "pt",
+        },
     },
     "Romanian":{
-        "Romania":"ro-RO",
+        "Romania":{
+            "Google": "ro-RO",
+            "Whisper": "ro",
+        },
     },
     "Russian":{
-        "Russia":"ru-RU",
+        "Russia":{
+            "Google": "ru-RU",
+            "Whisper": "ru",
+        },
     },
     "Serbian":{
-        "Serbia":"sr-RS",
+        "Serbia":{
+            "Google": "sr-RS",
+            "Whisper": "sr",
+        },
     },
     "Slovak":{
-        "Slovakia":"sk-SK",
+        "Slovakia":{
+            "Google": "sk-SK",
+            "Whisper": "sk",
+        },
     },
     "Slovenian":{
-        "Slovenia":"sl-SI",
+        "Slovenia":{
+            "Google": "sl-SI",
+            "Whisper": "sl",
+        },
     },
     "Spanish":{
-        "Argentina":"es-AR",
-        "Bolivia":"es-BO",
-        "Chile":"es-CL",
-        "Colombia":"es-CO",
-        "Costa Rica":"es-CR",
-        "Dominican Republic":"es-DO",
-        "Ecuador":"es-EC",
-        "El Salvador":"es-SV",
-        "Guatemala":"es-GT",
-        "Honduras":"es-HN",
-        "Mexico":"es-MX",
-        "Nicaragua":"es-NI",
-        "Panama":"es-PA",
-        "Paraguay":"es-PY",
-        "Peru":"es-PE",
-        "Puerto Rico":"es-PR",
-        "Spain":"es-ES",
-        "Uruguay":"es-UY",
-        "United States":"es-US",
-        "Venezuela":"es-VE",
+        "Argentina":{
+            "Google": "es-AR",
+            "Whisper": "es",
+        },
+        "Bolivia":{
+            "Google": "es-BO",
+            "Whisper": "es",
+        },
+        "Chile":{
+            "Google": "es-CL",
+            "Whisper": "es",
+        },
+        "Colombia":{
+            "Google": "es-CO",
+            "Whisper": "es",
+        },
+        "Costa Rica":{
+            "Google": "es-CR",
+            "Whisper": "es",
+        },
+        "Dominican Republic":{
+            "Google": "es-DO",
+            "Whisper": "es",
+        },
+        "Ecuador":{
+            "Google": "es-EC",
+            "Whisper": "es",
+        },
+        "El Salvador":{
+            "Google": "es-SV",
+            "Whisper": "es",
+        },
+        "Guatemala":{
+            "Google": "es-GT",
+            "Whisper": "es",
+        },
+        "Honduras":{
+            "Google": "es-HN",
+            "Whisper": "es",
+        },
+        "Mexico":{
+            "Google": "es-MX",
+            "Whisper": "es",
+        },
+        "Nicaragua":{
+            "Google": "es-NI",
+            "Whisper": "es",
+        },
+        "Panama":{
+            "Google": "es-PA",
+            "Whisper": "es",
+        },
+        "Paraguay":{
+            "Google": "es-PY",
+            "Whisper": "es",
+        },
+        "Peru":{
+            "Google": "es-PE",
+            "Whisper": "es",
+        },
+        "Puerto Rico":{
+            "Google": "es-PR",
+            "Whisper": "es",
+        },
+        "Spain":{
+            "Google": "es-ES",
+            "Whisper": "es",
+        },
+        "Uruguay":{
+            "Google": "es-UY",
+            "Whisper": "es",
+        },
+        "United States":{
+            "Google": "es-US",
+            "Whisper": "es",
+        },
+        "Venezuela":{
+            "Google": "es-VE",
+            "Whisper": "es",
+        },
     },
     "Swedish":{
-        "Sweden":"sv-SE",
+        "Sweden":{
+            "Google": "sv-SE",
+            "Whisper": "sv",
+        },
     },
     "Thai":{
-        "Thailand":"th-TH",
+        "Thailand":{
+            "Google": "th-TH",
+            "Whisper": "th",
+        },
     },
     "Turkish":{
-        "Turkey":"tr-TR",
+        "Turkey":{
+            "Google": "tr-TR",
+            "Whisper": "tr",
+        },
     },
     "Ukrainian":{
-        "Ukraine":"uk-UA",
+        "Ukraine":{
+            "Google": "uk-UA",
+            "Whisper": "uk",
+        },
     },
     "Vietnamese":{
-        "Vietnam":"vi-VN",
-    },
-    "Zulu":{
-        "South Africa":"zu-ZA"
+        "Vietnam":{
+            "Google": "vi-VN",
+            "Whisper": "vi",
+        },
     },
 }
\ No newline at end of file
diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index fbea0e74..526c12dc 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
 class AudioTranscriber:
-    def __init__(self, speaker, source, phrase_timeout, max_phrases):
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, whisper_weight_path):
         self.speaker = speaker
         self.phrase_timeout = phrase_timeout
         self.max_phrases = max_phrases
@@ -30,47 +30,59 @@ class AudioTranscriber:
                 "new_phrase": True,
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
-        self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1)
+        if whisper_enabled is True:
+            self.whisper_model = WhisperModel(
+                model_size_or_path=whisper_weight_type,
+                device="cpu",
+                device_index=0,
+                compute_type="int8",
+                cpu_threads=4,
+                num_workers=1,
+                download_root=whisper_weight_path)
+        else:
+            self.whisper_model = None
 
-    def transcribeAudioQueue(self, audio_queue, language, country):
+    def transcribeAudioQueue(self, recognizer, audio_queue, language, country):
         # while True:
         audio, time_spoken = audio_queue.get()
         self.updateLastSampleAndPhraseStatus(audio, time_spoken)
 
         text = ''
         try:
-            # fd, path = tempfile.mkstemp(suffix=".wav")
-            # os.close(fd)
-            audio_data = self.audio_sources["process_data_func"]()
-            text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country])
+            # Whisperが使用できない場合はGoogle Speech-to-Textを使用する
+            if recognizer == "Whisper":
+                if self.whisper_model is None:
+                    recognizer = "Google"
 
-            audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
-            if isinstance(audio_data, torch.Tensor):
-                audio_data = audio_data.detach().numpy()
-            segments, _ = self.whisper_model.transcribe(
-                audio_data,
-                beam_size=5,
-                temperature=0.0,
-                log_prob_threshold=-0.8,
-                no_speech_threshold=0.6,
-                language="ja",
-                word_timestamps=False,
-                without_timestamps=True,
-                task="transcribe",
-                vad_filter=False,
-                )
-            _text = ""
-            for s in segments:
-                if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
-                    continue
-                _text += s.text
-            print(_text)
+            audio_data = self.audio_sources["process_data_func"]()
+            match recognizer:
+                case "Google":
+                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer])
+                case "Whisper":
+                    audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
+                    if isinstance(audio_data, torch.Tensor):
+                        audio_data = audio_data.detach().numpy()
+                    segments, _ = self.whisper_model.transcribe(
+                        audio_data,
+                        beam_size=5,
+                        temperature=0.0,
+                        log_prob_threshold=-0.8,
+                        no_speech_threshold=0.6,
+                        language=transcription_lang[language][country][recognizer],
+                        word_timestamps=False,
+                        without_timestamps=True,
+                        task="transcribe",
+                        vad_filter=False,
+                        )
+                    for s in segments:
+                        if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
+                            continue
+                        text += s.text
 
         except Exception:
             pass
         finally:
             pass
-            # os.unlink(path)
 
         if text != '':
             self.updateTranscript(text)
diff --git a/models/transcription/transcription_utils.py b/models/transcription/transcription_utils.py
index f40defeb..8de17e7e 100644
--- a/models/transcription/transcription_utils.py
+++ b/models/transcription/transcription_utils.py
@@ -1,4 +1,8 @@
 from pyaudiowpatch import PyAudio, paWASAPI
+from faster_whisper.utils import download_model
+import logging
+logger = logging.getLogger('faster_whisper')
+logger.setLevel(logging.CRITICAL)
 
 def getInputDevices():
     devices = {}
@@ -44,4 +48,38 @@ def getDefaultOutputDevice():
                             if default_speakers["name"] in loopback["name"]:
                                 default_device = loopback
                                 return default_device
-    return {"name":"NoDevice"}
\ No newline at end of file
+    return {"name":"NoDevice"}
+
+def downloadWhisperWeight(weight_type, path):
+    result = False
+    try:
+        download_model(
+            weight_type,
+            cache_dir=path)
+        result = True
+    except Exception:
+        pass
+    return result
+
+def checkWhisperWeight(weight_type, path):
+    result = False
+    try:
+        result = download_model(
+            weight_type,
+            local_files_only=True,
+            cache_dir=path)
+        result = True
+    except Exception:
+        pass
+    return result
+
+if __name__ == "__main__":
+
+
+    downloadWhisperWeight("base", "./weight/whisper/")
+
+    from faster_whisper import WhisperModel
+    whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1, download_root="./weight/whisper/")
+
+    print(checkWhisperWeight("base", "./weight/whisper/"))
+    print(checkWhisperWeight("tiny", "./weight/whisper/"))
\ No newline at end of file
diff --git a/view.py b/view.py
index 34711688..cf90dcfa 100644
--- a/view.py
+++ b/view.py
@@ -280,7 +280,7 @@ class View():
             VAR_DESC_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.ctranslate2_weight_type.desc")),
             DICT_CTRANSLATE2_WEIGHT_TYPE=self.getSelectableCtranslate2WeightTypeDict(),
             CALLBACK_SET_CTRANSLATE2_WEIGHT_TYPE=None,
-            VAR_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=self.getSelectableCtranslate2WeightTypeDict()[config.WEIGHT_TYPE]),
+            VAR_CTRANSLATE2_WEIGHT_TYPE=StringVar(value=self.getSelectableCtranslate2WeightTypeDict()[config.CTRANSLATE2_WEIGHT_TYPE]),
 
             VAR_LABEL_DEEPL_AUTH_KEY=StringVar(value=i18n.t( "config_window.deepl_auth_key.label")),
             VAR_DESC_DEEPL_AUTH_KEY=StringVar(
@@ -1069,7 +1069,7 @@ class View():
         self.view_variable.VAR_CTRANSLATE2_WEIGHT_TYPE.set(self.getSelectableCtranslate2WeightTypeDict()[selected_weight_type])
 
     def setLatestCTranslate2WeightType(self):
-        selected_weight_type = self.getSelectableCtranslate2WeightTypeDict()[config.WEIGHT_TYPE]
+        selected_weight_type = self.getSelectableCtranslate2WeightTypeDict()[config.CTRANSLATE2_WEIGHT_TYPE]
         self.view_variable.VAR_CTRANSLATE2_WEIGHT_TYPE.set(selected_weight_type)
 
 

From 10b8d115a118f3cfeaf400af76186115c084950b Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Wed, 31 Jan 2024 22:50:31 +0900
Subject: [PATCH 03/11] =?UTF-8?q?[WIP/TEST]=20faster-whisper=20model=20wei?=
 =?UTF-8?q?ght=20=E3=81=AE=E3=83=80=E3=82=A6=E3=83=B3=E3=83=AD=E3=83=BC?=
 =?UTF-8?q?=E3=83=89/=E3=83=99=E3=83=AA=E3=83=95=E3=82=A1=E3=82=A4?=
 =?UTF-8?q?=E5=87=A6=E7=90=86=E3=82=92=E5=AE=9F=E8=A3=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 main.py                                       |  8 +-
 model.py                                      |  4 +-
 .../transcription_transcriber.py              | 13 +--
 models/transcription/transcription_utils.py   | 40 +-------
 models/transcription/transcription_whisper.py | 98 +++++++++++++++++++
 5 files changed, 111 insertions(+), 52 deletions(-)
 create mode 100644 models/transcription/transcription_whisper.py

diff --git a/main.py b/main.py
index cf80e289..4aaa7232 100644
--- a/main.py
+++ b/main.py
@@ -11,8 +11,14 @@ if __name__ == "__main__":
         from models.translation.utils import downloadCTranslate2Weight
         if config.USE_TRANSLATION_FEATURE is True:
             downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress)
-
         splash.toProgress(0)
+
+        # whisperのダウンロードの説明に変更する必要あり
+        if config.USE_RECOGNIZER_FEATURE is True:
+            from models.transcription.transcription_whisper import downloadWhisperWeight
+            downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress)
+        splash.toProgress(0)
+
         import controller
         controller.createMainWindow(splash)
         splash.destroySplash()
diff --git a/model.py b/model.py
index 61ff24d7..6b73bece 100644
--- a/model.py
+++ b/model.py
@@ -337,7 +337,7 @@ class Model:
             max_phrases=config.INPUT_MIC_MAX_PHRASES,
             whisper_enabled=config.USE_RECOGNIZER_FEATURE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
-            whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"),
+            root=config.PATH_LOCAL,
         )
         def sendMicTranscript():
             mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
@@ -421,7 +421,7 @@ class Model:
             max_phrases=config.INPUT_SPEAKER_MAX_PHRASES,
             whisper_enabled=config.USE_RECOGNIZER_FEATURE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
-            whisper_weight_path=os_path.join(config.PATH_LOCAL, "weight", "whisper"),
+            root=config.PATH_LOCAL,
         )
         def sendSpeakerTranscript():
             speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index 526c12dc..0f5b1790 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -5,16 +5,16 @@ from speech_recognition import Recognizer, AudioData, AudioFile
 from datetime import timedelta
 from pyaudiowpatch import get_sample_size, paInt16
 from .transcription_languages import transcription_lang
+from .transcription_whisper import getWhisperModel
 
 import torch
 import numpy as np
-from faster_whisper import WhisperModel
 
 PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
 class AudioTranscriber:
-    def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, whisper_weight_path):
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, root):
         self.speaker = speaker
         self.phrase_timeout = phrase_timeout
         self.max_phrases = max_phrases
@@ -31,14 +31,7 @@ class AudioTranscriber:
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
         if whisper_enabled is True:
-            self.whisper_model = WhisperModel(
-                model_size_or_path=whisper_weight_type,
-                device="cpu",
-                device_index=0,
-                compute_type="int8",
-                cpu_threads=4,
-                num_workers=1,
-                download_root=whisper_weight_path)
+            self.whisper_model = getWhisperModel(root, whisper_weight_type)
         else:
             self.whisper_model = None
 
diff --git a/models/transcription/transcription_utils.py b/models/transcription/transcription_utils.py
index 8de17e7e..f40defeb 100644
--- a/models/transcription/transcription_utils.py
+++ b/models/transcription/transcription_utils.py
@@ -1,8 +1,4 @@
 from pyaudiowpatch import PyAudio, paWASAPI
-from faster_whisper.utils import download_model
-import logging
-logger = logging.getLogger('faster_whisper')
-logger.setLevel(logging.CRITICAL)
 
 def getInputDevices():
     devices = {}
@@ -48,38 +44,4 @@ def getDefaultOutputDevice():
                             if default_speakers["name"] in loopback["name"]:
                                 default_device = loopback
                                 return default_device
-    return {"name":"NoDevice"}
-
-def downloadWhisperWeight(weight_type, path):
-    result = False
-    try:
-        download_model(
-            weight_type,
-            cache_dir=path)
-        result = True
-    except Exception:
-        pass
-    return result
-
-def checkWhisperWeight(weight_type, path):
-    result = False
-    try:
-        result = download_model(
-            weight_type,
-            local_files_only=True,
-            cache_dir=path)
-        result = True
-    except Exception:
-        pass
-    return result
-
-if __name__ == "__main__":
-
-
-    downloadWhisperWeight("base", "./weight/whisper/")
-
-    from faster_whisper import WhisperModel
-    whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1, download_root="./weight/whisper/")
-
-    print(checkWhisperWeight("base", "./weight/whisper/"))
-    print(checkWhisperWeight("tiny", "./weight/whisper/"))
\ No newline at end of file
+    return {"name":"NoDevice"}
\ No newline at end of file
diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
new file mode 100644
index 00000000..dc606cb7
--- /dev/null
+++ b/models/transcription/transcription_whisper.py
@@ -0,0 +1,98 @@
+from os import path as os_path, makedirs as os_makedirs
+from requests import get as requests_get
+from typing import Callable
+import huggingface_hub
+from faster_whisper import WhisperModel
+import logging
+logger = logging.getLogger('faster_whisper')
+logger.setLevel(logging.CRITICAL)
+
+_MODELS = {
+    "tiny.en": "Systran/faster-whisper-tiny.en",
+    "tiny": "Systran/faster-whisper-tiny",
+    "base.en": "Systran/faster-whisper-base.en",
+    "base": "Systran/faster-whisper-base",
+    "small.en": "Systran/faster-whisper-small.en",
+    "small": "Systran/faster-whisper-small",
+    "medium.en": "Systran/faster-whisper-medium.en",
+    "medium": "Systran/faster-whisper-medium",
+    "large-v1": "Systran/faster-whisper-large-v1",
+    "large-v2": "Systran/faster-whisper-large-v2",
+    "large-v3": "Systran/faster-whisper-large-v3",
+    "large": "Systran/faster-whisper-large-v3",
+}
+
+_FILENAMES = [
+    "config.json",
+    "preprocessor_config.json",
+    "model.bin",
+    "tokenizer.json",
+    "vocabulary.txt",
+]
+
+def downloadFile(url, path, func=None):
+    """Stream url into the file at path; func, if callable, is called with the downloaded fraction after each chunk. Errors are printed, not raised."""
+    try:
+        res = requests_get(url, stream=True)
+        res.raise_for_status()
+        file_size = int(res.headers.get('content-length', 0))
+        total_chunk = 0
+        with open(os_path.join(path), 'wb') as file:
+            for chunk in res.iter_content(chunk_size=1024*5):
+                file.write(chunk)
+                total_chunk += len(chunk)
+                if isinstance(func, Callable) and file_size > 0:  # no content-length header: skip progress instead of dividing by zero
+                    func(total_chunk/file_size)
+    except Exception as e:
+            print("error:downloadFile()", e)
+
+def checkWhisperWeight(path):  # returns True if a WhisperModel can be constructed from path, else False
+    result = False
+    try:
+        WhisperModel(  # constructed only as a load check; the instance is not kept
+            path,
+            device="cpu",
+            device_index=0,
+            compute_type="int8",
+            cpu_threads=4,
+            num_workers=1,
+            local_files_only=True,  # NOTE(review): presumably keeps this check offline - confirm against faster-whisper
+        )
+        result = True
+    except Exception:  # any failure while constructing the model is reported as False
+        pass
+    return result
+
+def downloadWhisperWeight(root, weight_type, callbackFunc):
+    path = os_path.join(root, "weight", "whisper", weight_type)
+    os_makedirs(path, exist_ok=True)
+    if checkWhisperWeight(path) is True:
+        return
+
+    for filename in _FILENAMES:
+        print("Downloading", filename, "...")
+        file_path = os_path.join(path, filename)
+        url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename)
+        downloadFile(url, file_path, func=callbackFunc)
+
+def getWhisperModel(root, weight_type):
+    path = os_path.join(root, "weight", "whisper", weight_type)
+    return WhisperModel(
+        path,
+        device="cpu",
+        device_index=0,
+        compute_type="int8",
+        cpu_threads=4,
+        num_workers=1,
+        local_files_only=True,
+    )
+
+if __name__ == "__main__":
+    def callback(value):
+        print(value)
+
+    downloadWhisperWeight("./", "tiny", callback)
+    downloadWhisperWeight("./", "base", callback)
+    downloadWhisperWeight("./", "small", callback)
+    downloadWhisperWeight("./", "medium", callback)
+    downloadWhisperWeight("./", "large", callback)
\ No newline at end of file

From e4c685d3822bf8efd7636e3b302e3f1e729eb7e5 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Thu, 1 Feb 2024 13:40:24 +0900
Subject: [PATCH 04/11] [WIP/TEST] Config : USE_RECOGNIZER_FEATURE ->
 USE_WHISPER_FEATURE

---
 config.py | 19 +++++++------------
 main.py   |  2 +-
 model.py  |  4 ++--
 3 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/config.py b/config.py
index 6acf5e3f..c59c0f17 100644
--- a/config.py
+++ b/config.py
@@ -584,14 +584,14 @@ class Config:
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
-    @json_serializable('USE_RECOGNIZER_FEATURE')
-    def USE_RECOGNIZER_FEATURE(self):
-        return self._USE_RECOGNIZER_FEATURE
+    @json_serializable('USE_WHISPER_FEATURE')
+    def USE_WHISPER_FEATURE(self):
+        return self._USE_WHISPER_FEATURE
 
-    @USE_RECOGNIZER_FEATURE.setter
-    def USE_RECOGNIZER_FEATURE(self, value):
+    @USE_WHISPER_FEATURE.setter
+    def USE_WHISPER_FEATURE(self, value):
         if isinstance(value, bool):
-            self._USE_RECOGNIZER_FEATURE = value
+            self._USE_WHISPER_FEATURE = value
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
@@ -797,17 +797,12 @@ class Config:
         self._SELECTABLE_WHISPER_WEIGHT_TYPE_DICT = {
             # {Save json str}: {i18n_placeholder} pairs
             "tiny": "tiny",
-            "tiny.en": "tiny.en",
             "base": "base",
-            "base.en": "base.en",
             "small": "small",
-            "small.en": "small.en",
             "medium": "medium",
-            "medium.en": "medium.en",
             "large-v1": "large-v1",
             "large-v2": "large-v2",
             "large-v3": "large-v3",
-            "large": "large",
         }
 
         self._MAX_MIC_ENERGY_THRESHOLD = 2000
@@ -887,7 +882,7 @@ class Config:
         }
         self._USE_TRANSLATION_FEATURE = True
         self._CTRANSLATE2_WEIGHT_TYPE = "Small"
-        self._USE_RECOGNIZER_FEATURE = True
+        self._USE_WHISPER_FEATURE = True
         self._WHISPER_WEIGHT_TYPE = "base"
         self._SEND_MESSAGE_FORMAT = "[message]"
         self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])"
diff --git a/main.py b/main.py
index 4aaa7232..37bc53af 100644
--- a/main.py
+++ b/main.py
@@ -14,7 +14,7 @@ if __name__ == "__main__":
         splash.toProgress(0)
 
         # whisperのダウンロードの説明に変更する必要あり
-        if config.USE_RECOGNIZER_FEATURE is True:
+        if config.USE_WHISPER_FEATURE is True:
             from models.transcription.transcription_whisper import downloadWhisperWeight
             downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress)
         splash.toProgress(0)
diff --git a/model.py b/model.py
index 6b73bece..98d0a896 100644
--- a/model.py
+++ b/model.py
@@ -335,7 +335,7 @@ class Model:
             source=self.mic_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_MIC_MAX_PHRASES,
-            whisper_enabled=config.USE_RECOGNIZER_FEATURE,
+            whisper_enabled=config.USE_WHISPER_FEATURE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
         )
@@ -419,7 +419,7 @@ class Model:
             source=self.speaker_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_SPEAKER_MAX_PHRASES,
-            whisper_enabled=config.USE_RECOGNIZER_FEATURE,
+            whisper_enabled=config.USE_WHISPER_FEATURE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
         )

From 7cb8c473d4adb8dc1377fa13eb8f14a29bee2afe Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Thu, 1 Feb 2024 13:41:31 +0900
Subject: [PATCH 05/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20large=E3=83=A2?=
 =?UTF-8?q?=E3=83=87=E3=83=AB=E3=82=92=E3=83=80=E3=82=A6=E3=83=B3=E3=83=AD?=
 =?UTF-8?q?=E3=83=BC=E3=83=89=E5=87=A6=E7=90=86=E3=82=92=E4=BF=AE=E6=AD=A3?=
 =?UTF-8?q?/=20en=E3=82=92=E5=89=8A=E9=99=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 models/transcription/transcription_whisper.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
index dc606cb7..67ad61f0 100644
--- a/models/transcription/transcription_whisper.py
+++ b/models/transcription/transcription_whisper.py
@@ -8,18 +8,13 @@ logger = logging.getLogger('faster_whisper')
 logger.setLevel(logging.CRITICAL)
 
 _MODELS = {
-    "tiny.en": "Systran/faster-whisper-tiny.en",
     "tiny": "Systran/faster-whisper-tiny",
-    "base.en": "Systran/faster-whisper-base.en",
     "base": "Systran/faster-whisper-base",
-    "small.en": "Systran/faster-whisper-small.en",
     "small": "Systran/faster-whisper-small",
-    "medium.en": "Systran/faster-whisper-medium.en",
     "medium": "Systran/faster-whisper-medium",
     "large-v1": "Systran/faster-whisper-large-v1",
     "large-v2": "Systran/faster-whisper-large-v2",
     "large-v3": "Systran/faster-whisper-large-v3",
-    "large": "Systran/faster-whisper-large-v3",
 }
 
 _FILENAMES = [
@@ -28,6 +23,7 @@ _FILENAMES = [
     "model.bin",
     "tokenizer.json",
     "vocabulary.txt",
+    "vocabulary.json",
 ]
 
 def downloadFile(url, path, func=None):
@@ -67,6 +63,7 @@ def downloadWhisperWeight(root, weight_type, callbackFunc):
     path = os_path.join(root, "weight", "whisper", weight_type)
     os_makedirs(path, exist_ok=True)
     if checkWhisperWeight(path) is True:
+        print("weight_type:", weight_type, checkWhisperWeight(path))
         return
 
     for filename in _FILENAMES:
@@ -75,6 +72,8 @@ def downloadWhisperWeight(root, weight_type, callbackFunc):
         url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename)
         downloadFile(url, file_path, func=callbackFunc)
 
+    print("weight_type:", weight_type, checkWhisperWeight(path))
+
 def getWhisperModel(root, weight_type):
     path = os_path.join(root, "weight", "whisper", weight_type)
     return WhisperModel(
@@ -90,9 +89,12 @@ def getWhisperModel(root, weight_type):
 if __name__ == "__main__":
     def callback(value):
         print(value)
+        pass
 
     downloadWhisperWeight("./", "tiny", callback)
     downloadWhisperWeight("./", "base", callback)
     downloadWhisperWeight("./", "small", callback)
     downloadWhisperWeight("./", "medium", callback)
-    downloadWhisperWeight("./", "large", callback)
\ No newline at end of file
+    downloadWhisperWeight("./", "large-v1", callback)
+    downloadWhisperWeight("./", "large-v2", callback)
+    downloadWhisperWeight("./", "large-v3", callback)
\ No newline at end of file

From 1de239549f7dc3c00b55fefb1dc46da35aec2b24 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Thu, 1 Feb 2024 15:49:17 +0900
Subject: [PATCH 06/11] =?UTF-8?q?[WIP/TEST]=20Model=20:=20=E3=83=A2?=
 =?UTF-8?q?=E3=83=87=E3=83=AB=E3=81=AE=E4=BF=9D=E5=AD=98=E4=BD=8D=E7=BD=AE?=
 =?UTF-8?q?=E3=81=AE=E5=A4=89=E6=9B=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- speakerの文字起こし処理のバグを修正
---
 .gitignore                                    |  2 +-
 main.py                                       |  5 ++--
 model.py                                      |  4 ++--
 models/transcription/transcription_whisper.py |  7 ++----
 models/translation/translation_translator.py  |  8 +++----
 .../{utils.py => translation_utils.py}        | 24 +++++++++----------
 6 files changed, 23 insertions(+), 27 deletions(-)
 rename models/translation/{utils.py => translation_utils.py} (78%)

diff --git a/.gitignore b/.gitignore
index 75c28a41..52825c27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@ VRCT.spec
 *.pyc
 logs/
 .venv/
-weight/
+weights/
 .vscode
 error.log
 *.exe
diff --git a/main.py b/main.py
index 37bc53af..0df15326 100644
--- a/main.py
+++ b/main.py
@@ -8,14 +8,13 @@ if __name__ == "__main__":
         splash.showSplash()
 
         from config import config
-        from models.translation.utils import downloadCTranslate2Weight
+        from models.translation.translation_utils import downloadCTranslate2Weight
         if config.USE_TRANSLATION_FEATURE is True:
             downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress)
-        splash.toProgress(0)
 
+        from models.transcription.transcription_whisper import downloadWhisperWeight
         # whisperのダウンロードの説明に変更する必要あり
         if config.USE_WHISPER_FEATURE is True:
-            from models.transcription.transcription_whisper import downloadWhisperWeight
             downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress)
         splash.toProgress(0)
 
diff --git a/model.py b/model.py
index 98d0a896..2c29d4c7 100644
--- a/model.py
+++ b/model.py
@@ -23,7 +23,7 @@ from models.transcription.transcription_transcriber import AudioTranscriber
 from models.xsoverlay.notification import xsoverlayForVRCT
 from models.translation.translation_languages import translation_lang
 from models.transcription.transcription_languages import transcription_lang
-from models.translation.utils import checkCTranslate2Weight
+from models.translation.translation_utils import checkCTranslate2Weight
 from config import config
 
 class threadFnc(Thread):
@@ -424,7 +424,7 @@ class Model:
             root=config.PATH_LOCAL,
         )
         def sendSpeakerTranscript():
-            speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
+            speaker_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
             message = speaker_transcriber.getTranscript()
             try:
                 fnc(message)
diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
index 67ad61f0..e30fee2d 100644
--- a/models/transcription/transcription_whisper.py
+++ b/models/transcription/transcription_whisper.py
@@ -60,10 +60,9 @@ def checkWhisperWeight(path):
     return result
 
 def downloadWhisperWeight(root, weight_type, callbackFunc):
-    path = os_path.join(root, "weight", "whisper", weight_type)
+    path = os_path.join(root, "weights", "whisper", weight_type)
     os_makedirs(path, exist_ok=True)
     if checkWhisperWeight(path) is True:
-        print("weight_type:", weight_type, checkWhisperWeight(path))
         return
 
     for filename in _FILENAMES:
@@ -72,10 +71,8 @@ def downloadWhisperWeight(root, weight_type, callbackFunc):
         url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename)
         downloadFile(url, file_path, func=callbackFunc)
 
-    print("weight_type:", weight_type, checkWhisperWeight(path))
-
 def getWhisperModel(root, weight_type):
-    path = os_path.join(root, "weight", "whisper", weight_type)
+    path = os_path.join(root, "weights", "whisper", weight_type)
     return WhisperModel(
         path,
         device="cpu",
diff --git a/models/translation/translation_translator.py b/models/translation/translation_translator.py
index ea02e490..c966c672 100644
--- a/models/translation/translation_translator.py
+++ b/models/translation/translation_translator.py
@@ -2,7 +2,7 @@ import os
 from deepl import Translator as deepl_Translator
 from translators import translate_text as other_web_Translator
 from .translation_languages import translation_lang
-from .utils import ctranslate2_weights
+from .translation_utils import ctranslate2_weights
 
 import ctranslate2
 import transformers
@@ -27,8 +27,8 @@ class Translator():
     def changeCTranslate2Model(self, path, model_type):
         directory_name = ctranslate2_weights[model_type]["directory_name"]
         tokenizer = ctranslate2_weights[model_type]["tokenizer"]
-        weight_path = os.path.join(path, "weight", directory_name)
-        tokenizer_path = os.path.join(path, "weight", directory_name, "tokenizer")
+        weight_path = os.path.join(path, "weights", "ctranslate2", directory_name)
+        tokenizer_path = os.path.join(path, "weights", "ctranslate2", directory_name, "tokenizer")
         self.ctranslate2_translator = ctranslate2.Translator(
             weight_path,
             device="cpu",
@@ -41,7 +41,7 @@ class Translator():
             self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path)
         except Exception as e:
             print("Error: changeCTranslate2Model()", e)
-            tokenizer_path = os.path.join("./weight", directory_name, "tokenizer")
+            tokenizer_path = os.path.join("./weights", "ctranslate2", directory_name, "tokenizer")
             self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path)
 
     @staticmethod
diff --git a/models/translation/utils.py b/models/translation/translation_utils.py
similarity index 78%
rename from models/translation/utils.py
rename to models/translation/translation_utils.py
index d47401cf..73805cdc 100644
--- a/models/translation/utils.py
+++ b/models/translation/translation_utils.py
@@ -39,36 +39,36 @@ def calculate_file_hash(file_path, block_size=65536):
     return hash_object.hexdigest()
 
 def checkCTranslate2Weight(path, weight_type="Small"):
-    directory_name = 'weight'
-    current_directory = path
     weight_directory_name = ctranslate2_weights[weight_type]["directory_name"]
     hash_data = ctranslate2_weights[weight_type]["hash"]
-    files = ["model.bin", "sentencepiece.model", "shared_vocabulary.txt"]
+    files = [
+        "model.bin",
+        "sentencepiece.model",
+        "shared_vocabulary.txt"
+    ]
 
     # check already downloaded
     already_downloaded = False
-    if all(os_path.exists(os_path.join(current_directory, directory_name, weight_directory_name, file)) for file in files):
+    if all(os_path.exists(os_path.join(path, weight_directory_name, file)) for file in files):
         # check hash
         for file in files:
             original_hash = hash_data[file]
-            current_hash = calculate_file_hash(os_path.join(current_directory, directory_name, weight_directory_name, file))
+            current_hash = calculate_file_hash(os_path.join(path, weight_directory_name, file))
             if original_hash != current_hash:
                 break
         already_downloaded = True
     return already_downloaded
 
-def downloadCTranslate2Weight(path, weight_type="Small", func=None):
+def downloadCTranslate2Weight(root, weight_type="Small", func=None):
     url = ctranslate2_weights[weight_type]["url"]
-    filename = 'weight.zip'
-    directory_name = 'weight'
-    current_directory = path
+    filename = "weight.zip"
+    path = os_path.join(root, "weights", "ctranslate2")
+    os_makedirs(path, exist_ok=True)
 
     if checkCTranslate2Weight(path, weight_type):
         return
 
     try:
-        os_makedirs(os_path.join(current_directory, directory_name), exist_ok=True)
-        print(os_path.join(current_directory, directory_name))
         with tempfile.TemporaryDirectory() as tmp_path:
             res = requests_get(url, stream=True)
             file_size = int(res.headers.get('content-length', 0))
@@ -81,6 +81,6 @@ def downloadCTranslate2Weight(path, weight_type="Small", func=None):
                         func(total_chunk/file_size)
 
             with ZipFile(os_path.join(tmp_path, filename)) as zf:
-                zf.extractall(os_path.join(current_directory, directory_name))
+                zf.extractall(path)
     except Exception as e:
             print("error:downloadCTranslate2Weight()", e)
\ No newline at end of file

From 78b8cb590984a36b722801db0d3f1a63953cad93 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Fri, 2 Feb 2024 13:14:56 +0900
Subject: [PATCH 07/11] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20install.bat=20:=20?=
 =?UTF-8?q?package=20version=20fix?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 install.bat      | 4 +---
 requirements.txt | 8 +++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/install.bat b/install.bat
index 036f6a51..8d2a5d51 100644
--- a/install.bat
+++ b/install.bat
@@ -1,4 +1,2 @@
 python.exe -m pip install --upgrade pip
-pip install -r requirements.txt
-pip install git+https://github.com/misyaguziya/translators
-pip install git+https://github.com/misyaguziya/custom_speech_recognition
\ No newline at end of file
+pip install -r requirements.txt
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 68a6ce15..cedd1568 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,9 @@ pyyaml == 6.0.1
 python-i18n == 0.3.9
 CTkToolTip == 0.8
 pyinstaller==6.2.0
-transformers[torch]
+transformers[torch]==4.37.2
 sentencepiece==0.1.99
-ctranslate2==3.21.0
-faster-whisper==0.10.0
\ No newline at end of file
+ctranslate2==3.24.0
+faster-whisper==0.10.0
+translators @ git+https://github.com/misyaguziya/translators@master
+SpeechRecognition @ git+https://github.com/misyaguziya/custom_speech_recognition@master
\ No newline at end of file

From ee5c4c05ce0c5c9aa9a5aa6279ba3605cd0eede1 Mon Sep 17 00:00:00 2001
From: Sakamoto Shiina <68018796+ShiinaSakamoto@users.noreply.github.com>
Date: Fri, 2 Feb 2024 18:08:18 +0900
Subject: [PATCH 08/11] =?UTF-8?q?[WIP/TEST]=20UI:=20=E6=A9=9F=E8=83=BD?=
 =?UTF-8?q?=E3=81=A8=E8=A6=8B=E3=81=9F=E7=9B=AE=E3=82=92=E7=B9=8B=E3=81=8E?=
 =?UTF-8?q?=E3=81=BE=E3=81=97=E3=81=9F=E3=80=82=E8=A8=AD=E5=AE=9A=E7=94=BB?=
 =?UTF-8?q?=E9=9D=A2=E3=81=8B=E3=82=89=E3=81=84=E3=81=98=E3=82=8C=E3=81=BE?=
 =?UTF-8?q?=E3=81=99=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 controller.py                                 | 33 ++++++++++++
 locales/en.yml                                | 16 ++++++
 view.py                                       | 52 +++++++++++++++++++
 .../createSideMenuAndSettingsBoxContainers.py |  6 ++-
 .../setting_box_transcription/__init__.py     |  3 +-
 .../createSettingBox_InternalModel.py         | 37 +++++++++++++
 6 files changed, 145 insertions(+), 2 deletions(-)
 create mode 100644 vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py

diff --git a/controller.py b/controller.py
index 9d44b491..724d2cf3 100644
--- a/controller.py
+++ b/controller.py
@@ -767,6 +767,35 @@ def callbackSetSpeakerMaxPhrases(value):
     except Exception:
         view.showErrorMessage_SpeakerMaxPhrases()
 
+# Transcription (Internal AI Model)
+def callbackSetUserWhisperFeature(value):
+    print("callbackSetUserWhisperFeature", value)
+    config.USE_WHISPER_FEATURE = value
+    if config.USE_WHISPER_FEATURE is True:
+        view.openWhisperWeightTypeWidget()
+    else:
+        view.closeWhisperWeightTypeWidget()
+
+def callbackSetWhisperWeightType(value):
+    print("callbackSetWhisperWeightType", value)
+    config.WHISPER_WEIGHT_TYPE = str(value)
+    view.updateSelectedWhisperWeightType(config.WHISPER_WEIGHT_TYPE)
+    # view.setWidgetsStatus_changeWeightType_Pending()
+    # if model.checkCTranslatorCTranslate2ModelWeight():
+    #     config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False
+    #     def callback():
+    #         model.changeTranslatorCTranslate2Model()
+    #         view.useTranslationFeatureProcess("Normal")
+    #         view.setWidgetsStatus_changeWeightType_Done()
+    #     th_callback = Thread(target=callback)
+    #     th_callback.daemon = True
+    #     th_callback.start()
+    # else:
+    #     config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = True
+    #     view.useTranslationFeatureProcess("Restart")
+    #     view.setWidgetsStatus_changeWeightType_Done()
+    # view.showRestartButtonIfRequired()
+
 
 # Others Tab
 def callbackSetEnableAutoClearMessageBox(value):
@@ -993,6 +1022,10 @@ def createMainWindow(splash):
             "callback_set_speaker_phrase_timeout": callbackSetSpeakerPhraseTimeout,
             "callback_set_speaker_max_phrases": callbackSetSpeakerMaxPhrases,
 
+            # Transcription Tab (Internal AI Model)
+            "callback_set_use_whisper_feature": callbackSetUserWhisperFeature,
+            "callback_set_whisper_weight_type": callbackSetWhisperWeightType,
+
             # Others Tab
             "callback_set_enable_auto_clear_chatbox": callbackSetEnableAutoClearMessageBox,
             "callback_set_send_only_translated_messages": callbackSetEnableSendOnlyTranslatedMessages,
diff --git a/locales/en.yml b/locales/en.yml
index 2806ea91..f68aa32c 100644
--- a/locales/en.yml
+++ b/locales/en.yml
@@ -79,6 +79,7 @@ config_window:
     transcription: Transcription
     transcription_mic: Mic
     transcription_speaker: Speaker
+    transcription_internal_model: Internal Model
     others: Others
     others_send_message_formats: Message Formats (Send)
     others_received_message_formats: Message Formats (Received)
@@ -125,6 +126,21 @@ config_window:
     small: "Basic model (%{capacity})"
     large: "High accuracy model (%{capacity})"
 
+  use_whisper_feature:
+    label: Use Whisper Feature
+    desc: Description
+
+  whisper_weight_type:
+    label: Select Whisper Model
+    desc: Description
+    tiny: "tiny model (%{capacity})"
+    base: "base model (%{capacity})"
+    small: "small model (%{capacity})"
+    medium: "medium model (%{capacity})"
+    large_v1: "large_v1 model (%{capacity})"
+    large_v2: "large_v2 model (%{capacity})"
+    large_v3: "large_v3 model (%{capacity})"
+
   deepl_auth_key:
     label: DeepL Auth Key
     desc: Please select %{translator} on the main screen with DeepL_API when using. ※Some languages may not be supported.
diff --git a/view.py b/view.py
index cf90dcfa..6f7a6d7e 100644
--- a/view.py
+++ b/view.py
@@ -211,6 +211,7 @@ class View():
             VAR_SIDE_MENU_LABEL_TRANSCRIPTION=StringVar(value=i18n.t("config_window.side_menu_labels.transcription")),
             VAR_SECOND_TITLE_TRANSCRIPTION_MIC=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_mic")),
             VAR_SECOND_TITLE_TRANSCRIPTION_SPEAKER=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_speaker")),
+            VAR_SECOND_TITLE_TRANSCRIPTION_INTERNAL_MODEL=StringVar(value=i18n.t("config_window.side_menu_labels.transcription_internal_model")),
             VAR_SIDE_MENU_LABEL_OTHERS=StringVar(value=i18n.t("config_window.side_menu_labels.others")),
             VAR_SIDE_MENU_LABEL_ADVANCED_SETTINGS=StringVar(value=i18n.t("config_window.side_menu_labels.advanced_settings")),
 
@@ -381,6 +382,19 @@ class View():
             CALLBACK_FOCUS_OUT_SPEAKER_MAX_PHRASES=self.callbackBindFocusOut_SpeakerMaxPhrases,
 
 
+            # Transcription Tab (Whisper Internal AI Model)
+            VAR_LABEL_USE_WHISPER_FEATURE=StringVar(value=i18n.t("config_window.use_whisper_feature.label")),
+            VAR_DESC_USE_WHISPER_FEATURE=StringVar(value=i18n.t("config_window.use_whisper_feature.desc")),
+            CALLBACK_SET_USE_WHISPER_FEATURE=None,
+            VAR_USE_WHISPER_FEATURE=BooleanVar(value=config.USE_WHISPER_FEATURE),
+
+            VAR_LABEL_WHISPER_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.whisper_weight_type.label")),
+            VAR_DESC_WHISPER_WEIGHT_TYPE=StringVar(value=i18n.t("config_window.whisper_weight_type.desc")),
+            DICT_WHISPER_WEIGHT_TYPE=self.getSelectableWhisperWeightTypeDict(),
+            CALLBACK_SET_WHISPER_WEIGHT_TYPE=None,
+            VAR_WHISPER_WEIGHT_TYPE=StringVar(value=self.getSelectableWhisperWeightTypeDict()[config.WHISPER_WEIGHT_TYPE]),
+
+
             # Others Tab
             VAR_LABEL_ENABLE_AUTO_CLEAR_MESSAGE_BOX=StringVar(value=i18n.t("config_window.auto_clear_the_message_box.label")),
             VAR_DESC_ENABLE_AUTO_CLEAR_MESSAGE_BOX=None,
@@ -624,6 +638,11 @@ class View():
             self.view_variable.CALLBACK_SET_SPEAKER_PHRASE_TIMEOUT = config_window_registers.get("callback_set_speaker_phrase_timeout", None)
             self.view_variable.CALLBACK_SET_SPEAKER_MAX_PHRASES = config_window_registers.get("callback_set_speaker_max_phrases", None)
 
+            # Transcription Tab (Internal AI Model)
+            self.view_variable.CALLBACK_SET_USE_WHISPER_FEATURE = config_window_registers.get("callback_set_use_whisper_feature", None)
+            self.view_variable.CALLBACK_SET_WHISPER_WEIGHT_TYPE = config_window_registers.get("callback_set_whisper_weight_type", None)
+
+
             # Others Tab
             self.view_variable.CALLBACK_SET_ENABLE_AUTO_CLEAR_MESSAGE_BOX = config_window_registers.get("callback_set_enable_auto_clear_chatbox", None)
             self.view_variable.CALLBACK_SET_ENABLE_SEND_ONLY_TRANSLATED_MESSAGES = config_window_registers.get("callback_set_send_only_translated_messages", None)
@@ -678,6 +697,11 @@ class View():
             )
             self.replaceMicThresholdCheckButton_Disabled()
 
+        if config.USE_WHISPER_FEATURE is True:
+            self.openWhisperWeightTypeWidget()
+        else:
+            self.closeWhisperWeightTypeWidget()
+
         if config.ENABLE_SPEAKER2CHATBOX is False:
             vrct_gui._changeConfigWindowWidgetsStatus(
                 status="disabled",
@@ -919,6 +943,17 @@ class View():
         vrct_gui.update()
         vrct_gui.config_window.lift()
 
+    @staticmethod
+    def getSelectableWhisperWeightTypeDict():
+        return {
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="t"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="b"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="s"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="m"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="l_v1"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="l_v2"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="l_v3"),
+        }
 
 # Open Webpage Functions
     def openWebPage_Booth(self):
@@ -1082,6 +1117,23 @@ class View():
         vrct_gui.config_window.sb__ctranslate2_weight_type.grid_remove()
 
 
+    def openWhisperWeightTypeWidget(self):
+        vrct_gui.config_window.sb__use_whisper_feature.grid()
+        vrct_gui.config_window.sb__whisper_weight_type.grid()
+
+    def closeWhisperWeightTypeWidget(self):
+        vrct_gui.config_window.sb__use_whisper_feature.grid()
+        vrct_gui.config_window.sb__whisper_weight_type.grid_remove()
+
+
+    def updateSelectedWhisperWeightType(self, selected_weight_type:str):
+        self.view_variable.VAR_WHISPER_WEIGHT_TYPE.set(self.getSelectableWhisperWeightTypeDict()[selected_weight_type])
+
+    def setLatestWhisperWeightType(self):
+        """Set the Whisper weight-type dropdown variable to the label for config.WHISPER_WEIGHT_TYPE."""
+        selected_weight_type = self.getSelectableWhisperWeightTypeDict()[config.WHISPER_WEIGHT_TYPE]
+        self.view_variable.VAR_WHISPER_WEIGHT_TYPE.set(selected_weight_type)
+
     def openMicEnergyThresholdWidget(self):
         self.view_variable.VAR_LABEL_MIC_DYNAMIC_ENERGY_THRESHOLD.set(i18n.t("config_window.mic_dynamic_energy_threshold.label_for_manual"))
         self.view_variable.VAR_DESC_MIC_DYNAMIC_ENERGY_THRESHOLD.set(i18n.t("config_window.mic_dynamic_energy_threshold.desc_for_manual"))
diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py
index 30af50de..49272afc 100644
--- a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py
+++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/createSideMenuAndSettingsBoxContainers.py
@@ -7,7 +7,7 @@ from ._createSettingBoxContainer import _createSettingBoxContainer
 
 
 from .setting_box_containers.setting_box_appearance import createSettingBox_Appearance
-from .setting_box_containers.setting_box_transcription import createSettingBox_Mic, createSettingBox_Speaker
+from .setting_box_containers.setting_box_transcription import createSettingBox_Mic, createSettingBox_Speaker, createSettingBox_InternalModel
 from .setting_box_containers.setting_box_others import createSettingBox_Others, createSettingBox_Others_SendMessageFormats, createSettingBox_Others_ReceivedMessageFormats, createSettingBox_Others_Additional
 from .setting_box_containers.setting_box_advanced_settings import createSettingBox_AdvancedSettings
 from .setting_box_containers.setting_box_translation import createSettingBox_Translation
@@ -94,6 +94,10 @@ def createSideMenuAndSettingsBoxContainers(config_window, settings, view_variabl
                         "var_section_title": view_variable.VAR_SECOND_TITLE_TRANSCRIPTION_SPEAKER,
                         "setting_box": createSettingBox_Speaker
                     },
+                    {
+                        "var_section_title": view_variable.VAR_SECOND_TITLE_TRANSCRIPTION_INTERNAL_MODEL,
+                        "setting_box": createSettingBox_InternalModel
+                    },
                 ]
             },
         },
diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py
index 5383094e..b06ff822 100644
--- a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py
+++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/__init__.py
@@ -1,2 +1,3 @@
 from .createSettingBox_Mic import createSettingBox_Mic
-from .createSettingBox_Speaker import createSettingBox_Speaker
\ No newline at end of file
+from .createSettingBox_Speaker import createSettingBox_Speaker
+from .createSettingBox_InternalModel import createSettingBox_InternalModel
\ No newline at end of file
diff --git a/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py
new file mode 100644
index 00000000..0a6b3e69
--- /dev/null
+++ b/vrct_gui/config_window/widgets/createSideMenuAndSettingsBoxContainers/setting_box_containers/setting_box_transcription/createSettingBox_InternalModel.py
@@ -0,0 +1,37 @@
+from utils import callFunctionIfCallable
+
+from .._SettingBoxGenerator import _SettingBoxGenerator
+
+def createSettingBox_InternalModel(setting_box_wrapper, config_window, settings, view_variable):
+    """Build the "Internal Model" (Whisper) section of the Transcription settings tab.
+
+    Stores the switch and dropdown setting boxes on config_window and grids them in rows 0 and 1.
+    """
+    sbg = _SettingBoxGenerator(setting_box_wrapper, config_window, settings, view_variable)
+
+    def switchUseWhisperFeatureCallback(switch_widget):
+        # Forward the switch's current state to the registered callback, if any.
+        callFunctionIfCallable(view_variable.CALLBACK_SET_USE_WHISPER_FEATURE, switch_widget.get())
+
+    def optionmenuWhisperWeightTypeCallback(value):
+        # Forward the value chosen in the dropdown to the registered callback, if any.
+        callFunctionIfCallable(view_variable.CALLBACK_SET_WHISPER_WEIGHT_TYPE, value)
+
+    config_window.sb__use_whisper_feature = sbg.createSettingBoxSwitch(
+        for_var_label_text=view_variable.VAR_LABEL_USE_WHISPER_FEATURE,
+        for_var_desc_text=view_variable.VAR_DESC_USE_WHISPER_FEATURE,
+        switch_attr_name="sb__switch_use_whisper_feature",
+        command=lambda: switchUseWhisperFeatureCallback(config_window.sb__switch_use_whisper_feature),
+        variable=view_variable.VAR_USE_WHISPER_FEATURE
+    )
+    config_window.sb__use_whisper_feature.grid(row=0, pady=0)
+
+    config_window.sb__whisper_weight_type = sbg.createSettingBoxDropdownMenu(
+        for_var_label_text=view_variable.VAR_LABEL_WHISPER_WEIGHT_TYPE,
+        for_var_desc_text=view_variable.VAR_DESC_WHISPER_WEIGHT_TYPE,
+        optionmenu_attr_name="sb__optionmenu_whisper_weight_type",
+        dropdown_menu_values=view_variable.DICT_WHISPER_WEIGHT_TYPE,
+        command=optionmenuWhisperWeightTypeCallback,
+        variable=view_variable.VAR_WHISPER_WEIGHT_TYPE,
+    )
+    config_window.sb__whisper_weight_type.grid(row=1, pady=0)
\ No newline at end of file

From 801d948513b1dd0d891f3b85cf582f524f537063 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Sat, 3 Feb 2024 02:35:40 +0900
Subject: [PATCH 09/11] =?UTF-8?q?[WIP/TEST]=20Whisper=E3=81=AE=E5=87=A6?=
 =?UTF-8?q?=E7=90=86=E3=81=AB=E3=81=A4=E3=81=84=E3=81=A6UI=E3=81=A8?=
 =?UTF-8?q?=E5=86=85=E9=83=A8=E3=81=AE=E5=87=A6=E7=90=86=E3=82=92=E6=8E=A5?=
 =?UTF-8?q?=E7=B6=9A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                                     | 24 ++++++++++-----
 controller.py                                 | 30 +++++++++----------
 model.py                                      | 12 +++++---
 .../transcription_transcriber.py              | 28 ++++++++---------
 models/transcription/transcription_whisper.py |  5 ++--
 view.py                                       |  4 ++-
 6 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/config.py b/config.py
index c59c0f17..6ce32035 100644
--- a/config.py
+++ b/config.py
@@ -210,6 +210,15 @@ class Config:
         if isinstance(value, bool):
             self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = value
 
+    @property
+    def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self):
+        return self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER
+
+    @IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER.setter
+    def IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER(self, value):
+        if isinstance(value, bool):
+            self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = value
+
     # Save Json Data
     ## Main Window
     @property
@@ -268,14 +277,14 @@ class Config:
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
-    @json_serializable('SELECTED_RECOGNIZER')
-    def SELECTED_RECOGNIZER(self):
-        return self._SELECTED_RECOGNIZER
+    @json_serializable('SELECTED_TRANSCRIPTION_ENGINE')
+    def SELECTED_TRANSCRIPTION_ENGINE(self):
+        return self._SELECTED_TRANSCRIPTION_ENGINE
 
-    @SELECTED_RECOGNIZER.setter
-    def SELECTED_RECOGNIZER(self, value):
+    @SELECTED_TRANSCRIPTION_ENGINE.setter
+    def SELECTED_TRANSCRIPTION_ENGINE(self, value):
         if isinstance(value, str):
-            self._SELECTED_RECOGNIZER = value
+            self._SELECTED_TRANSCRIPTION_ENGINE = value
             saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
 
     @property
@@ -820,6 +829,7 @@ class Config:
         self._TARGET_LANGUAGE = "English"
         self._TARGET_COUNTRY = "United States"
         self._IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False
+        self._IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False
 
         # Save Json Data
         ## Main Window
@@ -844,7 +854,7 @@ class Config:
             "2":"English\n(United States)",
             "3":"English\n(United States)",
         }
-        self._SELECTED_RECOGNIZER = "Google"
+        self._SELECTED_TRANSCRIPTION_ENGINE = "Google"
         self._IS_MAIN_WINDOW_SIDEBAR_COMPACT_MODE = False
 
         ## Config Window
diff --git a/controller.py b/controller.py
index 724d2cf3..e63101b2 100644
--- a/controller.py
+++ b/controller.py
@@ -773,29 +773,27 @@ def callbackSetUserWhisperFeature(value):
     config.USE_WHISPER_FEATURE = value
     if config.USE_WHISPER_FEATURE is True:
         view.openWhisperWeightTypeWidget()
+        if model.checkTranscriptionWhisperModelWeight() is True:
+            config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False
+            config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper"
+        else:
+            config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True
+            config.SELECTED_TRANSCRIPTION_ENGINE = "Google"
     else:
         view.closeWhisperWeightTypeWidget()
+    view.showRestartButtonIfRequired()
 
 def callbackSetWhisperWeightType(value):
     print("callbackSetWhisperWeightType", value)
     config.WHISPER_WEIGHT_TYPE = str(value)
     view.updateSelectedWhisperWeightType(config.WHISPER_WEIGHT_TYPE)
-    # view.setWidgetsStatus_changeWeightType_Pending()
-    # if model.checkCTranslatorCTranslate2ModelWeight():
-    #     config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = False
-    #     def callback():
-    #         model.changeTranslatorCTranslate2Model()
-    #         view.useTranslationFeatureProcess("Normal")
-    #         view.setWidgetsStatus_changeWeightType_Done()
-    #     th_callback = Thread(target=callback)
-    #     th_callback.daemon = True
-    #     th_callback.start()
-    # else:
-    #     config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION = True
-    #     view.useTranslationFeatureProcess("Restart")
-    #     view.setWidgetsStatus_changeWeightType_Done()
-    # view.showRestartButtonIfRequired()
-
+    if model.checkTranscriptionWhisperModelWeight() is True:
+        config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = False
+        config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper"
+    else:
+        config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER = True
+        config.SELECTED_TRANSCRIPTION_ENGINE = "Google"
+    view.showRestartButtonIfRequired()
 
 # Others Tab
 def callbackSetEnableAutoClearMessageBox(value):
diff --git a/model.py b/model.py
index 2c29d4c7..228bc253 100644
--- a/model.py
+++ b/model.py
@@ -24,6 +24,7 @@ from models.xsoverlay.notification import xsoverlayForVRCT
 from models.translation.translation_languages import translation_lang
 from models.transcription.transcription_languages import transcription_lang
 from models.translation.translation_utils import checkCTranslate2Weight
+from models.transcription.transcription_whisper import checkWhisperWeight
 from config import config
 
 class threadFnc(Thread):
@@ -74,6 +75,9 @@ class Model:
     def changeTranslatorCTranslate2Model(self):
         self.translator.changeCTranslate2Model(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE)
 
+    def checkTranscriptionWhisperModelWeight(self):
+        return checkWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE)
+
     def resetKeywordProcessor(self):
         del self.keyword_processor
         self.keyword_processor = KeywordProcessor()
@@ -335,12 +339,12 @@ class Model:
             source=self.mic_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_MIC_MAX_PHRASES,
-            whisper_enabled=config.USE_WHISPER_FEATURE,
+            transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
         )
         def sendMicTranscript():
-            mic_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
+            mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
             message = mic_transcriber.getTranscript()
             try:
                 fnc(message)
@@ -419,12 +423,12 @@ class Model:
             source=self.speaker_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_SPEAKER_MAX_PHRASES,
-            whisper_enabled=config.USE_WHISPER_FEATURE,
+            transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE,
             whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
         )
         def sendSpeakerTranscript():
-            speaker_transcriber.transcribeAudioQueue(config.SELECTED_RECOGNIZER, speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
+            speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
             message = speaker_transcriber.getTranscript()
             try:
                 fnc(message)
diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index 0f5b1790..b24d3163 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
 class AudioTranscriber:
-    def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, root):
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None):
         self.speaker = speaker
         self.phrase_timeout = phrase_timeout
         self.max_phrases = max_phrases
@@ -30,38 +30,34 @@ class AudioTranscriber:
                 "new_phrase": True,
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
-        if whisper_enabled is True:
-            self.whisper_model = getWhisperModel(root, whisper_weight_type)
-        else:
-            self.whisper_model = None
+        self.transcription_engine = transcription_engine
+        match self.transcription_engine:
+            case "Google":
+                self.audio_recognizer = Recognizer()
+            case "Whisper":
+                self.audio_recognizer = getWhisperModel(root, whisper_weight_type)
 
-    def transcribeAudioQueue(self, recognizer, audio_queue, language, country):
-        # while True:
+    def transcribeAudioQueue(self, audio_queue, language, country):
         audio, time_spoken = audio_queue.get()
         self.updateLastSampleAndPhraseStatus(audio, time_spoken)
 
         text = ''
         try:
-            # Whisperが使用できない場合はGoogle Speech-to-Textを使用する
-            if recognizer == "Whisper":
-                if self.whisper_model is None:
-                    recognizer = "Google"
-
             audio_data = self.audio_sources["process_data_func"]()
-            match recognizer:
+            match self.transcription_engine:
                 case "Google":
-                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer])
+                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine])
                 case "Whisper":
                     audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
                     if isinstance(audio_data, torch.Tensor):
                         audio_data = audio_data.detach().numpy()
-                    segments, _ = self.whisper_model.transcribe(
+                    segments, _ = self.audio_recognizer.transcribe(
                         audio_data,
                         beam_size=5,
                         temperature=0.0,
                         log_prob_threshold=-0.8,
                         no_speech_threshold=0.6,
-                        language=transcription_lang[language][country][recognizer],
+                        language=transcription_lang[language][country][self.transcription_engine],
                         word_timestamps=False,
                         without_timestamps=True,
                         task="transcribe",
diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
index e30fee2d..c6412d35 100644
--- a/models/transcription/transcription_whisper.py
+++ b/models/transcription/transcription_whisper.py
@@ -42,7 +42,8 @@ def downloadFile(url, path, func=None):
     except Exception as e:
             print("error:downloadFile()", e)
 
-def checkWhisperWeight(path):
+def checkWhisperWeight(root, weight_type):
+    path = os_path.join(root, "weights", "whisper", weight_type)
     result = False
     try:
         WhisperModel(
@@ -62,7 +63,7 @@ def checkWhisperWeight(path):
 def downloadWhisperWeight(root, weight_type, callbackFunc):
     path = os_path.join(root, "weights", "whisper", weight_type)
     os_makedirs(path, exist_ok=True)
-    if checkWhisperWeight(path) is True:
+    if checkWhisperWeight(root, weight_type) is True:
         return
 
     for filename in _FILENAMES:
diff --git a/view.py b/view.py
index 6f7a6d7e..84ebd550 100644
--- a/view.py
+++ b/view.py
@@ -29,6 +29,7 @@ class View():
             font_family=config.FONT_FAMILY,
             ui_language=config.UI_LANGUAGE,
             is_reset_button_displayed_for_translation=config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION,
+            is_reset_button_displayed_for_whisper=config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER,
         )
 
         if config.ENABLE_SPEAKER2CHATBOX is False:
@@ -1049,7 +1050,8 @@ class View():
             self.restart_required_configs_pre_data.ui_scaling == config.UI_SCALING and
             self.restart_required_configs_pre_data.font_family == config.FONT_FAMILY and
             self.restart_required_configs_pre_data.ui_language == config.UI_LANGUAGE and
-            self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION
+            self.restart_required_configs_pre_data.is_reset_button_displayed_for_translation == config.IS_RESET_BUTTON_DISPLAYED_FOR_TRANSLATION and
+            self.restart_required_configs_pre_data.is_reset_button_displayed_for_whisper == config.IS_RESET_BUTTON_DISPLAYED_FOR_WHISPER
         )
 
         if locale is None:

From 7aafce6e2e78187086db7602f9de3f48e270847b Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Sun, 4 Feb 2024 01:03:38 +0900
Subject: [PATCH 10/11] =?UTF-8?q?[WIP/TEST]=20distil-whisper=E3=81=AE?=
 =?UTF-8?q?=E5=87=A6=E7=90=86=E3=82=92=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                                     |  3 +++
 controller.py                                 |  6 ++++++
 locales/en.yml                                |  3 +++
 main.py                                       |  2 +-
 models/transcription/transcription_whisper.py |  3 +++
 view.py                                       | 17 ++++++++++-------
 6 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/config.py b/config.py
index 6ce32035..ff1f7263 100644
--- a/config.py
+++ b/config.py
@@ -812,6 +812,9 @@ class Config:
             "large-v1": "large-v1",
             "large-v2": "large-v2",
             "large-v3": "large-v3",
+            "distil-small": "distil-small",
+            "distil-medium": "distil-medium",
+            "distil-large-v2": "distil-large-v2",
         }
 
         self._MAX_MIC_ENERGY_THRESHOLD = 2000
diff --git a/controller.py b/controller.py
index e63101b2..e5b747d4 100644
--- a/controller.py
+++ b/controller.py
@@ -925,6 +925,12 @@ def createMainWindow(splash):
     # set Translation Engine
     updateTranslationEngineAndEngineList()
 
+    # set Transcription Engine
+    if config.USE_WHISPER_FEATURE is True:
+        config.SELECTED_TRANSCRIPTION_ENGINE = "Whisper"
+    else:
+        config.SELECTED_TRANSCRIPTION_ENGINE = "Google"
+
     # set word filter
     model.addKeywords()
 
diff --git a/locales/en.yml b/locales/en.yml
index f68aa32c..c799c9d0 100644
--- a/locales/en.yml
+++ b/locales/en.yml
@@ -140,6 +140,9 @@ config_window:
     large_v1: "large_v1 model (%{capacity})"
     large_v2: "large_v2 model (%{capacity})"
     large_v3: "large_v3 model (%{capacity})"
+    distil_small: "distil-small model (%{capacity})"
+    distil_medium: "distil-medium model (%{capacity})"
+    distil_large_v2: "distil-large-v2 model (%{capacity})"
 
   deepl_auth_key:
     label: DeepL Auth Key
diff --git a/main.py b/main.py
index 0df15326..6b6c0e3e 100644
--- a/main.py
+++ b/main.py
@@ -13,9 +13,9 @@ if __name__ == "__main__":
             downloadCTranslate2Weight(config.PATH_LOCAL, config.CTRANSLATE2_WEIGHT_TYPE, splash.updateDownloadProgress)
 
         from models.transcription.transcription_whisper import downloadWhisperWeight
-        # whisperのダウンロードの説明に変更する必要あり
         if config.USE_WHISPER_FEATURE is True:
             downloadWhisperWeight(config.PATH_LOCAL, config.WHISPER_WEIGHT_TYPE, splash.updateDownloadProgress)
+
         splash.toProgress(0)
 
         import controller
diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
index c6412d35..148b2edb 100644
--- a/models/transcription/transcription_whisper.py
+++ b/models/transcription/transcription_whisper.py
@@ -15,6 +15,9 @@ _MODELS = {
     "large-v1": "Systran/faster-whisper-large-v1",
     "large-v2": "Systran/faster-whisper-large-v2",
     "large-v3": "Systran/faster-whisper-large-v3",
+    "distil-small": "Systran/faster-distil-whisper-small.en",
+    "distil-medium": "Systran/faster-distil-whisper-medium.en",
+    "distil-large-v2": "Systran/faster-distil-whisper-large-v2"
 }
 
 _FILENAMES = [
diff --git a/view.py b/view.py
index 84ebd550..1efb3f22 100644
--- a/view.py
+++ b/view.py
@@ -947,13 +947,16 @@ class View():
     @staticmethod
     def getSelectableWhisperWeightTypeDict():
         return {
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="t"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="b"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="s"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="m"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="l_v1"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="l_v2"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="l_v3"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["tiny"]: i18n.t("config_window.whisper_weight_type.tiny", capacity="74.5MB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["base"]: i18n.t("config_window.whisper_weight_type.base", capacity="141MB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["small"]: i18n.t("config_window.whisper_weight_type.small", capacity="463MB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["medium"]: i18n.t("config_window.whisper_weight_type.medium", capacity="1.42GB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="2.87GB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="2.87GB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="2.87GB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-small"]: i18n.t("config_window.whisper_weight_type.distil_small", capacity="319MB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-medium"]: i18n.t("config_window.whisper_weight_type.distil_medium", capacity="755MB"),
+            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-large-v2"]: i18n.t("config_window.whisper_weight_type.distil_large_v2", capacity="1.41GB"),
         }
 
 # Open Webpage Functions

From 61a6eb792b2a8c3aad67f3ef94b67f898561636d Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Sun, 4 Feb 2024 02:42:08 +0900
Subject: [PATCH 11/11] =?UTF-8?q?[WIP/TEST]=20distil-whisper=E3=82=92?=
 =?UTF-8?q?=E5=89=8A=E9=99=A4/faster-whisper=E3=81=AE=E5=87=A6=E7=90=86?=
 =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config.py                                     |  5 +---
 locales/en.yml                                |  3 --
 model.py                                      | 10 +++----
 .../transcription_transcriber.py              | 29 ++++++++++---------
 models/transcription/transcription_whisper.py |  3 --
 view.py                                       |  3 --
 6 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/config.py b/config.py
index ff1f7263..55ba2d40 100644
--- a/config.py
+++ b/config.py
@@ -812,9 +812,6 @@ class Config:
             "large-v1": "large-v1",
             "large-v2": "large-v2",
             "large-v3": "large-v3",
-            "distil-small": "distil-small",
-            "distil-medium": "distil-medium",
-            "distil-large-v2": "distil-large-v2",
         }
 
         self._MAX_MIC_ENERGY_THRESHOLD = 2000
@@ -895,7 +892,7 @@ class Config:
         }
         self._USE_TRANSLATION_FEATURE = True
         self._CTRANSLATE2_WEIGHT_TYPE = "Small"
-        self._USE_WHISPER_FEATURE = True
+        self._USE_WHISPER_FEATURE = False
         self._WHISPER_WEIGHT_TYPE = "base"
         self._SEND_MESSAGE_FORMAT = "[message]"
         self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])"
diff --git a/locales/en.yml b/locales/en.yml
index c799c9d0..f68aa32c 100644
--- a/locales/en.yml
+++ b/locales/en.yml
@@ -140,9 +140,6 @@ config_window:
     large_v1: "large_v1 model (%{capacity})"
     large_v2: "large_v2 model (%{capacity})"
     large_v3: "large_v3 model (%{capacity})"
-    distil_small: "distil-small model (%{capacity})"
-    distil_medium: "distil-medium model (%{capacity})"
-    distil_large_v2: "distil-large-v2 model (%{capacity})"
 
   deepl_auth_key:
     label: DeepL Auth Key
diff --git a/model.py b/model.py
index 228bc253..5b17e167 100644
--- a/model.py
+++ b/model.py
@@ -339,12 +339,11 @@ class Model:
             source=self.mic_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_MIC_MAX_PHRASES,
-            transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE,
-            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
+            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
         )
         def sendMicTranscript():
-            mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
+            mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE)
             message = mic_transcriber.getTranscript()
             try:
                 fnc(message)
@@ -423,12 +422,11 @@ class Model:
             source=self.speaker_audio_recorder.source,
             phrase_timeout=phase_timeout,
             max_phrases=config.INPUT_SPEAKER_MAX_PHRASES,
-            transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE,
-            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
             root=config.PATH_LOCAL,
+            whisper_weight_type=config.WHISPER_WEIGHT_TYPE,
         )
         def sendSpeakerTranscript():
-            speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
+            speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE)
             message = speaker_transcriber.getTranscript()
             try:
                 fnc(message)
diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index b24d3163..08cc6a1a 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -5,7 +5,7 @@ from speech_recognition import Recognizer, AudioData, AudioFile
 from datetime import timedelta
 from pyaudiowpatch import get_sample_size, paInt16
 from .transcription_languages import transcription_lang
-from .transcription_whisper import getWhisperModel
+from .transcription_whisper import getWhisperModel, checkWhisperWeight
 
 import torch
 import numpy as np
@@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
 class AudioTranscriber:
-    def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None):
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, root=None, whisper_weight_type=None, ):
         self.speaker = speaker
         self.phrase_timeout = phrase_timeout
         self.max_phrases = max_phrases
@@ -30,34 +30,37 @@ class AudioTranscriber:
                 "new_phrase": True,
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
-        self.transcription_engine = transcription_engine
-        match self.transcription_engine:
-            case "Google":
-                self.audio_recognizer = Recognizer()
-            case "Whisper":
-                self.audio_recognizer = getWhisperModel(root, whisper_weight_type)
+        if whisper_weight_type is not None and root is not None and checkWhisperWeight(root, whisper_weight_type) is True:
+            self.whisper_model = getWhisperModel(root, whisper_weight_type)
+        else:
+            self.whisper_model = None
 
-    def transcribeAudioQueue(self, audio_queue, language, country):
+    def transcribeAudioQueue(self, audio_queue, language, country, transcription_engine):
         audio, time_spoken = audio_queue.get()
         self.updateLastSampleAndPhraseStatus(audio, time_spoken)
 
         text = ''
         try:
+            # Fall back to Google Speech-to-Text when Whisper is unavailable
+            if transcription_engine == "Whisper":
+                if self.whisper_model is None:
+                    transcription_engine = "Google"
+
             audio_data = self.audio_sources["process_data_func"]()
-            match self.transcription_engine:
+            match transcription_engine:
                 case "Google":
-                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine])
+                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][transcription_engine])
                 case "Whisper":
                     audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
                     if isinstance(audio_data, torch.Tensor):
                         audio_data = audio_data.detach().numpy()
-                    segments, _ = self.audio_recognizer.transcribe(
+                    segments, _ = self.whisper_model.transcribe(
                         audio_data,
                         beam_size=5,
                         temperature=0.0,
                         log_prob_threshold=-0.8,
                         no_speech_threshold=0.6,
-                        language=transcription_lang[language][country][self.transcription_engine],
+                        language=transcription_lang[language][country][transcription_engine],
                         word_timestamps=False,
                         without_timestamps=True,
                         task="transcribe",
diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py
index 148b2edb..c6412d35 100644
--- a/models/transcription/transcription_whisper.py
+++ b/models/transcription/transcription_whisper.py
@@ -15,9 +15,6 @@ _MODELS = {
     "large-v1": "Systran/faster-whisper-large-v1",
     "large-v2": "Systran/faster-whisper-large-v2",
     "large-v3": "Systran/faster-whisper-large-v3",
-    "distil-small": "Systran/faster-distil-whisper-small.en",
-    "distil-medium": "Systran/faster-distil-whisper-medium.en",
-    "distil-large-v2": "Systran/faster-distil-whisper-large-v2"
 }
 
 _FILENAMES = [
diff --git a/view.py b/view.py
index 1efb3f22..94a4af8c 100644
--- a/view.py
+++ b/view.py
@@ -954,9 +954,6 @@ class View():
             config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="2.87GB"),
             config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="2.87GB"),
             config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="2.87GB"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-small"]: i18n.t("config_window.whisper_weight_type.distil_small", capacity="319MB"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-medium"]: i18n.t("config_window.whisper_weight_type.distil_medium", capacity="755MB"),
-            config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-large-v2"]: i18n.t("config_window.whisper_weight_type.distil_large_v2", capacity="1.41GB"),
         }
 
 # Open Webpage Functions