👍️ [Update] pythonのメイン処理部分を移動/webui_mainloop.pyをビルドできるように修正

2024-07-27 01:30:36 +09:00
parent 7ce3bc9be9
commit 1be04cb571
21 changed files with 46 additions and 28 deletions
--- a/src-python/models/transcription/transcription_languages.py
+++ b/src-python/models/transcription/transcription_languages.py
@@ -0,0 +1,730 @@
+# Recognizer codes per language and country, stored as (Google, Whisper) pairs.
+# Entries written as comments are disabled; each of them lists a Google code
+# only and no Whisper code.
+_LANGUAGE_CODES = {
+    "Afrikaans": {"South Africa": ("af-ZA", "af")},
+    "Albanian": {"Albania": ("sq-AL", "sq")},
+    "Amharic": {"Ethiopia": ("am-ET", "am")},
+    "Arabic": {
+        "Algeria": ("ar-DZ", "ar"),
+        "Bahrain": ("ar-BH", "ar"),
+        "Egypt": ("ar-EG", "ar"),
+        "Israel": ("ar-IL", "ar"),
+        "Iraq": ("ar-IQ", "ar"),
+        "Jordan": ("ar-JO", "ar"),
+        "Kuwait": ("ar-KW", "ar"),
+        "Lebanon": ("ar-LB", "ar"),
+        "Mauritania": ("ar-MR", "ar"),
+        "Morocco": ("ar-MA", "ar"),
+        "Oman": ("ar-OM", "ar"),
+        "Qatar": ("ar-QA", "ar"),
+        "Saudi Arabia": ("ar-SA", "ar"),
+        "Palestine": ("ar-PS", "ar"),
+        "Syria": ("ar-SY", "ar"),
+        "Tunisia": ("ar-TN", "ar"),
+        "United Arab Emirates": ("ar-AE", "ar"),
+        "Yemen": ("ar-YE", "ar"),
+    },
+    "Armenian": {"Armenia": ("hy-AM", "hy")},
+    "Azerbaijani": {"Azerbaijan": ("az-AZ", "az")},
+    "Basque": {"Spain": ("eu-ES", "eu")},
+    "Bengali": {
+        "Bangladesh": ("bn-BD", "bn"),
+        "India": ("bn-IN", "bn"),
+    },
+    "Bosnian": {"Bosnia and Herzegovina": ("bs-BA", "bs")},
+    "Bulgarian": {"Bulgaria": ("bg-BG", "bg")},
+    "Burmese": {"Myanmar": ("my-MM", "my")},
+    "Catalan": {"Spain": ("ca-ES", "ca")},
+    "Chinese Simplified": {
+        "China": ("cmn-Hans-CN", "zh"),
+        "Hong Kong": ("cmn-Hans-HK", "zh"),
+    },
+    "Chinese Traditional": {
+        "Taiwan": ("cmn-Hant-TW", "zh"),
+        "Hong Kong": ("yue-Hant-HK", "yue"),
+    },
+    "Croatian": {"Croatia": ("hr-HR", "hr")},
+    "Czech": {"Czech Republic": ("cs-CZ", "cs")},
+    "Danish": {"Denmark": ("da-DK", "da")},
+    "Dutch": {
+        "Belgium": ("nl-BE", "nl"),
+        "Netherlands": ("nl-NL", "nl"),
+    },
+    "English": {
+        "Australia": ("en-AU", "en"),
+        "Canada": ("en-CA", "en"),
+        "Ghana": ("en-GH", "en"),
+        "Hong Kong": ("en-HK", "en"),
+        "India": ("en-IN", "en"),
+        "Ireland": ("en-IE", "en"),
+        "Kenya": ("en-KE", "en"),
+        "New Zealand": ("en-NZ", "en"),
+        "Nigeria": ("en-NG", "en"),
+        "Philippines": ("en-PH", "en"),
+        "Singapore": ("en-SG", "en"),
+        "South Africa": ("en-ZA", "en"),
+        "Tanzania": ("en-TZ", "en"),
+        "United Kingdom": ("en-GB", "en"),
+        "United States": ("en-US", "en"),
+    },
+    "Estonian": {"Estonia": ("et-EE", "et")},
+    "Filipino": {"Philippines": ("fil-PH", "tl")},
+    "Finnish": {"Finland": ("fi-FI", "fi")},
+    "French": {
+        "Belgium": ("fr-BE", "fr"),
+        "Canada": ("fr-CA", "fr"),
+        "France": ("fr-FR", "fr"),
+        "Switzerland": ("fr-CH", "fr"),
+    },
+    "Galician": {"Spain": ("gl-ES", "gl")},
+    "Georgian": {"Georgia": ("ka-GE", "ka")},
+    "German": {
+        "Austria": ("de-AT", "de"),
+        "Germany": ("de-DE", "de"),
+        "Switzerland": ("de-CH", "de"),
+    },
+    "Greek": {"Greece": ("el-GR", "el")},
+    "Gujarati": {"India": ("gu-IN", "gu")},
+    "Hebrew": {"Israel": ("iw-IL", "he")},
+    "Hindi": {"India": ("hi-IN", "hi")},
+    "Hungarian": {"Hungary": ("hu-HU", "hu")},
+    "Icelandic": {"Iceland": ("is-IS", "is")},
+    "Indonesian": {"Indonesia": ("id-ID", "id")},
+    "Italian": {
+        "Italy": ("it-IT", "it"),
+        "Switzerland": ("it-CH", "it"),
+    },
+    "Japanese": {"Japan": ("ja-JP", "ja")},
+    # Disabled: "Javanese" / "Indonesia" -> Google "jv-ID"
+    "Kannada": {"India": ("kn-IN", "kn")},
+    "Kazakh": {"Kazakhstan": ("kk-KZ", "kk")},
+    "Khmer": {"Cambodia": ("km-KH", "km")},
+    # Disabled: "Kinyarwanda" / "rwanda" -> Google "rw-RW"
+    "Korean": {"South Korea": ("ko-KR", "ko")},
+    "Lao": {"Laos": ("lo-LA", "lo")},
+    "Latvian": {"Latvia": ("lv-LV", "lv")},
+    "Lithuanian": {"Lithuania": ("lt-LT", "lt")},
+    "Macedonian": {"North Macedonia": ("mk-MK", "mk")},
+    "Malay": {"Malaysia": ("ms-MY", "ms")},
+    "Malayalam": {"India": ("ml-IN", "ml")},
+    "Mongolian": {"Mongolia": ("mn-MN", "mn")},
+    "Nepali": {"Nepal": ("ne-NP", "ne")},
+    "Norwegian": {"Norway": ("no-NO", "no")},
+    "Persian": {"Iran": ("fa-IR", "fa")},
+    "Polish": {"Poland": ("pl-PL", "pl")},
+    "Portuguese": {
+        "Brazil": ("pt-BR", "pt"),
+        "Portugal": ("pt-PT", "pt"),
+    },
+    # Disabled: "Punjabi" / "India" -> Google "pa-Guru-IN"
+    "Romanian": {"Romania": ("ro-RO", "ro")},
+    "Russian": {"Russia": ("ru-RU", "ru")},
+    "Serbian": {"Serbia": ("sr-RS", "sr")},
+    "Sinhala": {"Sri Lanka": ("si-LK", "si")},
+    "Slovak": {"Slovakia": ("sk-SK", "sk")},
+    "Slovenian": {"Slovenia": ("sl-SI", "sl")},
+    # Disabled: "Sesotho" / "South Africa" -> Google "st-ZA"
+    "Spanish": {
+        "Argentina": ("es-AR", "es"),
+        "Bolivia": ("es-BO", "es"),
+        "Chile": ("es-CL", "es"),
+        "Colombia": ("es-CO", "es"),
+        "Costa Rica": ("es-CR", "es"),
+        "Dominican Republic": ("es-DO", "es"),
+        "Ecuador": ("es-EC", "es"),
+        "El Salvador": ("es-SV", "es"),
+        "Guatemala": ("es-GT", "es"),
+        "Honduras": ("es-HN", "es"),
+        "Mexico": ("es-MX", "es"),
+        "Nicaragua": ("es-NI", "es"),
+        "Panama": ("es-PA", "es"),
+        "Paraguay": ("es-PY", "es"),
+        "Peru": ("es-PE", "es"),
+        "Puerto Rico": ("es-PR", "es"),
+        "Spain": ("es-ES", "es"),
+        "United States": ("es-US", "es"),
+        "Uruguay": ("es-UY", "es"),
+        "Venezuela": ("es-VE", "es"),
+    },
+    "Sundanese": {"Indonesia": ("su-ID", "su")},
+    "Swahili": {
+        "Kenya": ("sw-KE", "sw"),
+        "Tanzania": ("sw-TZ", "sw"),
+    },
+    # Disabled: "Swazi" / "Eswatini" -> Google "ss-Latn-ZA"
+    "Swedish": {"Sweden": ("sv-SE", "sv")},
+    "Tamil": {
+        "India": ("ta-IN", "ta"),
+        # NOTE(review): this key is lowercase while every other country key is
+        # capitalized; kept verbatim because it is a runtime lookup key.
+        "malaysia": ("ta-MY", "ta"),
+        "Singapore": ("ta-SG", "ta"),
+        "Sri Lanka": ("ta-LK", "ta"),
+    },
+    "Telugu": {"India": ("te-IN", "te")},
+    "Thai": {"Thailand": ("th-TH", "th")},
+    # Disabled: "Tsonga" / "South Africa" -> Google "ts-ZA"
+    # Disabled: "Setswana" / "South Africa" -> Google "tn-Latn-ZA"
+    "Turkish": {"Turkey": ("tr-TR", "tr")},
+    "Ukrainian": {"Ukraine": ("uk-UA", "uk")},
+    "Urdu": {
+        "India": ("ur-IN", "ur"),
+        "Pakistan": ("ur-PK", "ur"),
+    },
+    "Uzbek": {"Uzbekistan": ("uz-UZ", "uz")},
+    # Disabled: "Venda" / "South Africa" -> Google "ve-ZA"
+    "Vietnamese": {"Vietnam": ("vi-VN", "vi")},
+    # Disabled: "Xhosa" / "South Africa" -> Google "xh-ZA"
+    # Disabled: "Zulu" / "South Africa" -> Google "zu-ZA"
+}
+
+# Lookup table: transcription_lang[language][country][engine] -> code, where
+# engine is "Google" or "Whisper". Key order follows _LANGUAGE_CODES.
+transcription_lang = {
+    language: {
+        country: {"Google": google_code, "Whisper": whisper_code}
+        for country, (google_code, whisper_code) in countries.items()
+    }
+    for language, countries in _LANGUAGE_CODES.items()
+}
--- a/src-python/models/transcription/transcription_recorder.py
+++ b/src-python/models/transcription/transcription_recorder.py
@@ -0,0 +1,142 @@
+from speech_recognition import Recognizer, Microphone
+from pyaudiowpatch import get_sample_size, paInt16
+from datetime import datetime
+from queue import Queue
+
+class BaseRecorder:
+    """Record audio phrases from a source and push them onto a queue.
+
+    Owns a ``Recognizer`` configured with the supplied energy settings and
+    starts its background listener on ``source`` when asked.
+    """
+
+    def __init__(self, source, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Store the source and configure the recognizer.
+
+        Args:
+            source: Audio source handed to the recognizer; must not be None.
+            energy_threshold: Value assigned to ``Recognizer.energy_threshold``.
+            dynamic_energy_threshold: Value assigned to
+                ``Recognizer.dynamic_energy_threshold``.
+            record_timeout: Passed as ``phrase_time_limit`` when listening.
+
+        Raises:
+            ValueError: If ``source`` is None.
+        """
+        # Reject a missing source before any recognizer is allocated.
+        if source is None:
+            raise ValueError("audio source can't be None")
+
+        self.source = source
+        self.recorder = Recognizer()
+        self.recorder.energy_threshold = energy_threshold
+        self.recorder.dynamic_energy_threshold = dynamic_energy_threshold
+        self.record_timeout = record_timeout
+        # Listener controls; populated by recordIntoQueue(). Defined here so
+        # reading them before recording starts yields None, not AttributeError.
+        self.stop = None
+        self.pause = None
+        self.resume = None
+
+    def adjustForNoise(self):
+        """Open the source and let the recognizer calibrate to ambient noise."""
+        with self.source:
+            self.recorder.adjust_for_ambient_noise(self.source)
+
+    def recordIntoQueue(self, audio_queue):
+        """Start background listening and enqueue each captured phrase.
+
+        Every item put on ``audio_queue`` is a ``(raw_bytes, datetime)`` tuple:
+        the phrase's raw audio data and the local time the callback ran.
+        """
+        def record_callback(_, audio):
+            audio_queue.put((audio.get_raw_data(), datetime.now()))
+
+        # NOTE(review): three values are unpacked, so the Recognizer in use must
+        # return (stop, pause, resume) from listen_in_background - confirm
+        # against the bundled speech_recognition build.
+        self.stop, self.pause, self.resume = self.recorder.listen_in_background(
+            self.source,
+            record_callback,
+            phrase_time_limit=self.record_timeout,
+        )
+
+class SelectedMicRecorder(BaseRecorder):
+    """BaseRecorder bound to a chosen microphone device."""
+
+    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Open ``device`` as a Microphone and pass it to BaseRecorder.
+
+        ``device`` is read through its "index" and "defaultSampleRate" keys.
+        """
+        microphone = Microphone(
+            device_index=device['index'],
+            sample_rate=int(device["defaultSampleRate"]),
+        )
+        super().__init__(
+            microphone,
+            energy_threshold,
+            dynamic_energy_threshold,
+            record_timeout,
+        )
+        # adjustForNoise() is not run at construction; the call is left disabled.
+
+class SelectedSpeakerRecorder(BaseRecorder):
+    """BaseRecorder bound to a chosen speaker (output) device."""
+
+    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Open ``device`` through Microphone(speaker=True) and pass it on.
+
+        ``device`` is read through its "index", "defaultSampleRate" and
+        "maxInputChannels" keys.
+        """
+        device_index = device["index"]
+        sample_rate = int(device["defaultSampleRate"])
+        # NOTE(review): chunk_size is set to the result of
+        # get_sample_size(paInt16) - confirm this is the intended chunk size.
+        chunk_size = get_sample_size(paInt16)
+        channels = device["maxInputChannels"]
+        loopback_source = Microphone(
+            speaker=True,
+            device_index=device_index,
+            sample_rate=sample_rate,
+            chunk_size=chunk_size,
+            channels=channels,
+        )
+        super().__init__(
+            loopback_source,
+            energy_threshold,
+            dynamic_energy_threshold,
+            record_timeout,
+        )
+        # adjustForNoise() is not run at construction; the call is left disabled.
+
+class BaseEnergyRecorder:
+    """Stream energy readings from a source onto a queue.
+
+    The recognizer is configured with an energy threshold of 0, dynamic
+    thresholding disabled and a record timeout of 0.
+    """
+
+    def __init__(self, source):
+        """Store the source and configure the recognizer.
+
+        Args:
+            source: Audio source handed to the recognizer; must not be None.
+
+        Raises:
+            ValueError: If ``source`` is None.
+        """
+        # Reject a missing source before any recognizer is allocated.
+        if source is None:
+            raise ValueError("audio source can't be None")
+
+        self.source = source
+        self.recorder = Recognizer()
+        self.recorder.energy_threshold = 0
+        self.recorder.dynamic_energy_threshold = False
+        self.record_timeout = 0
+        # Listener controls; populated by recordIntoQueue(). Defined here so
+        # reading them before recording starts yields None, not AttributeError.
+        self.stop = None
+        self.pause = None
+        self.resume = None
+
+    def adjustForNoise(self):
+        """Open the source and let the recognizer calibrate to ambient noise."""
+        with self.source:
+            self.recorder.adjust_for_ambient_noise(self.source)
+
+    def recordIntoQueue(self, energy_queue):
+        """Start background energy monitoring and enqueue every reading."""
+        def recordCallback(_, energy):
+            energy_queue.put(energy)
+
+        # NOTE(review): listen_energy_in_background is not defined in this
+        # file; it is expected to return (stop, pause, resume) - confirm
+        # against the bundled speech_recognition build.
+        self.stop, self.pause, self.resume = self.recorder.listen_energy_in_background(
+            self.source,
+            recordCallback,
+        )
+
+class SelectedMicEnergyRecorder(BaseEnergyRecorder):
+    """BaseEnergyRecorder bound to a chosen microphone device."""
+
+    def __init__(self, device):
+        """Open ``device`` as a Microphone and pass it to BaseEnergyRecorder.
+
+        ``device`` is read through its "index" and "defaultSampleRate" keys.
+        """
+        microphone = Microphone(
+            device_index=device['index'],
+            sample_rate=int(device["defaultSampleRate"]),
+        )
+        super().__init__(microphone)
+        # adjustForNoise() is not run at construction; the call is left disabled.
+
+class SelectedSpeakerEnergyRecorder(BaseEnergyRecorder):
+    """BaseEnergyRecorder bound to a chosen speaker (output) device."""
+
+    def __init__(self, device):
+        """Open ``device`` through Microphone(speaker=True) and pass it on.
+
+        ``device`` is read through its "index", "defaultSampleRate" and
+        "maxInputChannels" keys. No chunk_size is passed here.
+        """
+        device_index = device["index"]
+        sample_rate = int(device["defaultSampleRate"])
+        channels = device["maxInputChannels"]
+        loopback_source = Microphone(
+            speaker=True,
+            device_index=device_index,
+            sample_rate=sample_rate,
+            channels=channels,
+        )
+        super().__init__(loopback_source)
+        # adjustForNoise() is not run at construction; the call is left disabled.
+
+class BaseEnergyAndAudioRecorder:
+    """Record audio phrases and, optionally, energy readings from one source.
+
+    Owns a ``Recognizer`` configured with the supplied energy settings and
+    starts its combined background listener on ``source`` when asked.
+    """
+
+    def __init__(self, source, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Store the source and configure the recognizer.
+
+        Args:
+            source: Audio source handed to the recognizer; must not be None.
+            energy_threshold: Value assigned to ``Recognizer.energy_threshold``.
+            dynamic_energy_threshold: Value assigned to
+                ``Recognizer.dynamic_energy_threshold``.
+            record_timeout: Passed as ``phrase_time_limit`` when listening.
+
+        Raises:
+            ValueError: If ``source`` is None.
+        """
+        # Reject a missing source before any recognizer is allocated.
+        if source is None:
+            raise ValueError("audio source can't be None")
+
+        self.source = source
+        self.recorder = Recognizer()
+        self.recorder.energy_threshold = energy_threshold
+        self.recorder.dynamic_energy_threshold = dynamic_energy_threshold
+        self.record_timeout = record_timeout
+        # Listener controls; populated by recordIntoQueue(). Defined here so
+        # reading them before recording starts yields None, not AttributeError.
+        self.stop = None
+        self.pause = None
+        self.resume = None
+
+    def adjustForNoise(self):
+        """Open the source and let the recognizer calibrate to ambient noise."""
+        with self.source:
+            self.recorder.adjust_for_ambient_noise(self.source)
+
+    def recordIntoQueue(self, audio_queue, energy_queue=None):
+        """Start background listening for audio and, optionally, energy.
+
+        Args:
+            audio_queue: Receives ``(raw_bytes, datetime)`` tuples, one per
+                captured phrase.
+            energy_queue: When not None, receives every energy reading; when
+                None, no energy callback is registered.
+        """
+        def audioRecordCallback(_, audio):
+            audio_queue.put((audio.get_raw_data(), datetime.now()))
+
+        def energyRecordCallback(energy):
+            energy_queue.put(energy)
+
+        # NOTE(review): listen_energy_and_audio_in_background is not defined in
+        # this file; it is expected to return (stop, pause, resume) - confirm
+        # against the bundled speech_recognition build.
+        self.stop, self.pause, self.resume = self.recorder.listen_energy_and_audio_in_background(
+            source=self.source,
+            callback=audioRecordCallback,
+            phrase_time_limit=self.record_timeout,
+            callback_energy=energyRecordCallback if energy_queue is not None else None)
+
+class SelectedMicEnergyAndAudioRecorder(BaseEnergyAndAudioRecorder):
+    """BaseEnergyAndAudioRecorder bound to a chosen microphone device."""
+
+    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Open ``device`` as a Microphone and pass it to the base recorder.
+
+        ``device`` is read through its "index" and "defaultSampleRate" keys.
+        """
+        microphone = Microphone(
+            device_index=device['index'],
+            sample_rate=int(device["defaultSampleRate"]),
+        )
+        super().__init__(
+            microphone,
+            energy_threshold,
+            dynamic_energy_threshold,
+            record_timeout,
+        )
+        # adjustForNoise() is not run at construction; the call is left disabled.
+
+class SelectedSpeakerEnergyAndAudioRecorder(BaseEnergyAndAudioRecorder):
+    """BaseEnergyAndAudioRecorder bound to a chosen speaker (output) device."""
+
+    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
+        """Open ``device`` through Microphone(speaker=True) and pass it on.
+
+        ``device`` is read through its "index", "defaultSampleRate" and
+        "maxInputChannels" keys.
+        """
+        device_index = device["index"]
+        sample_rate = int(device["defaultSampleRate"])
+        # NOTE(review): chunk_size is set to the result of
+        # get_sample_size(paInt16) - confirm this is the intended chunk size.
+        chunk_size = get_sample_size(paInt16)
+        channels = device["maxInputChannels"]
+        loopback_source = Microphone(
+            speaker=True,
+            device_index=device_index,
+            sample_rate=sample_rate,
+            chunk_size=chunk_size,
+            channels=channels,
+        )
+        super().__init__(
+            loopback_source,
+            energy_threshold,
+            dynamic_energy_threshold,
+            record_timeout,
+        )
+        # adjustForNoise() is not run at construction; the call is left disabled.
--- a/src-python/models/transcription/transcription_transcriber.py
+++ b/src-python/models/transcription/transcription_transcriber.py
@@ -0,0 +1,141 @@
+import time
+from io import BytesIO
+from threading import Event
+import wave
+from speech_recognition import Recognizer, AudioData, AudioFile
+from datetime import timedelta
+from pyaudiowpatch import get_sample_size, paInt16
+from .transcription_languages import transcription_lang
+from .transcription_whisper import getWhisperModel, checkWhisperWeight
+
+import torch
+import numpy as np
+from pydub import AudioSegment
+
+# Module-level defaults.
+# NOTE(review): neither constant is read inside this file's class; presumably
+# callers pass them as phrase_timeout / max_phrases - confirm.
+PHRASE_TIMEOUT = 3
+MAX_PHRASES = 10
+
+class AudioTranscriber:
+    """Turn queued audio chunks into text with Google or Whisper recognition.
+
+    Consecutive chunks are accumulated into one phrase until a gap longer than
+    ``phrase_timeout`` seconds is seen. Finished text is kept newest-first in
+    ``transcript_data``.
+    """
+
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, root=None, whisper_weight_type=None):
+        """Set up recognizer state for one audio source.
+
+        Args:
+            speaker: Stored as ``self.speaker``; does not change which
+                processing function is used (see below).
+            source: Object providing ``SAMPLE_RATE``, ``SAMPLE_WIDTH`` and
+                ``channels``.
+            phrase_timeout: Gap in seconds after which a new phrase starts.
+            max_phrases: Maximum number of phrases kept in ``transcript_data``.
+            transcription_engine: "Whisper" selects Whisper when its weights
+                pass ``checkWhisperWeight``; anything else keeps "Google".
+            root: Base directory passed to the Whisper weight helpers.
+            whisper_weight_type: Whisper weight name passed to those helpers.
+        """
+        self.speaker = speaker
+        self.phrase_timeout = phrase_timeout
+        self.max_phrases = max_phrases
+        self.transcript_data = []
+        self.transcript_changed_event = Event()
+        self.audio_recognizer = Recognizer()
+        self.transcription_engine = "Google"
+        self.whisper_model = None
+        self.audio_sources = {
+                "sample_rate": source.SAMPLE_RATE,
+                "sample_width": source.SAMPLE_WIDTH,
+                "channels": source.channels,
+                "last_sample": bytes(),
+                "last_spoken": None,
+                "new_phrase": True,
+                # The original conditional picked processSpeakerData in both
+                # branches, so it is used regardless of `speaker`.
+                # NOTE(review): processMicData is therefore never selected -
+                # confirm that is intended.
+                "process_data_func": self.processSpeakerData,
+        }
+
+        # Whisper is only used when requested and its weights load; otherwise
+        # the Google engine set above stays in effect.
+        if transcription_engine == "Whisper" and checkWhisperWeight(root, whisper_weight_type) is True:
+            self.whisper_model = getWhisperModel(root, whisper_weight_type)
+            self.transcription_engine = "Whisper"
+
+    def transcribeAudioQueue(self, audio_queue, language, country, avg_logprob=-0.8, no_speech_prob=0.6):
+        """Transcribe the next queued chunk, if any.
+
+        Args:
+            audio_queue: Queue of ``(raw_bytes, time_spoken)`` tuples.
+            language: First-level key into ``transcription_lang``.
+            country: Second-level key into ``transcription_lang``.
+            avg_logprob: Whisper segments scoring below this are dropped.
+            no_speech_prob: Whisper segments scoring above this are dropped.
+
+        Returns:
+            False when the queue was empty (after a 10 ms sleep), True when a
+            chunk was consumed, whether or not any text was produced.
+        """
+        if audio_queue.empty():
+            time.sleep(0.01)
+            return False
+        audio, time_spoken = audio_queue.get()
+        self.updateLastSampleAndPhraseStatus(audio, time_spoken)
+
+        text = ''
+        try:
+            audio_data = self.audio_sources["process_data_func"]()
+            match self.transcription_engine:
+                case "Google":
+                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine])
+                case "Whisper":
+                    # Resample to 16 kHz / 16-bit and scale to float32 in [-1, 1).
+                    audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
+                    # NOTE(review): the thresholds below are fixed and do not
+                    # follow the avg_logprob / no_speech_prob arguments, which
+                    # are only applied to the per-segment filter.
+                    segments, _ = self.whisper_model.transcribe(
+                        audio_data,
+                        beam_size=5,
+                        temperature=0.0,
+                        log_prob_threshold=-0.8,
+                        no_speech_threshold=0.6,
+                        language=transcription_lang[language][country][self.transcription_engine],
+                        word_timestamps=False,
+                        without_timestamps=True,
+                        task="transcribe",
+                        vad_filter=False,
+                        )
+                    for s in segments:
+                        if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
+                            continue
+                        text += s.text
+
+        except Exception:
+            # Best effort: any recognition failure (including an unknown
+            # language/country key) leaves whatever text was gathered so far
+            # and never propagates to the caller.
+            pass
+
+        if text != '':
+            self.updateTranscript(text)
+        return True
+
+    def updateLastSampleAndPhraseStatus(self, data, time_spoken):
+        """Append ``data`` to the current phrase, starting a new one on a gap.
+
+        A new phrase starts only when a previous chunk exists and more than
+        ``phrase_timeout`` seconds have passed since it; the very first chunk
+        therefore leaves ``new_phrase`` False.
+        """
+        source_info = self.audio_sources
+        if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=self.phrase_timeout):
+            source_info["last_sample"] = bytes()
+            source_info["new_phrase"] = True
+        else:
+            source_info["new_phrase"] = False
+
+        source_info["last_sample"] += data
+        source_info["last_spoken"] = time_spoken
+
+    def processMicData(self):
+        """Wrap the accumulated bytes in an AudioData without re-encoding."""
+        audio_data = AudioData(self.audio_sources["last_sample"], self.audio_sources["sample_rate"], self.audio_sources["sample_width"])
+        return audio_data
+
+    def processSpeakerData(self):
+        """Encode the accumulated bytes as WAV in memory and read them back.
+
+        Sources with more than two channels are mixed down to mono first.
+        """
+        temp_file = BytesIO()
+        with wave.open(temp_file, 'wb') as wf:
+            wf.setnchannels(self.audio_sources["channels"])
+            wf.setsampwidth(get_sample_size(paInt16))
+            wf.setframerate(self.audio_sources["sample_rate"])
+            wf.writeframes(self.audio_sources["last_sample"])
+        temp_file.seek(0)
+
+        if self.audio_sources["channels"] > 2:
+            audio = AudioSegment.from_file(temp_file, format="wav")
+            mono_audio = audio.set_channels(1)
+            temp_file = BytesIO()
+            mono_audio.export(temp_file, format="wav")
+            temp_file.seek(0)
+
+        with AudioFile(temp_file) as source:
+            audio = self.audio_recognizer.record(source)
+        return audio
+
+    def updateTranscript(self, text):
+        """Record ``text`` as a new phrase or as a revision of the newest one.
+
+        When a new phrase starts (or nothing is stored yet) the text is
+        inserted at the front and the oldest entries are dropped so that at
+        most ``max_phrases`` entries remain (always at least one). Otherwise
+        the newest entry is overwritten.
+        """
+        source_info = self.audio_sources
+        transcript = self.transcript_data
+
+        if source_info["new_phrase"] or len(transcript) == 0:
+            # Trim before inserting so the list never exceeds the limit; the
+            # previous `len > max_phrases` check let it hold one entry too many.
+            limit = max(1, self.max_phrases)
+            del transcript[limit - 1:]
+            transcript.insert(0, text)
+        else:
+            transcript[0] = text
+
+    def getTranscript(self):
+        """Pop and return the oldest stored phrase, or "" when none remain."""
+        if len(self.transcript_data) > 0:
+            text = self.transcript_data.pop(-1)
+        else:
+            text = ""
+        return text
+
+    def clearTranscriptData(self):
+        """Drop all stored phrases and reset the phrase accumulator."""
+        self.transcript_data.clear()
+        self.audio_sources["last_sample"] = bytes()
+        self.audio_sources["new_phrase"] = True
--- a/src-python/models/transcription/transcription_utils.py
+++ b/src-python/models/transcription/transcription_utils.py
@@ -0,0 +1,70 @@
+from pyaudiowpatch import PyAudio, paWASAPI
+
+def getInputDevices():
+    """Return non-loopback input devices grouped by host API name.
+
+    Returns:
+        A dict mapping each host API name to the list of its device-info
+        dicts that have input channels and are not loopback devices. When no
+        device qualifies, ``{"NoHost": [{"name": "NoDevice"}]}`` is returned.
+    """
+    devices_by_host = {}
+    with PyAudio() as audio:
+        for host_index in range(audio.get_host_api_count()):
+            host = audio.get_host_api_info_by_index(host_index)
+            for device_index in range(host['deviceCount']):
+                device = audio.get_device_info_by_host_api_device_index(host_index, device_index)
+                if device["maxInputChannels"] <= 0:
+                    continue
+                if device["isLoopbackDevice"] is not False:
+                    continue
+                devices_by_host.setdefault(host["name"], []).append(device)
+    if not devices_by_host:
+        return {"NoHost": [{"name": "NoDevice"}]}
+    return devices_by_host
+
+def getDefaultInputDevice():
+    """Return the default host API's default input device and its host.
+
+    Returns:
+        ``{"host": host_info, "device": device_info}`` for the first device
+        whose "index" equals the default host API's "defaultInputDevice", or
+        a placeholder with the names "NoHost" / "NoDevice" when none matches.
+    """
+    with PyAudio() as audio:
+        wanted_index = audio.get_default_host_api_info()["defaultInputDevice"]
+
+        for host_index in range(audio.get_host_api_count()):
+            host = audio.get_host_api_info_by_index(host_index)
+            for device_index in range(host['deviceCount']):
+                device = audio.get_device_info_by_host_api_device_index(host_index, device_index)
+                if device["index"] != wanted_index:
+                    continue
+                return {"host": host, "device": device}
+    return {"host": {"name": "NoHost"}, "device": {"name": "NoDevice"}}
+
+def getOutputDevices():
+    """Return the loopback devices that correspond to WASAPI output devices.
+
+    For every non-loopback device of the WASAPI host API, each loopback device
+    whose name contains that device's name is collected. Duplicates are
+    removed while keeping first-seen order, so the result is stable between
+    runs.
+
+    Returns:
+        A list of loopback device-info dicts, or ``[{"name": "NoDevice"}]``
+        when nothing matched.
+    """
+    matches = []
+    with PyAudio() as p:
+        wasapi_info = p.get_host_api_info_by_type(paWASAPI)
+        # Enumerate loopback devices once instead of once per output device.
+        loopbacks = list(p.get_loopback_device_info_generator())
+        for host_index in range(p.get_host_api_count()):
+            host = p.get_host_api_info_by_index(host_index)
+            if host["name"] != wasapi_info["name"]:
+                continue
+            for device_index in range(host['deviceCount']):
+                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
+                if device["isLoopbackDevice"]:
+                    continue
+                matches.extend(
+                    loopback for loopback in loopbacks
+                    if device["name"] in loopback["name"]
+                )
+
+    if not matches:
+        return [{"name": "NoDevice"}]
+
+    # Order-preserving de-duplication; a plain set of tuples would make the
+    # returned order depend on hashing.
+    seen = set()
+    devices = []
+    for loopback in matches:
+        key = tuple(loopback.items())
+        if key not in seen:
+            seen.add(key)
+            devices.append(loopback)
+    return devices
+
+def getDefaultOutputDevice():
+    """Return the loopback device matching WASAPI's default output device.
+
+    Returns:
+        ``{"device": loopback_info}`` for the first loopback device whose name
+        contains the default output device's name, or
+        ``{"device": {"name": "NoDevice"}}`` when there is no such match.
+    """
+    with PyAudio() as audio:
+        wanted_index = audio.get_host_api_info_by_type(paWASAPI)["defaultOutputDevice"]
+
+        for host_index in range(audio.get_host_api_count()):
+            device_count = audio.get_host_api_info_by_index(host_index)['deviceCount']
+            for device_index in range(device_count):
+                speakers = audio.get_device_info_by_host_api_device_index(host_index, device_index)
+                if speakers["index"] != wanted_index:
+                    continue
+                if speakers["isLoopbackDevice"]:
+                    continue
+                for loopback in audio.get_loopback_device_info_generator():
+                    if speakers["name"] in loopback["name"]:
+                        return {"device": loopback}
+    return {"device": {"name": "NoDevice"}}
+
+if __name__ == "__main__":
+    # Manual check: print what the two output-device helpers return.
+    for label, probe in (
+        ("getOutputDevices()", getOutputDevices),
+        ("getDefaultOutputDevice()", getDefaultOutputDevice),
+    ):
+        print(label, probe())
--- a/src-python/models/transcription/transcription_whisper.py
+++ b/src-python/models/transcription/transcription_whisper.py
@@ -0,0 +1,98 @@
+from os import path as os_path, makedirs as os_makedirs
+from requests import get as requests_get
+from typing import Callable
+import huggingface_hub
+from faster_whisper import WhisperModel
+import logging
+logger = logging.getLogger('faster_whisper')
+logger.setLevel(logging.CRITICAL)
+
+# Weight name -> Hugging Face repository id; every id is the weight name
+# prefixed with "Systran/faster-whisper-".
+_MODELS = {
+    weight_type: "Systran/faster-whisper-" + weight_type
+    for weight_type in (
+        "tiny",
+        "base",
+        "small",
+        "medium",
+        "large-v1",
+        "large-v2",
+        "large-v3",
+    )
+}
+
+# Files requested from each repository when downloading weights.
+_FILENAMES = list((
+    "config.json",
+    "preprocessor_config.json",
+    "model.bin",
+    "tokenizer.json",
+    "vocabulary.txt",
+    "vocabulary.json",
+))
+
+def downloadFile(url, path, func=None):
+    """Stream ``url`` into the file at ``path``, optionally reporting progress.
+
+    Args:
+        url: Address to download.
+        path: Destination file path; opened for binary writing.
+        func: Optional callable invoked after each chunk with the fraction
+            written so far (bytes written / ``content-length``). It is not
+            called when the server sends no usable ``content-length``.
+
+    Returns:
+        None. Failures are not raised; they are printed and the function
+        returns, so a partially written file may be left behind.
+    """
+    try:
+        # The context manager closes the streamed response on every path.
+        with requests_get(url, stream=True) as res:
+            res.raise_for_status()
+            file_size = int(res.headers.get('content-length', 0))
+            # Without a positive size the ratio would divide by zero, which
+            # used to abort the download after the first chunk.
+            report_progress = callable(func) and file_size > 0
+            total_chunk = 0
+            with open(path, 'wb') as file:
+                for chunk in res.iter_content(chunk_size=1024*5):
+                    file.write(chunk)
+                    if report_progress:
+                        total_chunk += len(chunk)
+                        func(total_chunk/file_size)
+
+    except Exception as e:
+        print("error:downloadFile()", e)
+
+def checkWhisperWeight(root, weight_type):
+    """Report whether the local Whisper weights can be loaded.
+
+    The check loads the model through ``getWhisperModel`` so both functions
+    always use the same path and loader settings.
+
+    Args:
+        root: Base directory that contains ``weights/whisper/<weight_type>``.
+        weight_type: Name of the weight directory to check.
+
+    Returns:
+        True when the model loads, False on any failure. This includes
+        arguments that cannot form a path (for example ``root=None``), which
+        previously raised TypeError instead of returning False.
+    """
+    try:
+        getWhisperModel(root, weight_type)
+    except Exception:
+        # Any load problem means "weights not usable"; callers only need a bool.
+        return False
+    return True
+
+def downloadWhisperWeight(root, weight_type, callbackFunc):
+    """Download the files for ``weight_type`` unless usable weights exist.
+
+    The target directory ``<root>/weights/whisper/<weight_type>`` is created
+    first. If ``checkWhisperWeight`` already succeeds nothing is downloaded;
+    otherwise every name in ``_FILENAMES`` is fetched with ``downloadFile``,
+    passing ``callbackFunc`` as its progress callback.
+    """
+    target_dir = os_path.join(root, "weights", "whisper", weight_type)
+    os_makedirs(target_dir, exist_ok=True)
+    already_present = checkWhisperWeight(root, weight_type) is True
+    if not already_present:
+        for filename in _FILENAMES:
+            print("Downloading", filename, "...")
+            destination = os_path.join(target_dir, filename)
+            source_url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename)
+            downloadFile(source_url, destination, func=callbackFunc)
+
+def getWhisperModel(root, weight_type):
+    """Load the Whisper model stored under ``<root>/weights/whisper/<weight_type>``.
+
+    The model is created on the CPU with int8 compute, four CPU threads, one
+    worker and ``local_files_only=True``.
+    """
+    model_dir = os_path.join(root, "weights", "whisper", weight_type)
+    loader_options = {
+        "device": "cpu",
+        "device_index": 0,
+        "compute_type": "int8",
+        "cpu_threads": 4,
+        "num_workers": 1,
+        "local_files_only": True,
+    }
+    return WhisperModel(model_dir, **loader_options)
+
+if __name__ == "__main__":
+    def callback(value):
+        # Progress callback for manual runs: echo each reported value.
+        print(value)
+
+    # Fetch every weight set into the current directory, smallest name first.
+    for weight_type in ("tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3"):
+        downloadWhisperWeight("./", weight_type, callback)