[WIP/TEST] faster-whisperが最低限動く形で実装

config.jsonで設定変更で実行可能
2024-01-30 18:21:55 +09:00
parent ba12e39bbc
commit 9cd1831ecb
8 changed files with 511 additions and 137 deletions
--- a/models/transcription/transcription_languages.py
+++ b/models/transcription/transcription_languages.py
@@ -1,177 +1,438 @@
 transcription_lang = {
    "Afrikaans":{
-        "South Africa":"af-ZA",
+        "South Africa":{
+            "Google": "af-ZA",
+            "Whisper": "af",
+        },
    },
    "Arabic":{
-        "Algeria":"ar-DZ",
-        "Bahrain":"ar-BH",
-        "Egypt":"ar-EG",
-        "Israel":"ar-IL",
-        "Iraq":"ar-IQ",
-        "Jordan":"ar-JO",
-        "Kuwait":"ar-KW",
-        "Lebanon":"ar-LB",
-        "Morocco":"ar-MA",
-        "Oman":"ar-OM",
-        "State of Palestine":"ar-PS",
-        "Qatar":"ar-QA",
-        "Saudi Arabia":"ar-SA",
-        "Tunisia":"ar-TN",
-        "United Arab Emirates":"ar-AE",
+        "Algeria":{
+            "Google": "ar-DZ",
+            "Whisper": "ar",
+        },
+        "Bahrain":{
+            "Google": "ar-BH",
+            "Whisper": "ar",
+        },
+        "Egypt":{
+            "Google": "ar-EG",
+            "Whisper": "ar",
+        },
+        "Israel":{
+            "Google": "ar-IL",
+            "Whisper": "ar",
+        },
+        "Iraq":{
+            "Google": "ar-IQ",
+            "Whisper": "ar",
+        },
+        "Jordan":{
+            "Google": "ar-JO",
+            "Whisper": "ar",
+        },
+        "Kuwait":{
+            "Google": "ar-KW",
+            "Whisper": "ar",
+        },
+        "Lebanon":{
+            "Google": "ar-LB",
+            "Whisper": "ar",
+        },
+        "Morocco":{
+            "Google": "ar-MA",
+            "Whisper": "ar",
+        },
+        "Oman":{
+            "Google": "ar-OM",
+            "Whisper": "ar",
+        },
+        "State of Palestine":{
+            "Google": "ar-PS",
+            "Whisper": "ar",
+        },
+        "Qatar":{
+            "Google": "ar-QA",
+            "Whisper": "ar",
+        },
+        "Saudi Arabia":{
+            "Google": "ar-SA",
+            "Whisper": "ar",
+        },
+        "Tunisia":{
+            "Google": "ar-TN",
+            "Whisper": "ar",
+        },
+        "United Arab Emirates":{
+            "Google": "ar-AE",
+            "Whisper": "ar",
+        },
    },
    "Basque":{
-        "Spain":"eu-ES",
+        "Spain":{
+            "Google": "eu-ES",
+            "Whisper": "eu",
+        },
    },
    "Bulgarian":{
-        "Bulgaria":"bg-BG",
+        "Bulgaria":{
+            "Google": "bg-BG",
+            "Whisper": "bg",
+        },
    },
    "Catalan":{
-        "Spain":"ca-ES",
+        "Spain":{
+            "Google": "ca-ES",
+            "Whisper": "ca",
+        },
    },
    "Chinese":{
-        "Mandarin (Simplified, China)":"cmn-Hans-CN",
-        "Mandarin (Simplified, Hong Kong)":"cmn-Hans-HK",
-        "Mandarin (Traditional, Taiwan)":"cmn-Hant-TW",
-        "Cantonese (Traditional Hong Kong)":"yue-Hant-HK",
+        "Mandarin (Simplified, China)":{
+            "Google": "cmn-Hans-CN",
+            "Whisper": "zh",
+        },
+        "Mandarin (Simplified, Hong Kong)":{
+            "Google": "cmn-Hans-HK",
+            "Whisper": "zh",
+        },
+        "Mandarin (Traditional, Taiwan)":{
+            "Google": "cmn-Hant-TW",
+            "Whisper": "zh",
+        },
+        "Cantonese (Traditional Hong Kong)":{
+            "Google": "yue-Hant-HK",
+            "Whisper": "yue",
+        },
    },
    "Croatian":{
-        "Croatia":"hr-HR",
+        "Croatia":{
+            "Google": "hr-HR",
+            "Whisper": "hr",
+        },
    },
    "Czech":{
-        "Czech Republic":"cs-CZ",
+        "Czech Republic":{
+            "Google": "cs-CZ",
+            "Whisper": "cs",
+        },
    },
    "Danish":{
-        "Denmark":"da-DK",
+        "Denmark":{
+            "Google": "da-DK",
+            "Whisper": "da",
+        },
    },
    "Dutch":{
-        "Netherlands":"nl-NL",
+        "Netherlands":{
+            "Google": "nl-NL",
+            "Whisper": "nl",
+        },
    },
    "English": {
-        "United States":"en-US",
-        "United Kingdom":"en-GB",
-        "Australia":"en-AU",
-        "Canada":"en-CA",
-        "India":"en-IN",
-        "Ireland":"en-IE",
-        "New Zealand":"en-NZ",
-        "Philippines":"en-PH",
-        "South Africa":"en-ZA",
+        "United States":{
+            "Google": "en-US",
+            "Whisper": "en",
+        },
+        "United Kingdom":{
+            "Google": "en-GB",
+            "Whisper": "en",
+        },
+        "Australia":{
+            "Google": "en-AU",
+            "Whisper": "en",
+        },
+        "Canada":{
+            "Google": "en-CA",
+            "Whisper": "en",
+        },
+        "India":{
+            "Google": "en-IN",
+            "Whisper": "en",
+        },
+        "Ireland":{
+            "Google": "en-IE",
+            "Whisper": "en",
+        },
+        "New Zealand":{
+            "Google": "en-NZ",
+            "Whisper": "en",
+        },
+        "Philippines":{
+            "Google": "en-PH",
+            "Whisper": "en",
+        },
+        "South Africa":{
+            "Google": "en-ZA",
+            "Whisper": "en",
+        },
    },
    "Filipino":{
-        "Philippines":"fil-PH",
+        "Philippines":{
+            "Google": "fil-PH",
+            "Whisper": "tl",
+        },
    },
    "Finnish":{
-        "Finland":"fi-FI",
+        "Finland":{
+            "Google": "fi-FI",
+            "Whisper": "fi",
+        },
    },
    "French":{
-        "France":"fr-FR",
+        "France":{
+            "Google": "fr-FR",
+            "Whisper": "fr",
+        },
    },
    "Galician":{
-        "Spain":"gl-ES",
+        "Spain":{
+            "Google": "gl-ES",
+            "Whisper": "gl",
+        },
    },
    "German":{
-        "Germany":"de-DE",
+        "Germany":{
+            "Google": "de-DE",
+            "Whisper": "de",
+        },
    },
    "Greek":{
-        "Greece":"el-GR",
+        "Greece":{
+            "Google": "el-GR",
+            "Whisper": "el",
+        },
    },
    "Hebrew":{
-        "Israel":"he-IL",
+        "Israel":{
+            "Google": "he-IL",
+            "Whisper": "he",
+        },
    },
    "Hindi": {
-        "India":"hi-IN",
+        "India":{
+            "Google": "hi-IN",
+            "Whisper": "hi",
+        },
    },
    "Hungarian":{
-        "Hungary":"hu-HU",
+        "Hungary":{
+            "Google": "hu-HU",
+            "Whisper": "hu",
+        },
    },
    "Indonesian":{
-        "Indonesia":"id-ID",
+        "Indonesia":{
+            "Google": "id-ID",
+            "Whisper": "id",
+        },
    },
    "Icelandic":{
-        "Iceland":"is-IS",
+        "Iceland":{
+            "Google": "is-IS",
+            "Whisper": "is",
+        },
    },
    "Italian":{
-        "Italy":"it-IT",
-        "Switzerland":"it-CH",
+        "Italy":{
+            "Google": "it-IT",
+            "Whisper": "it",
+        },
+        "Switzerland":{
+            "Google": "it-CH",
+            "Whisper": "it",
+        },
    },
    "Japanese":{
-        "Japan":"ja-JP",
+        "Japan":{
+            "Google": "ja-JP",
+            "Whisper": "ja",
+        },
    },
    "Korean":{
-        "South Korea":"ko-KR",
+        "South Korea":{
+            "Google": "ko-KR",
+            "Whisper": "ko",
+        },
    },
    "Lithuanian":{
-        "Lithuania":"lt-LT",
+        "Lithuania":{
+            "Google": "lt-LT",
+            "Whisper": "lt",
+        },
    },
    "Malay":{
-        "Malaysia":"ms-MY",
+        "Malaysia":{
+            "Google": "ms-MY",
+            "Whisper": "ms",
+        },
    },
    "Norwegian":{
-        "Norway":"nb-NO",
+        "Norway":{
+            "Google": "nb-NO",
+            "Whisper": "no",
+        },
    },
    "Persian":{
-        "Iran":"fa-IR",
+        "Iran":{
+            "Google": "fa-IR",
+            "Whisper": "fa",
+        },
    },
    "Polish":{
-        "Poland":"pl-PL",
+        "Poland":{
+            "Google": "pl-PL",
+            "Whisper": "pl",
+        },
    },
    "Portuguese":{
-        "Brazil":"pt-BR",
-        "Portugal":"pt-PT",
+        "Brazil":{
+            "Google": "pt-BR",
+            "Whisper": "pt",
+        },
+        "Portugal":{
+            "Google": "pt-PT",
+            "Whisper": "pt",
+        },
    },
    "Romanian":{
-        "Romania":"ro-RO",
+        "Romania":{
+            "Google": "ro-RO",
+            "Whisper": "ro",
+        },
    },
    "Russian":{
-        "Russia":"ru-RU",
+        "Russia":{
+            "Google": "ru-RU",
+            "Whisper": "ru",
+        },
    },
    "Serbian":{
-        "Serbia":"sr-RS",
+        "Serbia":{
+            "Google": "sr-RS",
+            "Whisper": "sr",
+        },
    },
    "Slovak":{
-        "Slovakia":"sk-SK",
+        "Slovakia":{
+            "Google": "sk-SK",
+            "Whisper": "sk",
+        },
    },
    "Slovenian":{
-        "Slovenia":"sl-SI",
+        "Slovenia":{
+            "Google": "sl-SI",
+            "Whisper": "sl",
+        },
    },
    "Spanish":{
-        "Argentina":"es-AR",
-        "Bolivia":"es-BO",
-        "Chile":"es-CL",
-        "Colombia":"es-CO",
-        "Costa Rica":"es-CR",
-        "Dominican Republic":"es-DO",
-        "Ecuador":"es-EC",
-        "El Salvador":"es-SV",
-        "Guatemala":"es-GT",
-        "Honduras":"es-HN",
-        "Mexico":"es-MX",
-        "Nicaragua":"es-NI",
-        "Panama":"es-PA",
-        "Paraguay":"es-PY",
-        "Peru":"es-PE",
-        "Puerto Rico":"es-PR",
-        "Spain":"es-ES",
-        "Uruguay":"es-UY",
-        "United States":"es-US",
-        "Venezuela":"es-VE",
+        "Argentina":{
+            "Google": "es-AR",
+            "Whisper": "es",
+        },
+        "Bolivia":{
+            "Google": "es-BO",
+            "Whisper": "es",
+        },
+        "Chile":{
+            "Google": "es-CL",
+            "Whisper": "es",
+        },
+        "Colombia":{
+            "Google": "es-CO",
+            "Whisper": "es",
+        },
+        "Costa Rica":{
+            "Google": "es-CR",
+            "Whisper": "es",
+        },
+        "Dominican Republic":{
+            "Google": "es-DO",
+            "Whisper": "es",
+        },
+        "Ecuador":{
+            "Google": "es-EC",
+            "Whisper": "es",
+        },
+        "El Salvador":{
+            "Google": "es-SV",
+            "Whisper": "es",
+        },
+        "Guatemala":{
+            "Google": "es-GT",
+            "Whisper": "es",
+        },
+        "Honduras":{
+            "Google": "es-HN",
+            "Whisper": "es",
+        },
+        "Mexico":{
+            "Google": "es-MX",
+            "Whisper": "es",
+        },
+        "Nicaragua":{
+            "Google": "es-NI",
+            "Whisper": "es",
+        },
+        "Panama":{
+            "Google": "es-PA",
+            "Whisper": "es",
+        },
+        "Paraguay":{
+            "Google": "es-PY",
+            "Whisper": "es",
+        },
+        "Peru":{
+            "Google": "es-PE",
+            "Whisper": "es",
+        },
+        "Puerto Rico":{
+            "Google": "es-PR",
+            "Whisper": "es",
+        },
+        "Spain":{
+            "Google": "es-ES",
+            "Whisper": "es",
+        },
+        "Uruguay":{
+            "Google": "es-UY",
+            "Whisper": "es",
+        },
+        "United States":{
+            "Google": "es-US",
+            "Whisper": "es",
+        },
+        "Venezuela":{
+            "Google": "es-VE",
+            "Whisper": "es",
+        },
    },
    "Swedish":{
-        "Sweden":"sv-SE",
+        "Sweden":{
+            "Google": "sv-SE",
+            "Whisper": "sv",
+        },
    },
    "Thai":{
-        "Thailand":"th-TH",
+        "Thailand":{
+            "Google": "th-TH",
+            "Whisper": "th",
+        },
    },
    "Turkish":{
-        "Turkey":"tr-TR",
+        "Turkey":{
+            "Google": "tr-TR",
+            "Whisper": "tr",
+        },
    },
    "Ukrainian":{
-        "Ukraine":"uk-UA",
+        "Ukraine":{
+            "Google": "uk-UA",
+            "Whisper": "uk",
+        },
    },
    "Vietnamese":{
-        "Vietnam":"vi-VN",
-    },
-    "Zulu":{
-        "South Africa":"zu-ZA"
+        "Vietnam":{
+            "Google": "vi-VN",
+            "Whisper": "vi",
+        },
    },
 }
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10

 class AudioTranscriber:
-    def __init__(self, speaker, source, phrase_timeout, max_phrases):
+    def __init__(self, speaker, source, phrase_timeout, max_phrases, whisper_enabled, whisper_weight_type, whisper_weight_path):
        self.speaker = speaker
        self.phrase_timeout = phrase_timeout
        self.max_phrases = max_phrases
@@ -30,47 +30,59 @@ class AudioTranscriber:
                "new_phrase": True,
                "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
        }
-        self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1)
+        if whisper_enabled is True:
+            self.whisper_model = WhisperModel(
+                model_size_or_path=whisper_weight_type,
+                device="cpu",
+                device_index=0,
+                compute_type="int8",
+                cpu_threads=4,
+                num_workers=1,
+                download_root=whisper_weight_path)
+        else:
+            self.whisper_model = None

-    def transcribeAudioQueue(self, audio_queue, language, country):
+    def transcribeAudioQueue(self, recognizer, audio_queue, language, country):
        # while True:
        audio, time_spoken = audio_queue.get()
        self.updateLastSampleAndPhraseStatus(audio, time_spoken)

        text = ''
        try:
-            # fd, path = tempfile.mkstemp(suffix=".wav")
-            # os.close(fd)
-            audio_data = self.audio_sources["process_data_func"]()
-            text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country])
+            # Whisperが使用できない場合はGoogle Speech-to-Textを使用する
+            if recognizer == "Whisper":
+                if self.whisper_model is None:
+                    recognizer = "Google"

-            audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
-            if isinstance(audio_data, torch.Tensor):
-                audio_data = audio_data.detach().numpy()
-            segments, _ = self.whisper_model.transcribe(
-                audio_data,
-                beam_size=5,
-                temperature=0.0,
-                log_prob_threshold=-0.8,
-                no_speech_threshold=0.6,
-                language="ja",
-                word_timestamps=False,
-                without_timestamps=True,
-                task="transcribe",
-                vad_filter=False,
-                )
-            _text = ""
-            for s in segments:
-                if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
-                    continue
-                _text += s.text
-            print(_text)
+            audio_data = self.audio_sources["process_data_func"]()
+            match recognizer:
+                case "Google":
+                    text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][recognizer])
+                case "Whisper":
+                    audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
+                    if isinstance(audio_data, torch.Tensor):
+                        audio_data = audio_data.detach().numpy()
+                    segments, _ = self.whisper_model.transcribe(
+                        audio_data,
+                        beam_size=5,
+                        temperature=0.0,
+                        log_prob_threshold=-0.8,
+                        no_speech_threshold=0.6,
+                        language=transcription_lang[language][country][recognizer],
+                        word_timestamps=False,
+                        without_timestamps=True,
+                        task="transcribe",
+                        vad_filter=False,
+                        )
+                    for s in segments:
+                        if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
+                            continue
+                        text += s.text

        except Exception:
            pass
        finally:
            pass
-            # os.unlink(path)

        if text != '':
            self.updateTranscript(text)
--- a/models/transcription/transcription_utils.py
+++ b/models/transcription/transcription_utils.py
@@ -1,4 +1,8 @@
 from pyaudiowpatch import PyAudio, paWASAPI
+from faster_whisper.utils import download_model
+import logging
+logger = logging.getLogger('faster_whisper')
+logger.setLevel(logging.CRITICAL)

 def getInputDevices():
    devices = {}
@@ -44,4 +48,38 @@ def getDefaultOutputDevice():
                            if default_speakers["name"] in loopback["name"]:
                                default_device = loopback
                                return default_device
-    return {"name":"NoDevice"}
+    return {"name":"NoDevice"}
+
+def downloadWhisperWeight(weight_type, path):
+    result = False
+    try:
+        download_model(
+            weight_type,
+            cache_dir=path)
+        result = True
+    except Exception:
+        pass
+    return result
+
+def checkWhisperWeight(weight_type, path):
+    result = False
+    try:
+        result = download_model(
+            weight_type,
+            local_files_only=True,
+            cache_dir=path)
+        result = True
+    except Exception:
+        pass
+    return result
+
+if __name__ == "__main__":
+
+
+    downloadWhisperWeight("base", "./weight/whisper/")
+
+    from faster_whisper import WhisperModel
+    whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1, download_root="./weight/whisper/")
+
+    print(checkWhisperWeight("base", "./weight/whisper/"))
+    print(checkWhisperWeight("tiny", "./weight/whisper/"))