diff --git a/config.py b/config.py index ff1f7263..55ba2d40 100644 --- a/config.py +++ b/config.py @@ -812,9 +812,6 @@ class Config: "large-v1": "large-v1", "large-v2": "large-v2", "large-v3": "large-v3", - "distil-small": "distil-small", - "distil-medium": "distil-medium", - "distil-large-v2": "distil-large-v2", } self._MAX_MIC_ENERGY_THRESHOLD = 2000 @@ -895,7 +892,7 @@ class Config: } self._USE_TRANSLATION_FEATURE = True self._CTRANSLATE2_WEIGHT_TYPE = "Small" - self._USE_WHISPER_FEATURE = True + self._USE_WHISPER_FEATURE = False self._WHISPER_WEIGHT_TYPE = "base" self._SEND_MESSAGE_FORMAT = "[message]" self._SEND_MESSAGE_FORMAT_WITH_T = "[message]([translation])" diff --git a/locales/en.yml b/locales/en.yml index c799c9d0..f68aa32c 100644 --- a/locales/en.yml +++ b/locales/en.yml @@ -140,9 +140,6 @@ config_window: large_v1: "large_v1 model (%{capacity})" large_v2: "large_v2 model (%{capacity})" large_v3: "large_v3 model (%{capacity})" - distil_small: "distil-small model (%{capacity})" - distil_medium: "distil-medium model (%{capacity})" - distil_large_v2: "distil-large-v2 model (%{capacity})" deepl_auth_key: label: DeepL Auth Key diff --git a/model.py b/model.py index 228bc253..5b17e167 100644 --- a/model.py +++ b/model.py @@ -339,12 +339,11 @@ class Model: source=self.mic_audio_recorder.source, phrase_timeout=phase_timeout, max_phrases=config.INPUT_MIC_MAX_PHRASES, - transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, - whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, ) def sendMicTranscript(): - mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + mic_transcriber.transcribeAudioQueue(mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE) message = mic_transcriber.getTranscript() try: fnc(message) @@ -423,12 +422,11 @@ class Model: source=self.speaker_audio_recorder.source, 
phrase_timeout=phase_timeout, max_phrases=config.INPUT_SPEAKER_MAX_PHRASES, - transcription_engine=config.SELECTED_TRANSCRIPTION_ENGINE, - whisper_weight_type=config.WHISPER_WEIGHT_TYPE, root=config.PATH_LOCAL, + whisper_weight_type=config.WHISPER_WEIGHT_TYPE, ) def sendSpeakerTranscript(): - speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY, config.SELECTED_TRANSCRIPTION_ENGINE) message = speaker_transcriber.getTranscript() try: fnc(message) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index b24d3163..08cc6a1a 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -5,7 +5,7 @@ from speech_recognition import Recognizer, AudioData, AudioFile from datetime import timedelta from pyaudiowpatch import get_sample_size, paInt16 from .transcription_languages import transcription_lang -from .transcription_whisper import getWhisperModel +from .transcription_whisper import getWhisperModel, checkWhisperWeight import torch import numpy as np @@ -14,7 +14,7 @@ PHRASE_TIMEOUT = 3 MAX_PHRASES = 10 class AudioTranscriber: - def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, whisper_weight_type=None, root=None): + def __init__(self, speaker, source, phrase_timeout, max_phrases, root=None, whisper_weight_type=None, ): self.speaker = speaker self.phrase_timeout = phrase_timeout self.max_phrases = max_phrases @@ -30,34 +30,37 @@ class AudioTranscriber: "new_phrase": True, "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData } - self.transcription_engine = transcription_engine - match self.transcription_engine: - case "Google": - self.audio_recognizer = Recognizer() - case "Whisper": - self.audio_recognizer = 
getWhisperModel(root, whisper_weight_type) + if whisper_weight_type is not None and root is not None and checkWhisperWeight(root, whisper_weight_type) is True: + self.whisper_model = getWhisperModel(root, whisper_weight_type) + else: + self.whisper_model = None - def transcribeAudioQueue(self, audio_queue, language, country): + def transcribeAudioQueue(self, audio_queue, language, country, transcription_engine): audio, time_spoken = audio_queue.get() self.updateLastSampleAndPhraseStatus(audio, time_spoken) text = '' try: + # Fall back to Google Speech-to-Text when Whisper is unavailable + if transcription_engine == "Whisper": + if self.whisper_model is None: + transcription_engine = "Google" + audio_data = self.audio_sources["process_data_func"]() - match self.transcription_engine: + match transcription_engine: case "Google": - text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine]) + text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][transcription_engine]) case "Whisper": audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0 if isinstance(audio_data, torch.Tensor): audio_data = audio_data.detach().numpy() - segments, _ = self.audio_recognizer.transcribe( + segments, _ = self.whisper_model.transcribe( audio_data, beam_size=5, temperature=0.0, log_prob_threshold=-0.8, no_speech_threshold=0.6, - language=transcription_lang[language][country][self.transcription_engine], + language=transcription_lang[language][country][transcription_engine], word_timestamps=False, without_timestamps=True, task="transcribe", diff --git a/models/transcription/transcription_whisper.py b/models/transcription/transcription_whisper.py index 148b2edb..c6412d35 100644 --- a/models/transcription/transcription_whisper.py +++ b/models/transcription/transcription_whisper.py @@ -15,9 +15,6 @@ _MODELS = { 
"large-v1": "Systran/faster-whisper-large-v1", "large-v2": "Systran/faster-whisper-large-v2", "large-v3": "Systran/faster-whisper-large-v3", - "distil-small": "Systran/faster-distil-whisper-small.en", - "distil-medium": "Systran/faster-distil-whisper-medium.en", - "distil-large-v2": "Systran/faster-distil-whisper-large-v2" } _FILENAMES = [ diff --git a/view.py b/view.py index 1efb3f22..94a4af8c 100644 --- a/view.py +++ b/view.py @@ -954,9 +954,6 @@ class View(): config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v1"]: i18n.t("config_window.whisper_weight_type.large_v1", capacity="2.87GB"), config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v2"]: i18n.t("config_window.whisper_weight_type.large_v2", capacity="2.87GB"), config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["large-v3"]: i18n.t("config_window.whisper_weight_type.large_v3", capacity="2.87GB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-small"]: i18n.t("config_window.whisper_weight_type.distil_small", capacity="319MB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-medium"]: i18n.t("config_window.whisper_weight_type.distil_medium", capacity="755MB"), - config.SELECTABLE_WHISPER_WEIGHT_TYPE_DICT["distil-large-v2"]: i18n.t("config_window.whisper_weight_type.distil_large_v2", capacity="1.41GB"), } # Open Webpage Functions