👍️[Update] Model : whisperの設定項目を追加(avg_logprob, no_speech_prob)

2024-06-18 23:46:04 +09:00
parent 5178e5aeba
commit 01f73bc1f8
3 changed files with 64 additions and 4 deletions
--- a/config.py
+++ b/config.py
@@ -546,6 +546,28 @@ class Config:
            self._INPUT_MIC_WORD_FILTER = sorted(set(value), key=value.index)
            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)

+    @property
+    @json_serializable('INPUT_MIC_AVG_LOGPROB')
+    def INPUT_MIC_AVG_LOGPROB(self):
+        return self._INPUT_MIC_AVG_LOGPROB  # passed as avg_logprob to the mic transcribeAudioQueue call; segments scoring below it are skipped
+
+    @INPUT_MIC_AVG_LOGPROB.setter
+    def INPUT_MIC_AVG_LOGPROB(self, value):
+        if isinstance(value, (int, float)):  # non-numeric values are silently ignored; note bool also passes (subclass of int)
+            self._INPUT_MIC_AVG_LOGPROB = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)  # co_name is this setter's name; presumably the JSON key - confirm in saveJson
+
+    @property
+    @json_serializable('INPUT_MIC_NO_SPEECH_PROB')
+    def INPUT_MIC_NO_SPEECH_PROB(self):
+        return self._INPUT_MIC_NO_SPEECH_PROB  # passed as no_speech_prob to the mic transcribeAudioQueue call; segments scoring above it are skipped
+
+    @INPUT_MIC_NO_SPEECH_PROB.setter
+    def INPUT_MIC_NO_SPEECH_PROB(self, value):
+        if isinstance(value, (int, float)):  # non-numeric values are silently ignored; note bool also passes (subclass of int)
+            self._INPUT_MIC_NO_SPEECH_PROB = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)  # co_name is this setter's name; presumably the JSON key - confirm in saveJson
+
    @property
    @json_serializable('CHOICE_SPEAKER_DEVICE')
    def CHOICE_SPEAKER_DEVICE(self):
@@ -612,6 +634,28 @@ class Config:
            self._INPUT_SPEAKER_MAX_PHRASES = value
            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)

+    @property
+    @json_serializable('INPUT_SPEAKER_AVG_LOGPROB')
+    def INPUT_SPEAKER_AVG_LOGPROB(self):
+        return self._INPUT_SPEAKER_AVG_LOGPROB  # passed as avg_logprob to the speaker transcribeAudioQueue call; segments scoring below it are skipped
+
+    @INPUT_SPEAKER_AVG_LOGPROB.setter
+    def INPUT_SPEAKER_AVG_LOGPROB(self, value):
+        if isinstance(value, (int, float)):  # non-numeric values are silently ignored; note bool also passes (subclass of int)
+            self._INPUT_SPEAKER_AVG_LOGPROB = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)  # co_name is this setter's name; presumably the JSON key - confirm in saveJson
+
+    @property
+    @json_serializable('INPUT_SPEAKER_NO_SPEECH_PROB')
+    def INPUT_SPEAKER_NO_SPEECH_PROB(self):
+        return self._INPUT_SPEAKER_NO_SPEECH_PROB  # passed as no_speech_prob to the speaker transcribeAudioQueue call; segments scoring above it are skipped
+
+    @INPUT_SPEAKER_NO_SPEECH_PROB.setter
+    def INPUT_SPEAKER_NO_SPEECH_PROB(self, value):
+        if isinstance(value, (int, float)):  # non-numeric values are silently ignored; note bool also passes (subclass of int)
+            self._INPUT_SPEAKER_NO_SPEECH_PROB = value
+            saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)  # co_name is this setter's name; presumably the JSON key - confirm in saveJson
+
    @property
    @json_serializable('OSC_IP_ADDRESS')
    def OSC_IP_ADDRESS(self):
@@ -1043,12 +1087,16 @@ class Config:
        self._INPUT_MIC_PHRASE_TIMEOUT = 3
        self._INPUT_MIC_MAX_PHRASES = 10
        self._INPUT_MIC_WORD_FILTER = []
+        self._INPUT_MIC_AVG_LOGPROB=-0.8
+        self._INPUT_MIC_NO_SPEECH_PROB=0.6
        self._CHOICE_SPEAKER_DEVICE = getDefaultOutputDevice()["device"]["name"]
        self._INPUT_SPEAKER_ENERGY_THRESHOLD = 300
        self._INPUT_SPEAKER_DYNAMIC_ENERGY_THRESHOLD = False
        self._INPUT_SPEAKER_RECORD_TIMEOUT = 3
        self._INPUT_SPEAKER_PHRASE_TIMEOUT = 3
        self._INPUT_SPEAKER_MAX_PHRASES = 10
+        self._INPUT_SPEAKER_AVG_LOGPROB=-0.8
+        self._INPUT_SPEAKER_NO_SPEECH_PROB=0.6
        self._OSC_IP_ADDRESS = "127.0.0.1"
        self._OSC_PORT = 9000
        self._AUTH_KEYS = {
--- a/model.py
+++ b/model.py
@@ -427,7 +427,13 @@ class Model:
        )
        def sendMicTranscript():
            try:
-                res = self.mic_transcriber.transcribeAudioQueue(self.mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
+                res = self.mic_transcriber.transcribeAudioQueue(
+                    self.mic_audio_queue,
+                    config.SOURCE_LANGUAGE,
+                    config.SOURCE_COUNTRY,
+                    config.INPUT_MIC_AVG_LOGPROB,
+                    config.INPUT_MIC_NO_SPEECH_PROB
+                )
                if res:
                    message = self.mic_transcriber.getTranscript()
                    fnc(message)
@@ -581,7 +587,13 @@ class Model:
        )
        def sendSpeakerTranscript():
            try:
-                res = self.speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
+                res = self.speaker_transcriber.transcribeAudioQueue(
+                    speaker_audio_queue,
+                    config.TARGET_LANGUAGE,
+                    config.TARGET_COUNTRY,
+                    config.INPUT_SPEAKER_AVG_LOGPROB,
+                    config.INPUT_SPEAKER_NO_SPEECH_PROB
+                )
                if res:
                    message = self.speaker_transcriber.getTranscript()
                    fnc(message)
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -38,7 +38,7 @@ class AudioTranscriber:
            self.whisper_model = getWhisperModel(root, whisper_weight_type)
            self.transcription_engine = "Whisper"

-    def transcribeAudioQueue(self, audio_queue, language, country):
+    def transcribeAudioQueue(self, audio_queue, language, country, avg_logprob=-0.8, no_speech_prob=0.6):
        if audio_queue.empty():
            time.sleep(0.01)
            return False
@@ -68,7 +68,7 @@ class AudioTranscriber:
                        vad_filter=False,
                        )
                    for s in segments:
-                        if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
+                        if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
                            continue
                        text += s.text