From 01f73bc1f8496596a872f03c9618b07cb4ff718e Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Tue, 18 Jun 2024 23:46:04 +0900 Subject: [PATCH] =?UTF-8?q?=F0=9F=91=8D=EF=B8=8F[Update]=20Model=20:=20whi?= =?UTF-8?q?sper=E3=81=AE=E8=A8=AD=E5=AE=9A=E9=A0=85=E7=9B=AE=E3=82=92?= =?UTF-8?q?=E8=BF=BD=E5=8A=A0(avg=5Flogprob,=20no=5Fspeech=5Fprob)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config.py | 48 +++++++++++++++++++ model.py | 16 ++++++- .../transcription_transcriber.py | 4 +- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/config.py b/config.py index 35ab252c..a520de05 100644 --- a/config.py +++ b/config.py @@ -546,6 +546,28 @@ class Config: self._INPUT_MIC_WORD_FILTER = sorted(set(value), key=value.index) saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property + @json_serializable('INPUT_MIC_AVG_LOGPROB') + def INPUT_MIC_AVG_LOGPROB(self): + return self._INPUT_MIC_AVG_LOGPROB + + @INPUT_MIC_AVG_LOGPROB.setter + def INPUT_MIC_AVG_LOGPROB(self, value): + if isinstance(value, float) or isinstance(value, int): + self._INPUT_MIC_AVG_LOGPROB = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + + @property + @json_serializable('INPUT_MIC_NO_SPEECH_PROB') + def INPUT_MIC_NO_SPEECH_PROB(self): + return self._INPUT_MIC_NO_SPEECH_PROB + + @INPUT_MIC_NO_SPEECH_PROB.setter + def INPUT_MIC_NO_SPEECH_PROB(self, value): + if isinstance(value, float) or isinstance(value, int): + self._INPUT_MIC_NO_SPEECH_PROB = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property @json_serializable('CHOICE_SPEAKER_DEVICE') def CHOICE_SPEAKER_DEVICE(self): @@ -612,6 +634,28 @@ class Config: self._INPUT_SPEAKER_MAX_PHRASES = value saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property + @json_serializable('INPUT_SPEAKER_AVG_LOGPROB') + def INPUT_SPEAKER_AVG_LOGPROB(self): + return self._INPUT_SPEAKER_AVG_LOGPROB + + @INPUT_SPEAKER_AVG_LOGPROB.setter + def INPUT_SPEAKER_AVG_LOGPROB(self, value): + if isinstance(value, float) or isinstance(value, int): + self._INPUT_SPEAKER_AVG_LOGPROB = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + + @property + @json_serializable('INPUT_SPEAKER_NO_SPEECH_PROB') + def INPUT_SPEAKER_NO_SPEECH_PROB(self): + return self._INPUT_SPEAKER_NO_SPEECH_PROB + + @INPUT_SPEAKER_NO_SPEECH_PROB.setter + def INPUT_SPEAKER_NO_SPEECH_PROB(self, value): + if isinstance(value, float) or isinstance(value, int): + self._INPUT_SPEAKER_NO_SPEECH_PROB = value + saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value) + @property @json_serializable('OSC_IP_ADDRESS') def OSC_IP_ADDRESS(self): @@ -1043,12 +1087,16 @@ class Config: self._INPUT_MIC_PHRASE_TIMEOUT = 3 self._INPUT_MIC_MAX_PHRASES = 10 self._INPUT_MIC_WORD_FILTER = [] + self._INPUT_MIC_AVG_LOGPROB=-0.8 + self._INPUT_MIC_NO_SPEECH_PROB=0.6 self._CHOICE_SPEAKER_DEVICE = getDefaultOutputDevice()["device"]["name"] self._INPUT_SPEAKER_ENERGY_THRESHOLD = 300 self._INPUT_SPEAKER_DYNAMIC_ENERGY_THRESHOLD = False self._INPUT_SPEAKER_RECORD_TIMEOUT = 3 self._INPUT_SPEAKER_PHRASE_TIMEOUT = 3 self._INPUT_SPEAKER_MAX_PHRASES = 10 + self._INPUT_SPEAKER_AVG_LOGPROB=-0.8 + self._INPUT_SPEAKER_NO_SPEECH_PROB=0.6 self._OSC_IP_ADDRESS = "127.0.0.1" self._OSC_PORT = 9000 self._AUTH_KEYS = { diff --git a/model.py b/model.py index 2017a787..dddb056f 100644 --- a/model.py +++ b/model.py @@ -427,7 +427,13 @@ class Model: ) def sendMicTranscript(): try: - res = self.mic_transcriber.transcribeAudioQueue(self.mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY) + res = self.mic_transcriber.transcribeAudioQueue( + self.mic_audio_queue, + config.SOURCE_LANGUAGE, + config.SOURCE_COUNTRY, + config.INPUT_MIC_AVG_LOGPROB, + config.INPUT_MIC_NO_SPEECH_PROB + ) if res: message = self.mic_transcriber.getTranscript() fnc(message) @@ -581,7 +587,13 @@ class Model: ) def sendSpeakerTranscript(): try: - res = self.speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY) + res = self.speaker_transcriber.transcribeAudioQueue( + speaker_audio_queue, + config.TARGET_LANGUAGE, + config.TARGET_COUNTRY, + config.INPUT_SPEAKER_AVG_LOGPROB, + config.INPUT_SPEAKER_NO_SPEECH_PROB + ) if res: message = self.speaker_transcriber.getTranscript() fnc(message) diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py index 56d9d979..82f1e981 100644 --- a/models/transcription/transcription_transcriber.py +++ b/models/transcription/transcription_transcriber.py @@ -38,7 +38,7 @@ class AudioTranscriber: self.whisper_model = getWhisperModel(root, whisper_weight_type) self.transcription_engine = "Whisper" - def transcribeAudioQueue(self, audio_queue, language, country): + def transcribeAudioQueue(self, audio_queue, language, country, avg_logprob=-0.8, no_speech_prob=0.6): if audio_queue.empty(): time.sleep(0.01) return False @@ -68,7 +68,7 @@ class AudioTranscriber: vad_filter=False, ) for s in segments: - if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6: + if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob: continue text += s.text