👍️[Update] Model : whisperの設定項目を追加(avg_logprob, no_speech_prob)

This commit is contained in:
misyaguziya
2024-06-18 23:46:04 +09:00
parent 5178e5aeba
commit 01f73bc1f8
3 changed files with 64 additions and 4 deletions

View File

@@ -546,6 +546,28 @@ class Config:
self._INPUT_MIC_WORD_FILTER = sorted(set(value), key=value.index)
saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('INPUT_MIC_AVG_LOGPROB')
def INPUT_MIC_AVG_LOGPROB(self):
    """Return the average log-probability cutoff used for mic transcription."""
    return self._INPUT_MIC_AVG_LOGPROB

@INPUT_MIC_AVG_LOGPROB.setter
def INPUT_MIC_AVG_LOGPROB(self, value):
    """Store and persist the mic avg_logprob cutoff.

    Only real numbers (int or float) are accepted; any other value is
    ignored and the previously stored setting is kept.
    """
    # bool is a subclass of int, so it must be excluded explicitly; otherwise
    # True/False would be stored as a cutoff of 1/0.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        self._INPUT_MIC_AVG_LOGPROB = value
        # co_name is this setter's own name, which doubles as the config key.
        saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('INPUT_MIC_NO_SPEECH_PROB')
def INPUT_MIC_NO_SPEECH_PROB(self):
    """Return the no-speech probability cutoff used for mic transcription."""
    return self._INPUT_MIC_NO_SPEECH_PROB

@INPUT_MIC_NO_SPEECH_PROB.setter
def INPUT_MIC_NO_SPEECH_PROB(self, value):
    """Store and persist the mic no_speech_prob cutoff.

    Only real numbers (int or float) are accepted; any other value is
    ignored and the previously stored setting is kept.
    """
    # bool is a subclass of int, so it must be excluded explicitly; otherwise
    # True/False would be stored as a cutoff of 1/0.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        self._INPUT_MIC_NO_SPEECH_PROB = value
        # co_name is this setter's own name, which doubles as the config key.
        saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('CHOICE_SPEAKER_DEVICE')
def CHOICE_SPEAKER_DEVICE(self):
@@ -612,6 +634,28 @@ class Config:
self._INPUT_SPEAKER_MAX_PHRASES = value
saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('INPUT_SPEAKER_AVG_LOGPROB')
def INPUT_SPEAKER_AVG_LOGPROB(self):
    """Return the average log-probability cutoff used for speaker transcription."""
    return self._INPUT_SPEAKER_AVG_LOGPROB

@INPUT_SPEAKER_AVG_LOGPROB.setter
def INPUT_SPEAKER_AVG_LOGPROB(self, value):
    """Store and persist the speaker avg_logprob cutoff.

    Only real numbers (int or float) are accepted; any other value is
    ignored and the previously stored setting is kept.
    """
    # bool is a subclass of int, so it must be excluded explicitly; otherwise
    # True/False would be stored as a cutoff of 1/0.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        self._INPUT_SPEAKER_AVG_LOGPROB = value
        # co_name is this setter's own name, which doubles as the config key.
        saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('INPUT_SPEAKER_NO_SPEECH_PROB')
def INPUT_SPEAKER_NO_SPEECH_PROB(self):
    """Return the no-speech probability cutoff used for speaker transcription."""
    return self._INPUT_SPEAKER_NO_SPEECH_PROB

@INPUT_SPEAKER_NO_SPEECH_PROB.setter
def INPUT_SPEAKER_NO_SPEECH_PROB(self, value):
    """Store and persist the speaker no_speech_prob cutoff.

    Only real numbers (int or float) are accepted; any other value is
    ignored and the previously stored setting is kept.
    """
    # bool is a subclass of int, so it must be excluded explicitly; otherwise
    # True/False would be stored as a cutoff of 1/0.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        self._INPUT_SPEAKER_NO_SPEECH_PROB = value
        # co_name is this setter's own name, which doubles as the config key.
        saveJson(self.PATH_CONFIG, inspect.currentframe().f_code.co_name, value)
@property
@json_serializable('OSC_IP_ADDRESS')
def OSC_IP_ADDRESS(self):
@@ -1043,12 +1087,16 @@ class Config:
self._INPUT_MIC_PHRASE_TIMEOUT = 3
self._INPUT_MIC_MAX_PHRASES = 10
self._INPUT_MIC_WORD_FILTER = []
self._INPUT_MIC_AVG_LOGPROB=-0.8
self._INPUT_MIC_NO_SPEECH_PROB=0.6
self._CHOICE_SPEAKER_DEVICE = getDefaultOutputDevice()["device"]["name"]
self._INPUT_SPEAKER_ENERGY_THRESHOLD = 300
self._INPUT_SPEAKER_DYNAMIC_ENERGY_THRESHOLD = False
self._INPUT_SPEAKER_RECORD_TIMEOUT = 3
self._INPUT_SPEAKER_PHRASE_TIMEOUT = 3
self._INPUT_SPEAKER_MAX_PHRASES = 10
self._INPUT_SPEAKER_AVG_LOGPROB=-0.8
self._INPUT_SPEAKER_NO_SPEECH_PROB=0.6
self._OSC_IP_ADDRESS = "127.0.0.1"
self._OSC_PORT = 9000
self._AUTH_KEYS = {

View File

@@ -427,7 +427,13 @@ class Model:
)
def sendMicTranscript():
try:
res = self.mic_transcriber.transcribeAudioQueue(self.mic_audio_queue, config.SOURCE_LANGUAGE, config.SOURCE_COUNTRY)
res = self.mic_transcriber.transcribeAudioQueue(
self.mic_audio_queue,
config.SOURCE_LANGUAGE,
config.SOURCE_COUNTRY,
config.INPUT_MIC_AVG_LOGPROB,
config.INPUT_MIC_NO_SPEECH_PROB
)
if res:
message = self.mic_transcriber.getTranscript()
fnc(message)
@@ -581,7 +587,13 @@ class Model:
)
def sendSpeakerTranscript():
try:
res = self.speaker_transcriber.transcribeAudioQueue(speaker_audio_queue, config.TARGET_LANGUAGE, config.TARGET_COUNTRY)
res = self.speaker_transcriber.transcribeAudioQueue(
speaker_audio_queue,
config.TARGET_LANGUAGE,
config.TARGET_COUNTRY,
config.INPUT_SPEAKER_AVG_LOGPROB,
config.INPUT_SPEAKER_NO_SPEECH_PROB
)
if res:
message = self.speaker_transcriber.getTranscript()
fnc(message)

View File

@@ -38,7 +38,7 @@ class AudioTranscriber:
self.whisper_model = getWhisperModel(root, whisper_weight_type)
self.transcription_engine = "Whisper"
def transcribeAudioQueue(self, audio_queue, language, country):
def transcribeAudioQueue(self, audio_queue, language, country, avg_logprob=-0.8, no_speech_prob=0.6):
if audio_queue.empty():
time.sleep(0.01)
return False
@@ -68,7 +68,7 @@ class AudioTranscriber:
vad_filter=False,
)
for s in segments:
if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
continue
text += s.text