diff --git a/src-python/config.py b/src-python/config.py index 24dd42e1..f2c1e28a 100644 --- a/src-python/config.py +++ b/src-python/config.py @@ -639,6 +639,9 @@ class Config: MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int) MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float)) MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float)) + MIC_NO_REPEAT_NGRAM_SIZE = ManagedProperty('MIC_NO_REPEAT_NGRAM_SIZE', type_=int) + MIC_VAD_FILTER = ManagedProperty('MIC_VAD_FILTER', type_=bool) + MIC_VAD_PARAMETERS = ManagedProperty('MIC_VAD_PARAMETERS', type_=dict, mutable_tracking=True) HOTKEYS = ValidatedProperty('HOTKEYS', validator=lambda val, inst: ( {k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k)) @@ -655,6 +658,9 @@ class Config: SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int) SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float)) SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float)) + SPEAKER_NO_REPEAT_NGRAM_SIZE = ManagedProperty('SPEAKER_NO_REPEAT_NGRAM_SIZE', type_=int) + SPEAKER_VAD_FILTER = ManagedProperty('SPEAKER_VAD_FILTER', type_=bool) + SPEAKER_VAD_PARAMETERS = ManagedProperty('SPEAKER_VAD_PARAMETERS', type_=dict, mutable_tracking=True) # --- Auth and API settings --- AUTH_KEYS = ValidatedProperty('AUTH_KEYS', @@ -862,6 +868,16 @@ class Config: self._PLUGINS_STATUS = [] self._MIC_AVG_LOGPROB = -0.8 self._MIC_NO_SPEECH_PROB = 0.6 + self._MIC_NO_REPEAT_NGRAM_SIZE = 0 + self._MIC_VAD_FILTER = False + self._MIC_VAD_PARAMETERS = { + "threshold": 0.5, + "neg_threshold": None, + "min_speech_duration_ms": 0, + "max_speech_duration_s": float("inf"), + "min_silence_duration_ms": 2000, + "speech_pad_ms": 400, + } self._AUTO_SPEAKER_SELECT = True try: if device_manager is not None: @@ -879,6 +895,16 @@ class Config: self._SPEAKER_MAX_PHRASES = 10 self._SPEAKER_AVG_LOGPROB = -0.8 self._SPEAKER_NO_SPEECH_PROB = 0.6 + self._SPEAKER_NO_REPEAT_NGRAM_SIZE = 0 + self._SPEAKER_VAD_FILTER = False + self._SPEAKER_VAD_PARAMETERS = { + "threshold": 0.5, + "neg_threshold": None, + "min_speech_duration_ms": 0, + "max_speech_duration_s": float("inf"), + "min_silence_duration_ms": 2000, + "speech_pad_ms": 400, + } self._OSC_IP_ADDRESS = "127.0.0.1" self._OSC_PORT = 9000 self._AUTH_KEYS = { diff --git a/src-python/model.py b/src-python/model.py index 975977fc..58f947dc 100644 --- a/src-python/model.py +++ b/src-python/model.py @@ -664,7 +664,10 @@ class Model: languages, countries, config.MIC_AVG_LOGPROB, - config.MIC_NO_SPEECH_PROB + config.MIC_NO_SPEECH_PROB, + config.MIC_NO_REPEAT_NGRAM_SIZE, + config.MIC_VAD_FILTER, + config.MIC_VAD_PARAMETERS, ) if res: result = self.mic_transcriber.getTranscript() @@ -856,7 +859,10 @@ class Model: languages, countries, config.SPEAKER_AVG_LOGPROB, - config.SPEAKER_NO_SPEECH_PROB + config.SPEAKER_NO_SPEECH_PROB, + config.SPEAKER_NO_REPEAT_NGRAM_SIZE, + config.SPEAKER_VAD_FILTER, + config.SPEAKER_VAD_PARAMETERS, ) if res: result = self.speaker_transcriber.getTranscript() diff --git a/src-python/models/transcription/transcription_transcriber.py b/src-python/models/transcription/transcription_transcriber.py index 15a9aa51..6ea3ad8f 100644 --- a/src-python/models/transcription/transcription_transcriber.py +++ b/src-python/models/transcription/transcription_transcriber.py @@ -8,7 +8,7 @@ import time from io import BytesIO from threading import Event import wave -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Union from speech_recognition import Recognizer, AudioData, AudioFile from speech_recognition.exceptions import UnknownValueError from datetime import timedelta @@ -84,6 +84,9 @@ class AudioTranscriber: countries: List[str], avg_logprob: float = -0.8, no_speech_prob: float = 0.6, + no_repeat_ngram_size: int = 0, + vad_filter: bool = False, + vad_parameters: Optional[Union[dict, Any]] = None, ) -> bool: if audio_queue.empty(): time.sleep(0.01) @@ -130,7 +133,9 @@ class AudioTranscriber: word_timestamps=False, without_timestamps=True, task="transcribe", - vad_filter=False, + no_repeat_ngram_size=no_repeat_ngram_size, + vad_filter=vad_filter, + vad_parameters=vad_parameters, ) for s in segments: if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob: