[Feature] Add VAD parameters and no-repeat N-gram size to Config and AudioTranscriber

2025-11-12 12:42:04 +09:00
parent 2062849887
commit cfc0e9fcc4
3 changed files with 41 additions and 4 deletions
--- a/src-python/config.py
+++ b/src-python/config.py
@@ -639,6 +639,9 @@ class Config:
    MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int)
    MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float))
    MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float))
+    MIC_NO_REPEAT_NGRAM_SIZE = ManagedProperty('MIC_NO_REPEAT_NGRAM_SIZE', type_=int)
+    MIC_VAD_FILTER = ManagedProperty('MIC_VAD_FILTER', type_=bool)
+    MIC_VAD_PARAMETERS = ManagedProperty('MIC_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
    HOTKEYS = ValidatedProperty('HOTKEYS',
        validator=lambda val, inst: (
            {k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k))
@@ -655,6 +658,9 @@ class Config:
    SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int)
    SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float))
    SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float))
+    SPEAKER_NO_REPEAT_NGRAM_SIZE = ManagedProperty('SPEAKER_NO_REPEAT_NGRAM_SIZE', type_=int)
+    SPEAKER_VAD_FILTER = ManagedProperty('SPEAKER_VAD_FILTER', type_=bool)
+    SPEAKER_VAD_PARAMETERS = ManagedProperty('SPEAKER_VAD_PARAMETERS', type_=dict, mutable_tracking=True)

    # --- Auth and API settings ---
    AUTH_KEYS = ValidatedProperty('AUTH_KEYS',
@@ -862,6 +868,16 @@ class Config:
        self._PLUGINS_STATUS = []
        self._MIC_AVG_LOGPROB = -0.8
        self._MIC_NO_SPEECH_PROB = 0.6
+        self._MIC_NO_REPEAT_NGRAM_SIZE = 0
+        self._MIC_VAD_FILTER = False
+        self._MIC_VAD_PARAMETERS = {
+            "threshold": 0.5,
+            "neg_threshold": None,
+            "min_speech_duration_ms": 0,
+            "max_speech_duration_s": float("inf"),
+            "min_silence_duration_ms": 2000,
+            "speech_pad_ms": 400,
+        }
        self._AUTO_SPEAKER_SELECT = True
        try:
            if device_manager is not None:
@@ -879,6 +895,16 @@ class Config:
        self._SPEAKER_MAX_PHRASES = 10
        self._SPEAKER_AVG_LOGPROB = -0.8
        self._SPEAKER_NO_SPEECH_PROB = 0.6
+        self._SPEAKER_NO_REPEAT_NGRAM_SIZE = 0
+        self._SPEAKER_VAD_FILTER = False
+        self._SPEAKER_VAD_PARAMETERS = {
+            "threshold": 0.5,
+            "neg_threshold": None,
+            "min_speech_duration_ms": 0,
+            "max_speech_duration_s": float("inf"),
+            "min_silence_duration_ms": 2000,
+            "speech_pad_ms": 400,
+        }
        self._OSC_IP_ADDRESS = "127.0.0.1"
        self._OSC_PORT = 9000
        self._AUTH_KEYS = {
--- a/src-python/model.py
+++ b/src-python/model.py
@@ -664,7 +664,10 @@ class Model:
                            languages,
                            countries,
                            config.MIC_AVG_LOGPROB,
-                            config.MIC_NO_SPEECH_PROB
+                            config.MIC_NO_SPEECH_PROB,
+                            config.MIC_NO_REPEAT_NGRAM_SIZE,
+                            config.MIC_VAD_FILTER,
+                            config.MIC_VAD_PARAMETERS,
                        )
                        if res:
                            result = self.mic_transcriber.getTranscript()
@@ -856,7 +859,10 @@ class Model:
                            languages,
                            countries,
                            config.SPEAKER_AVG_LOGPROB,
-                            config.SPEAKER_NO_SPEECH_PROB
+                            config.SPEAKER_NO_SPEECH_PROB,
+                            config.SPEAKER_NO_REPEAT_NGRAM_SIZE,
+                            config.SPEAKER_VAD_FILTER,
+                            config.SPEAKER_VAD_PARAMETERS,
                        )
                        if res:
                            result = self.speaker_transcriber.getTranscript()
--- a/src-python/models/transcription/transcription_transcriber.py
+++ b/src-python/models/transcription/transcription_transcriber.py
@@ -8,7 +8,7 @@ import time
 from io import BytesIO
 from threading import Event
 import wave
-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Union
 from speech_recognition import Recognizer, AudioData, AudioFile
 from speech_recognition.exceptions import UnknownValueError
 from datetime import timedelta
@@ -84,6 +84,9 @@ class AudioTranscriber:
        countries: List[str],
        avg_logprob: float = -0.8,
        no_speech_prob: float = 0.6,
+        no_repeat_ngram_size: int = 0,
+        vad_filter: bool = False,
+        vad_parameters: Optional[Union[dict, Any]] = None,
    ) -> bool:
        if audio_queue.empty():
            time.sleep(0.01)
@@ -130,7 +133,9 @@ class AudioTranscriber:
                            word_timestamps=False,
                            without_timestamps=True,
                            task="transcribe",
-                            vad_filter=False,
+                            no_repeat_ngram_size=no_repeat_ngram_size,
+                            vad_filter=vad_filter,
+                            vad_parameters=vad_parameters,
                        )
                        for s in segments:
                            if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob: