[Feature] Add VAD parameters and no-repeat N-gram size to Config and AudioTranscriber

This commit is contained in:
misyaguziya
2025-11-12 12:42:04 +09:00
parent 2062849887
commit cfc0e9fcc4
3 changed files with 41 additions and 4 deletions

View File

@@ -639,6 +639,9 @@ class Config:
MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int) MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int)
MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float)) MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float))
MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float)) MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float))
MIC_NO_REPEAT_NGRAM_SIZE = ManagedProperty('MIC_NO_REPEAT_NGRAM_SIZE', type_=int)
MIC_VAD_FILTER = ManagedProperty('MIC_VAD_FILTER', type_=bool)
MIC_VAD_PARAMETERS = ManagedProperty('MIC_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
HOTKEYS = ValidatedProperty('HOTKEYS', HOTKEYS = ValidatedProperty('HOTKEYS',
validator=lambda val, inst: ( validator=lambda val, inst: (
{k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k)) {k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k))
@@ -655,6 +658,9 @@ class Config:
SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int) SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int)
SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float)) SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float))
SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float)) SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float))
SPEAKER_NO_REPEAT_NGRAM_SIZE = ManagedProperty('SPEAKER_NO_REPEAT_NGRAM_SIZE', type_=int)
SPEAKER_VAD_FILTER = ManagedProperty('SPEAKER_VAD_FILTER', type_=bool)
SPEAKER_VAD_PARAMETERS = ManagedProperty('SPEAKER_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
# --- Auth and API settings --- # --- Auth and API settings ---
AUTH_KEYS = ValidatedProperty('AUTH_KEYS', AUTH_KEYS = ValidatedProperty('AUTH_KEYS',
@@ -862,6 +868,16 @@ class Config:
self._PLUGINS_STATUS = [] self._PLUGINS_STATUS = []
self._MIC_AVG_LOGPROB = -0.8 self._MIC_AVG_LOGPROB = -0.8
self._MIC_NO_SPEECH_PROB = 0.6 self._MIC_NO_SPEECH_PROB = 0.6
self._MIC_NO_REPEAT_NGRAM_SIZE = 0
self._MIC_VAD_FILTER = False
self._MIC_VAD_PARAMETERS = {
"threshold": 0.5,
"neg_threshold": None,
"min_speech_duration_ms": 0,
"max_speech_duration_s": float("inf"),
"min_silence_duration_ms": 2000,
"speech_pad_ms": 400,
}
self._AUTO_SPEAKER_SELECT = True self._AUTO_SPEAKER_SELECT = True
try: try:
if device_manager is not None: if device_manager is not None:
@@ -879,6 +895,16 @@ class Config:
self._SPEAKER_MAX_PHRASES = 10 self._SPEAKER_MAX_PHRASES = 10
self._SPEAKER_AVG_LOGPROB = -0.8 self._SPEAKER_AVG_LOGPROB = -0.8
self._SPEAKER_NO_SPEECH_PROB = 0.6 self._SPEAKER_NO_SPEECH_PROB = 0.6
self._SPEAKER_NO_REPEAT_NGRAM_SIZE = 0
self._SPEAKER_VAD_FILTER = False
self._SPEAKER_VAD_PARAMETERS = {
"threshold": 0.5,
"neg_threshold": None,
"min_speech_duration_ms": 0,
"max_speech_duration_s": float("inf"),
"min_silence_duration_ms": 2000,
"speech_pad_ms": 400,
}
self._OSC_IP_ADDRESS = "127.0.0.1" self._OSC_IP_ADDRESS = "127.0.0.1"
self._OSC_PORT = 9000 self._OSC_PORT = 9000
self._AUTH_KEYS = { self._AUTH_KEYS = {

View File

@@ -664,7 +664,10 @@ class Model:
languages, languages,
countries, countries,
config.MIC_AVG_LOGPROB, config.MIC_AVG_LOGPROB,
config.MIC_NO_SPEECH_PROB config.MIC_NO_SPEECH_PROB,
config.MIC_NO_REPEAT_NGRAM_SIZE,
config.MIC_VAD_FILTER,
config.MIC_VAD_PARAMETERS,
) )
if res: if res:
result = self.mic_transcriber.getTranscript() result = self.mic_transcriber.getTranscript()
@@ -856,7 +859,10 @@ class Model:
languages, languages,
countries, countries,
config.SPEAKER_AVG_LOGPROB, config.SPEAKER_AVG_LOGPROB,
config.SPEAKER_NO_SPEECH_PROB config.SPEAKER_NO_SPEECH_PROB,
config.SPEAKER_NO_REPEAT_NGRAM_SIZE,
config.SPEAKER_VAD_FILTER,
config.SPEAKER_VAD_PARAMETERS,
) )
if res: if res:
result = self.speaker_transcriber.getTranscript() result = self.speaker_transcriber.getTranscript()

View File

@@ -8,7 +8,7 @@ import time
from io import BytesIO from io import BytesIO
from threading import Event from threading import Event
import wave import wave
from typing import Any, Callable, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Union
from speech_recognition import Recognizer, AudioData, AudioFile from speech_recognition import Recognizer, AudioData, AudioFile
from speech_recognition.exceptions import UnknownValueError from speech_recognition.exceptions import UnknownValueError
from datetime import timedelta from datetime import timedelta
@@ -84,6 +84,9 @@ class AudioTranscriber:
countries: List[str], countries: List[str],
avg_logprob: float = -0.8, avg_logprob: float = -0.8,
no_speech_prob: float = 0.6, no_speech_prob: float = 0.6,
no_repeat_ngram_size: int = 0,
vad_filter: bool = False,
vad_parameters: Optional[Union[dict, Any]] = None,
) -> bool: ) -> bool:
if audio_queue.empty(): if audio_queue.empty():
time.sleep(0.01) time.sleep(0.01)
@@ -130,7 +133,9 @@ class AudioTranscriber:
word_timestamps=False, word_timestamps=False,
without_timestamps=True, without_timestamps=True,
task="transcribe", task="transcribe",
vad_filter=False, no_repeat_ngram_size=no_repeat_ngram_size,
vad_filter=vad_filter,
vad_parameters=vad_parameters,
) )
for s in segments: for s in segments:
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob: if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob: