[Feature] Add VAD parameters and no-repeat N-gram size to Config and AudioTranscriber
This commit is contained in:
@@ -639,6 +639,9 @@ class Config:
|
||||
MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int)
|
||||
MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float))
|
||||
MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float))
|
||||
MIC_NO_REPEAT_NGRAM_SIZE = ManagedProperty('MIC_NO_REPEAT_NGRAM_SIZE', type_=int)
|
||||
MIC_VAD_FILTER = ManagedProperty('MIC_VAD_FILTER', type_=bool)
|
||||
MIC_VAD_PARAMETERS = ManagedProperty('MIC_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
|
||||
HOTKEYS = ValidatedProperty('HOTKEYS',
|
||||
validator=lambda val, inst: (
|
||||
{k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k))
|
||||
@@ -655,6 +658,9 @@ class Config:
|
||||
SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int)
|
||||
SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float))
|
||||
SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float))
|
||||
SPEAKER_NO_REPEAT_NGRAM_SIZE = ManagedProperty('SPEAKER_NO_REPEAT_NGRAM_SIZE', type_=int)
|
||||
SPEAKER_VAD_FILTER = ManagedProperty('SPEAKER_VAD_FILTER', type_=bool)
|
||||
SPEAKER_VAD_PARAMETERS = ManagedProperty('SPEAKER_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
|
||||
|
||||
# --- Auth and API settings ---
|
||||
AUTH_KEYS = ValidatedProperty('AUTH_KEYS',
|
||||
@@ -862,6 +868,16 @@ class Config:
|
||||
self._PLUGINS_STATUS = []
|
||||
self._MIC_AVG_LOGPROB = -0.8
|
||||
self._MIC_NO_SPEECH_PROB = 0.6
|
||||
self._MIC_NO_REPEAT_NGRAM_SIZE = 0
|
||||
self._MIC_VAD_FILTER = False
|
||||
self._MIC_VAD_PARAMETERS = {
|
||||
"threshold": 0.5,
|
||||
"neg_threshold": None,
|
||||
"min_speech_duration_ms": 0,
|
||||
"max_speech_duration_s": float("inf"),
|
||||
"min_silence_duration_ms": 2000,
|
||||
"speech_pad_ms": 400,
|
||||
}
|
||||
self._AUTO_SPEAKER_SELECT = True
|
||||
try:
|
||||
if device_manager is not None:
|
||||
@@ -879,6 +895,16 @@ class Config:
|
||||
self._SPEAKER_MAX_PHRASES = 10
|
||||
self._SPEAKER_AVG_LOGPROB = -0.8
|
||||
self._SPEAKER_NO_SPEECH_PROB = 0.6
|
||||
self._SPEAKER_NO_REPEAT_NGRAM_SIZE = 0
|
||||
self._SPEAKER_VAD_FILTER = False
|
||||
self._SPEAKER_VAD_PARAMETERS = {
|
||||
"threshold": 0.5,
|
||||
"neg_threshold": None,
|
||||
"min_speech_duration_ms": 0,
|
||||
"max_speech_duration_s": float("inf"),
|
||||
"min_silence_duration_ms": 2000,
|
||||
"speech_pad_ms": 400,
|
||||
}
|
||||
self._OSC_IP_ADDRESS = "127.0.0.1"
|
||||
self._OSC_PORT = 9000
|
||||
self._AUTH_KEYS = {
|
||||
|
||||
@@ -664,7 +664,10 @@ class Model:
|
||||
languages,
|
||||
countries,
|
||||
config.MIC_AVG_LOGPROB,
|
||||
config.MIC_NO_SPEECH_PROB
|
||||
config.MIC_NO_SPEECH_PROB,
|
||||
config.MIC_NO_REPEAT_NGRAM_SIZE,
|
||||
config.MIC_VAD_FILTER,
|
||||
config.MIC_VAD_PARAMETERS,
|
||||
)
|
||||
if res:
|
||||
result = self.mic_transcriber.getTranscript()
|
||||
@@ -856,7 +859,10 @@ class Model:
|
||||
languages,
|
||||
countries,
|
||||
config.SPEAKER_AVG_LOGPROB,
|
||||
config.SPEAKER_NO_SPEECH_PROB
|
||||
config.SPEAKER_NO_SPEECH_PROB,
|
||||
config.SPEAKER_NO_REPEAT_NGRAM_SIZE,
|
||||
config.SPEAKER_VAD_FILTER,
|
||||
config.SPEAKER_VAD_PARAMETERS,
|
||||
)
|
||||
if res:
|
||||
result = self.speaker_transcriber.getTranscript()
|
||||
|
||||
@@ -8,7 +8,7 @@ import time
|
||||
from io import BytesIO
|
||||
from threading import Event
|
||||
import wave
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
from speech_recognition import Recognizer, AudioData, AudioFile
|
||||
from speech_recognition.exceptions import UnknownValueError
|
||||
from datetime import timedelta
|
||||
@@ -84,6 +84,9 @@ class AudioTranscriber:
|
||||
countries: List[str],
|
||||
avg_logprob: float = -0.8,
|
||||
no_speech_prob: float = 0.6,
|
||||
no_repeat_ngram_size: int = 0,
|
||||
vad_filter: bool = False,
|
||||
vad_parameters: Optional[Union[dict, Any]] = None,
|
||||
) -> bool:
|
||||
if audio_queue.empty():
|
||||
time.sleep(0.01)
|
||||
@@ -130,7 +133,9 @@ class AudioTranscriber:
|
||||
word_timestamps=False,
|
||||
without_timestamps=True,
|
||||
task="transcribe",
|
||||
vad_filter=False,
|
||||
no_repeat_ngram_size=no_repeat_ngram_size,
|
||||
vad_filter=vad_filter,
|
||||
vad_parameters=vad_parameters,
|
||||
)
|
||||
for s in segments:
|
||||
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
|
||||
|
||||
Reference in New Issue
Block a user