[Feature] Add VAD parameters and no-repeat N-gram size to Config and AudioTranscriber
This commit is contained in:
@@ -639,6 +639,9 @@ class Config:
|
|||||||
MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int)
|
MIC_MAX_PHRASES = ManagedProperty('MIC_MAX_PHRASES', type_=int)
|
||||||
MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float))
|
MIC_AVG_LOGPROB = ManagedProperty('MIC_AVG_LOGPROB', type_=(int, float))
|
||||||
MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float))
|
MIC_NO_SPEECH_PROB = ManagedProperty('MIC_NO_SPEECH_PROB', type_=(int, float))
|
||||||
|
MIC_NO_REPEAT_NGRAM_SIZE = ManagedProperty('MIC_NO_REPEAT_NGRAM_SIZE', type_=int)
|
||||||
|
MIC_VAD_FILTER = ManagedProperty('MIC_VAD_FILTER', type_=bool)
|
||||||
|
MIC_VAD_PARAMETERS = ManagedProperty('MIC_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
|
||||||
HOTKEYS = ValidatedProperty('HOTKEYS',
|
HOTKEYS = ValidatedProperty('HOTKEYS',
|
||||||
validator=lambda val, inst: (
|
validator=lambda val, inst: (
|
||||||
{k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k))
|
{k: (v if (isinstance(v, list) or v is None) else inst.HOTKEYS.get(k))
|
||||||
@@ -655,6 +658,9 @@ class Config:
|
|||||||
SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int)
|
SPEAKER_MAX_PHRASES = ManagedProperty('SPEAKER_MAX_PHRASES', type_=int)
|
||||||
SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float))
|
SPEAKER_AVG_LOGPROB = ManagedProperty('SPEAKER_AVG_LOGPROB', type_=(int, float))
|
||||||
SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float))
|
SPEAKER_NO_SPEECH_PROB = ManagedProperty('SPEAKER_NO_SPEECH_PROB', type_=(int, float))
|
||||||
|
SPEAKER_NO_REPEAT_NGRAM_SIZE = ManagedProperty('SPEAKER_NO_REPEAT_NGRAM_SIZE', type_=int)
|
||||||
|
SPEAKER_VAD_FILTER = ManagedProperty('SPEAKER_VAD_FILTER', type_=bool)
|
||||||
|
SPEAKER_VAD_PARAMETERS = ManagedProperty('SPEAKER_VAD_PARAMETERS', type_=dict, mutable_tracking=True)
|
||||||
|
|
||||||
# --- Auth and API settings ---
|
# --- Auth and API settings ---
|
||||||
AUTH_KEYS = ValidatedProperty('AUTH_KEYS',
|
AUTH_KEYS = ValidatedProperty('AUTH_KEYS',
|
||||||
@@ -862,6 +868,16 @@ class Config:
|
|||||||
self._PLUGINS_STATUS = []
|
self._PLUGINS_STATUS = []
|
||||||
self._MIC_AVG_LOGPROB = -0.8
|
self._MIC_AVG_LOGPROB = -0.8
|
||||||
self._MIC_NO_SPEECH_PROB = 0.6
|
self._MIC_NO_SPEECH_PROB = 0.6
|
||||||
|
self._MIC_NO_REPEAT_NGRAM_SIZE = 0
|
||||||
|
self._MIC_VAD_FILTER = False
|
||||||
|
self._MIC_VAD_PARAMETERS = {
|
||||||
|
"threshold": 0.5,
|
||||||
|
"neg_threshold": None,
|
||||||
|
"min_speech_duration_ms": 0,
|
||||||
|
"max_speech_duration_s": float("inf"),
|
||||||
|
"min_silence_duration_ms": 2000,
|
||||||
|
"speech_pad_ms": 400,
|
||||||
|
}
|
||||||
self._AUTO_SPEAKER_SELECT = True
|
self._AUTO_SPEAKER_SELECT = True
|
||||||
try:
|
try:
|
||||||
if device_manager is not None:
|
if device_manager is not None:
|
||||||
@@ -879,6 +895,16 @@ class Config:
|
|||||||
self._SPEAKER_MAX_PHRASES = 10
|
self._SPEAKER_MAX_PHRASES = 10
|
||||||
self._SPEAKER_AVG_LOGPROB = -0.8
|
self._SPEAKER_AVG_LOGPROB = -0.8
|
||||||
self._SPEAKER_NO_SPEECH_PROB = 0.6
|
self._SPEAKER_NO_SPEECH_PROB = 0.6
|
||||||
|
self._SPEAKER_NO_REPEAT_NGRAM_SIZE = 0
|
||||||
|
self._SPEAKER_VAD_FILTER = False
|
||||||
|
self._SPEAKER_VAD_PARAMETERS = {
|
||||||
|
"threshold": 0.5,
|
||||||
|
"neg_threshold": None,
|
||||||
|
"min_speech_duration_ms": 0,
|
||||||
|
"max_speech_duration_s": float("inf"),
|
||||||
|
"min_silence_duration_ms": 2000,
|
||||||
|
"speech_pad_ms": 400,
|
||||||
|
}
|
||||||
self._OSC_IP_ADDRESS = "127.0.0.1"
|
self._OSC_IP_ADDRESS = "127.0.0.1"
|
||||||
self._OSC_PORT = 9000
|
self._OSC_PORT = 9000
|
||||||
self._AUTH_KEYS = {
|
self._AUTH_KEYS = {
|
||||||
|
|||||||
@@ -664,7 +664,10 @@ class Model:
|
|||||||
languages,
|
languages,
|
||||||
countries,
|
countries,
|
||||||
config.MIC_AVG_LOGPROB,
|
config.MIC_AVG_LOGPROB,
|
||||||
config.MIC_NO_SPEECH_PROB
|
config.MIC_NO_SPEECH_PROB,
|
||||||
|
config.MIC_NO_REPEAT_NGRAM_SIZE,
|
||||||
|
config.MIC_VAD_FILTER,
|
||||||
|
config.MIC_VAD_PARAMETERS,
|
||||||
)
|
)
|
||||||
if res:
|
if res:
|
||||||
result = self.mic_transcriber.getTranscript()
|
result = self.mic_transcriber.getTranscript()
|
||||||
@@ -856,7 +859,10 @@ class Model:
|
|||||||
languages,
|
languages,
|
||||||
countries,
|
countries,
|
||||||
config.SPEAKER_AVG_LOGPROB,
|
config.SPEAKER_AVG_LOGPROB,
|
||||||
config.SPEAKER_NO_SPEECH_PROB
|
config.SPEAKER_NO_SPEECH_PROB,
|
||||||
|
config.SPEAKER_NO_REPEAT_NGRAM_SIZE,
|
||||||
|
config.SPEAKER_VAD_FILTER,
|
||||||
|
config.SPEAKER_VAD_PARAMETERS,
|
||||||
)
|
)
|
||||||
if res:
|
if res:
|
||||||
result = self.speaker_transcriber.getTranscript()
|
result = self.speaker_transcriber.getTranscript()
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import time
|
|||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from threading import Event
|
from threading import Event
|
||||||
import wave
|
import wave
|
||||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Union
|
||||||
from speech_recognition import Recognizer, AudioData, AudioFile
|
from speech_recognition import Recognizer, AudioData, AudioFile
|
||||||
from speech_recognition.exceptions import UnknownValueError
|
from speech_recognition.exceptions import UnknownValueError
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
@@ -84,6 +84,9 @@ class AudioTranscriber:
|
|||||||
countries: List[str],
|
countries: List[str],
|
||||||
avg_logprob: float = -0.8,
|
avg_logprob: float = -0.8,
|
||||||
no_speech_prob: float = 0.6,
|
no_speech_prob: float = 0.6,
|
||||||
|
no_repeat_ngram_size: int = 0,
|
||||||
|
vad_filter: bool = False,
|
||||||
|
vad_parameters: Optional[Union[dict, Any]] = None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if audio_queue.empty():
|
if audio_queue.empty():
|
||||||
time.sleep(0.01)
|
time.sleep(0.01)
|
||||||
@@ -130,7 +133,9 @@ class AudioTranscriber:
|
|||||||
word_timestamps=False,
|
word_timestamps=False,
|
||||||
without_timestamps=True,
|
without_timestamps=True,
|
||||||
task="transcribe",
|
task="transcribe",
|
||||||
vad_filter=False,
|
no_repeat_ngram_size=no_repeat_ngram_size,
|
||||||
|
vad_filter=vad_filter,
|
||||||
|
vad_parameters=vad_parameters,
|
||||||
)
|
)
|
||||||
for s in segments:
|
for s in segments:
|
||||||
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
|
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
|
||||||
|
|||||||
Reference in New Issue
Block a user