update transcription mic/spk
This commit is contained in:
90
VRCT.py
90
VRCT.py
@@ -259,8 +259,8 @@ class App(customtkinter.CTk):
|
|||||||
|
|
||||||
## set transcription instance
|
## set transcription instance
|
||||||
self.vr = transcription.VoiceRecognizer()
|
self.vr = transcription.VoiceRecognizer()
|
||||||
self.CHOICE_MIC_DEVICE = self.CHOICE_MIC_DEVICE if self.CHOICE_MIC_DEVICE is not None else self.vr.search_default_device_index()[0]["name"]
|
self.CHOICE_MIC_DEVICE = self.CHOICE_MIC_DEVICE if self.CHOICE_MIC_DEVICE is not None else self.vr.search_default_device()[0]
|
||||||
self.CHOICE_SPEAKER_DEVICE = self.CHOICE_SPEAKER_DEVICE if self.CHOICE_SPEAKER_DEVICE is not None else self.vr.search_default_device_index()[1]["name"]
|
self.CHOICE_SPEAKER_DEVICE = self.CHOICE_SPEAKER_DEVICE if self.CHOICE_SPEAKER_DEVICE is not None else self.vr.search_default_device()[1]
|
||||||
|
|
||||||
## set checkbox enable translation
|
## set checkbox enable translation
|
||||||
if self.ENABLE_TRANSLATION:
|
if self.ENABLE_TRANSLATION:
|
||||||
@@ -324,33 +324,44 @@ class App(customtkinter.CTk):
|
|||||||
self.ENABLE_TRANSCRIPTION = self.checkbox_transcription.get()
|
self.ENABLE_TRANSCRIPTION = self.checkbox_transcription.get()
|
||||||
if self.ENABLE_TRANSCRIPTION is True:
|
if self.ENABLE_TRANSCRIPTION is True:
|
||||||
# start threading
|
# start threading
|
||||||
th = threading.Thread(target = self.voice_input)
|
self.vr.set_mic(self.CHOICE_MIC_DEVICE)
|
||||||
th.start()
|
self.vr.init_mic(threshold=self.MIC_THRESHOLD, is_dynamic=self.ENABLE_MIC_IS_DYNAMIC)
|
||||||
|
th_vr_listen_mic = threading.Thread(target = self.vr_listen_mic)
|
||||||
|
th_vr_recognize_mic = threading.Thread(target = self.vr_recognize_mic)
|
||||||
|
th_vr_listen_mic.start()
|
||||||
|
th_vr_recognize_mic.start()
|
||||||
|
|
||||||
|
self.vr.set_spk(self.CHOICE_SPEAKER_DEVICE)
|
||||||
|
self.vr.init_spk()
|
||||||
|
th_vr_listen_spk = threading.Thread(target = self.vr_listen_spk)
|
||||||
|
th_vr_recognize_spk = threading.Thread(target = self.vr_recognize_spk)
|
||||||
|
th_vr_listen_spk.start()
|
||||||
|
th_vr_recognize_spk.start()
|
||||||
|
|
||||||
utils.save_json(self.PATH_CONFIG, "ENABLE_TRANSCRIPTION", self.ENABLE_TRANSCRIPTION)
|
utils.save_json(self.PATH_CONFIG, "ENABLE_TRANSCRIPTION", self.ENABLE_TRANSCRIPTION)
|
||||||
|
|
||||||
def voice_input(self):
|
def vr_listen_mic(self):
|
||||||
self.vr.set_mic(self.CHOICE_MIC_DEVICE)
|
while self.checkbox_transcription.get() is True:
|
||||||
self.vr.init_mic(threshold=self.MIC_THRESHOLD, is_dynamic=self.ENABLE_MIC_IS_DYNAMIC)
|
self.vr.listen_mic()
|
||||||
|
|
||||||
# start voice_input
|
def vr_recognize_mic(self):
|
||||||
if self.checkbox_transcription.get() is True:
|
self.textbox_message_log.configure(state='normal')
|
||||||
self.textbox_message_log.configure(state='normal')
|
self.textbox_message_log.insert("end", f"[INFO] start transcription\n")
|
||||||
self.textbox_message_log.insert("end", f"[INFO] start transcription\n")
|
self.textbox_message_log.configure(state='disabled')
|
||||||
self.textbox_message_log.configure(state='disabled')
|
self.textbox_message_log.see("end")
|
||||||
self.textbox_message_log.see("end")
|
|
||||||
|
|
||||||
while self.checkbox_transcription.get() is True:
|
while self.checkbox_transcription.get() is True:
|
||||||
message = self.vr.listen_voice(language=self.INPUT_MIC_VOICE_LANGUAGE)
|
message = self.vr.recognize_mic(language=self.INPUT_MIC_VOICE_LANGUAGE)
|
||||||
if len(message) > 0:
|
if len(message) > 0:
|
||||||
# translate
|
# translate
|
||||||
if self.checkbox_translation.get() is False:
|
if self.checkbox_translation.get() is False:
|
||||||
chat_message = f"{message}"
|
voice_message = f"{message}"
|
||||||
elif (self.translator.translator_status[self.CHOICE_TRANSLATOR] is False) or (self.INPUT_SOURCE_LANG == "None") or (self.INPUT_TARGET_LANG == "None"):
|
elif self.translator.translator_status[self.CHOICE_TRANSLATOR] is False:
|
||||||
self.textbox_message_log.configure(state='normal')
|
self.textbox_message_log.configure(state='normal')
|
||||||
self.textbox_message_log.insert("end", f"[ERROR] Auth Keyもしくは言語の設定が間違っています\n")
|
self.textbox_message_log.insert("end", f"[ERROR] Auth Keyもしくは言語の設定が間違っています\n")
|
||||||
self.textbox_message_log.configure(state='disabled')
|
self.textbox_message_log.configure(state='disabled')
|
||||||
self.textbox_message_log.see("end")
|
self.textbox_message_log.see("end")
|
||||||
chat_message = f"{message}"
|
voice_message = f"{message}"
|
||||||
else:
|
else:
|
||||||
result = self.translator.translate(
|
result = self.translator.translate(
|
||||||
translator_name=self.CHOICE_TRANSLATOR,
|
translator_name=self.CHOICE_TRANSLATOR,
|
||||||
@@ -358,20 +369,51 @@ class App(customtkinter.CTk):
|
|||||||
target_language=self.INPUT_TARGET_LANG,
|
target_language=self.INPUT_TARGET_LANG,
|
||||||
message=message
|
message=message
|
||||||
)
|
)
|
||||||
chat_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result)
|
voice_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result)
|
||||||
|
|
||||||
# send OSC message
|
# send OSC message
|
||||||
osc_tools.send_message(chat_message, self.OSC_IP_ADDRESS, self.OSC_PORT)
|
osc_tools.send_message(voice_message, self.OSC_IP_ADDRESS, self.OSC_PORT)
|
||||||
|
|
||||||
# update textbox message log
|
# update textbox message log
|
||||||
self.textbox_message_log.configure(state='normal')
|
self.textbox_message_log.configure(state='normal')
|
||||||
self.textbox_message_log.insert("end", f"[VOICE] {chat_message}\n")
|
self.textbox_message_log.insert("end", f"[VOICE] {voice_message}\n")
|
||||||
|
self.textbox_message_log.configure(state='disabled')
|
||||||
|
self.textbox_message_log.see("end")
|
||||||
|
|
||||||
|
def vr_listen_spk(self):
|
||||||
|
while self.checkbox_transcription.get() is True:
|
||||||
|
self.vr.listen_spk()
|
||||||
|
|
||||||
|
def vr_recognize_spk(self):
|
||||||
|
while self.checkbox_transcription.get() is True:
|
||||||
|
message = self.vr.recognize_spk(language=self.INPUT_SPEAKER_VOICE_LANGUAGE)
|
||||||
|
if len(message) > 0:
|
||||||
|
# translate
|
||||||
|
if self.checkbox_translation.get() is False:
|
||||||
|
voice_message = f"{message}"
|
||||||
|
elif self.translator.translator_status[self.CHOICE_TRANSLATOR] is False:
|
||||||
|
self.textbox_message_log.configure(state='normal')
|
||||||
|
self.textbox_message_log.insert("end", f"[ERROR] Auth Keyもしくは言語の設定が間違っています\n")
|
||||||
|
self.textbox_message_log.configure(state='disabled')
|
||||||
|
self.textbox_message_log.see("end")
|
||||||
|
voice_message = f"{message}"
|
||||||
|
else:
|
||||||
|
result = self.translator.translate(
|
||||||
|
translator_name=self.CHOICE_TRANSLATOR,
|
||||||
|
source_language=self.INPUT_SOURCE_LANG,
|
||||||
|
target_language=self.INPUT_TARGET_LANG,
|
||||||
|
message=message
|
||||||
|
)
|
||||||
|
voice_message = self.MESSAGE_FORMAT.replace("[message]", message).replace("[translation]", result)
|
||||||
|
|
||||||
|
# send OSC message
|
||||||
|
osc_tools.send_message(voice_message, self.OSC_IP_ADDRESS, self.OSC_PORT)
|
||||||
|
|
||||||
|
# update textbox message log
|
||||||
|
self.textbox_message_log.configure(state='normal')
|
||||||
|
self.textbox_message_log.insert("end", f"[VOICE] {voice_message}\n")
|
||||||
self.textbox_message_log.configure(state='disabled')
|
self.textbox_message_log.configure(state='disabled')
|
||||||
self.textbox_message_log.see("end")
|
self.textbox_message_log.see("end")
|
||||||
self.textbox_message_log.configure(state='normal')
|
|
||||||
self.textbox_message_log.insert("end", f"[INFO] stop transcription\n")
|
|
||||||
self.textbox_message_log.configure(state='disabled')
|
|
||||||
self.textbox_message_log.see("end")
|
|
||||||
|
|
||||||
def checkbox_foreground_callback(self):
|
def checkbox_foreground_callback(self):
|
||||||
self.ENABLE_FOREGROUND = self.checkbox_foreground.get()
|
self.ENABLE_FOREGROUND = self.checkbox_foreground.get()
|
||||||
|
|||||||
121
transcription.py
121
transcription.py
@@ -1,3 +1,8 @@
|
|||||||
|
import io
|
||||||
|
import queue
|
||||||
|
import numpy as np
|
||||||
|
import soundcard as sc
|
||||||
|
import soundfile as sf
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
import speech_recognition as sr
|
import speech_recognition as sr
|
||||||
|
|
||||||
@@ -5,7 +10,6 @@ import speech_recognition as sr
|
|||||||
class VoiceRecognizer():
|
class VoiceRecognizer():
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.r = sr.Recognizer()
|
self.r = sr.Recognizer()
|
||||||
self.mic = None
|
|
||||||
self.languages = [
|
self.languages = [
|
||||||
"ja-JP","en-US","en-GB","af-ZA","ar-DZ","ar-BH","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA",
|
"ja-JP","en-US","en-GB","af-ZA","ar-DZ","ar-BH","ar-EG","ar-IL","ar-IQ","ar-JO","ar-KW","ar-LB","ar-MA",
|
||||||
"ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-AE","eu-ES","bg-BG","ca-ES","cmn-Hans-CN","cmn-Hans-HK",
|
"ar-OM","ar-PS","ar-QA","ar-SA","ar-TN","ar-AE","eu-ES","bg-BG","ca-ES","cmn-Hans-CN","cmn-Hans-HK",
|
||||||
@@ -16,6 +20,18 @@ class VoiceRecognizer():
|
|||||||
"es-NI","es-PA","es-PY","es-PE","es-PR","es-ES","es-UY","es-US","es-VE","sv-SE","th-TH","tr-TR","uk-UA",
|
"es-NI","es-PA","es-PY","es-PE","es-PR","es-ES","es-UY","es-US","es-VE","sv-SE","th-TH","tr-TR","uk-UA",
|
||||||
"vi-VN","zu-ZA"
|
"vi-VN","zu-ZA"
|
||||||
]
|
]
|
||||||
|
self.mic = None
|
||||||
|
self.enable_mic_recognize = False
|
||||||
|
self.queue_mic = queue.Queue()
|
||||||
|
|
||||||
|
self.spk_device_name = None
|
||||||
|
self.spk_sample_rate = 16000
|
||||||
|
self.spk_interval = 3
|
||||||
|
self.spk_buffer_size = 4096
|
||||||
|
self.spk_language = "en-US"
|
||||||
|
self.spk_audio = np.empty(self.spk_sample_rate * self.spk_interval + self.spk_buffer_size, dtype=np.float32)
|
||||||
|
self.n = 0
|
||||||
|
self.queue_spk = queue.Queue()
|
||||||
|
|
||||||
def search_input_device(self):
|
def search_input_device(self):
|
||||||
device_list = sd.query_devices()
|
device_list = sd.query_devices()
|
||||||
@@ -28,21 +44,20 @@ class VoiceRecognizer():
|
|||||||
return input_device_list
|
return input_device_list
|
||||||
|
|
||||||
def search_output_device(self):
|
def search_output_device(self):
|
||||||
device_list = sd.query_devices()
|
device_list = sc.all_speakers()
|
||||||
output_device_list = []
|
output_device_list = []
|
||||||
|
|
||||||
for device in device_list:
|
for device in device_list:
|
||||||
if device["max_output_channels"] > 0:
|
output_device_list.append(str(device.name))
|
||||||
output_device_list.append({"name": device["name"], "index": device["index"]})
|
|
||||||
|
|
||||||
return output_device_list
|
return output_device_list
|
||||||
|
|
||||||
def search_default_device_index(self):
|
def search_default_device(self):
|
||||||
device_list = sd.query_devices()
|
device_list = sd.query_devices()
|
||||||
default_device_list = []
|
mic_index = sd.default.device[0]
|
||||||
for i in sd.default.device:
|
name_mic = device_list[mic_index]["name"]
|
||||||
default_device_list.append({"name": device_list[i]["name"], "index": device_list[i]["index"]})
|
name_spk = str(sc.default_speaker().name)
|
||||||
return default_device_list
|
return name_mic, name_spk
|
||||||
|
|
||||||
def set_mic(self, device_name, threshold=50, is_dynamic=False):
|
def set_mic(self, device_name, threshold=50, is_dynamic=False):
|
||||||
input_device_list = self.search_input_device()
|
input_device_list = self.search_input_device()
|
||||||
@@ -68,14 +83,86 @@ class VoiceRecognizer():
|
|||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def listen_voice(self, language):
|
def listen_mic(self):
|
||||||
if self.mic != None:
|
if self.mic != None:
|
||||||
with self.mic as source:
|
with self.mic as source:
|
||||||
audio = self.r.listen(source)
|
audio = self.r.listen(source)
|
||||||
try:
|
self.queue_mic.put(audio)
|
||||||
text = self.r.recognize_google(audio, language=language)
|
|
||||||
return text
|
def recognize_mic(self, language):
|
||||||
except:
|
try:
|
||||||
return ""
|
audio = self.queue_mic.get()
|
||||||
else:
|
text = self.r.recognize_google(audio, language=language)
|
||||||
return False
|
except:
|
||||||
|
text = ""
|
||||||
|
return text
|
||||||
|
|
||||||
|
def set_spk(self, device_name=None, sample_rate=16000, interval=3, buffer_size=4096, language="en-US"):
    """Store the speaker (loopback) capture settings on the recognizer.

    Args:
        device_name: Name of the speaker to capture. When ``None`` the
            current default speaker reported by ``soundcard`` is looked up
            at call time (not at import time), so importing this module
            never touches the audio hardware and the default is never stale.
        sample_rate: Capture sample rate, stored as ``spk_sample_rate``.
        interval: Stored as ``spk_interval``; multiplied by the sample rate
            elsewhere in this class to size the capture buffer.
        buffer_size: Number of frames requested per ``record`` call, stored
            as ``spk_buffer_size``.
        language: Language tag stored as ``spk_language``.
    """
    if device_name is None:
        # Resolve lazily: a default argument expression would run once,
        # when the class body is executed.
        device_name = str(sc.default_speaker().name)

    self.spk_device_name = device_name
    self.spk_sample_rate = sample_rate
    self.spk_interval = interval
    self.spk_buffer_size = buffer_size
    self.spk_language = language
|
||||||
|
|
||||||
|
def init_spk(self):
    """Allocate an empty speaker capture buffer and reset the fill position.

    The buffer holds ``spk_sample_rate * spk_interval`` samples plus one
    extra ``spk_buffer_size`` chunk of headroom. The length is truncated to
    an integer so that ``np.empty`` accepts it even when ``spk_interval`` is
    fractional.
    """
    frames = int(self.spk_sample_rate * self.spk_interval) + int(self.spk_buffer_size)
    self.spk_audio = np.empty(frames, dtype=np.float32)
    # Number of valid samples currently held in ``spk_audio``.
    self.n = 0
|
||||||
|
|
||||||
|
def listen_spk(self):
    """Capture speaker loopback audio until one interval is buffered, then queue it.

    Recording continues from the partially filled buffer kept in
    ``self.spk_audio`` / ``self.n``. Once at least
    ``spk_sample_rate * spk_interval`` samples are held, the buffer is cut at
    the quietest point of its last fifth: everything before the cut is put on
    ``self.queue_spk`` and the remainder is carried into a fresh buffer for
    the next call.
    """
    buffer = self.spk_audio
    filled = self.n
    # Integer sample count that must be buffered before a chunk is emitted.
    target = int(self.spk_sample_rate * self.spk_interval)

    loopback = sc.get_microphone(id=self.spk_device_name, include_loopback=True)
    with loopback.recorder(samplerate=self.spk_sample_rate, channels=1) as source:
        while filled < target:
            data = source.record(self.spk_buffer_size)
            buffer[filled:filled + len(data)] = data.reshape(-1)
            filled += len(data)

    # Smooth the squared amplitude of the last fifth with a 100-sample moving
    # average and cut where it is lowest, so the split lands on a quiet spot.
    cut = filled * 4 // 5
    power = np.convolve(buffer[cut:filled] ** 2, np.ones(100) / 100, 'same')
    cut += power.argmin()

    # The queued slice is a view of the old buffer; that buffer is never
    # written again because a new one is allocated below, so no copy is needed.
    self.queue_spk.put(buffer[:cut])

    carry = np.empty(target + int(self.spk_buffer_size), dtype=np.float32)
    carry[:filled - cut] = buffer[cut:filled]
    self.spk_audio = carry
    self.n = filled - cut
|
||||||
|
|
||||||
|
def recognize_spk(self, language=None):
    """Transcribe the next queued speaker chunk with Google speech recognition.

    Blocks until a chunk is available on ``self.queue_spk``, writes the
    samples to an in-memory WAV file at ``self.spk_sample_rate``, reads it
    back through ``speech_recognition`` and passes it to
    ``recognize_google``.

    Args:
        language: Language tag passed to ``recognize_google``. When ``None``
            the value stored in ``self.spk_language`` is used, so existing
            calls without arguments behave as before.

    Returns:
        The recognized text, or ``""`` if any step raised an exception.
    """
    if language is None:
        language = self.spk_language

    try:
        samples = self.queue_spk.get()
        with io.BytesIO() as memory_file:
            sf.write(file=memory_file, data=samples, format="WAV", samplerate=self.spk_sample_rate)
            memory_file.seek(0)
            with sr.AudioFile(memory_file) as source:
                audio = self.r.record(source)
        text = self.r.recognize_google(audio, language=language)
    except Exception:
        # Best effort: any failure in this step yields an empty transcript
        # instead of stopping the caller's loop.
        text = ""
    return text
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: transcribe whatever is playing on the default
    # speaker and print each result.
    import time
    import threading

    vr = VoiceRecognizer()
    vr.set_spk(language="ja-JP")
    vr.init_spk()

    def vr_listen_spk():
        # Producer: keeps filling the speaker queue with audio chunks.
        while True:
            vr.listen_spk()

    def vr_recognize_spk():
        # Consumer: transcribes queued chunks one at a time.
        while True:
            text = vr.recognize_spk()
            print(text)

    # Daemon threads so the process can exit on Ctrl+C even though both
    # workers loop forever.
    th_vr_listen_spk = threading.Thread(target=vr_listen_spk, daemon=True)
    th_vr_recognize_spk = threading.Thread(target=vr_recognize_spk, daemon=True)
    th_vr_listen_spk.start()
    th_vr_recognize_spk.start()

    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        pass
|
||||||
@@ -277,7 +277,7 @@ class ToplevelWindowConfig(customtkinter.CTkToplevel):
|
|||||||
self.label_input_speaker_device.grid(row=4, column=0, columnspan=1, padx=5, pady=5, sticky="nsw")
|
self.label_input_speaker_device.grid(row=4, column=0, columnspan=1, padx=5, pady=5, sticky="nsw")
|
||||||
self.optionmenu_input_speaker_device = customtkinter.CTkOptionMenu(
|
self.optionmenu_input_speaker_device = customtkinter.CTkOptionMenu(
|
||||||
self.tabview_config.tab("Transcription"),
|
self.tabview_config.tab("Transcription"),
|
||||||
values=[device["name"] for device in self.parent.vr.search_output_device()],
|
values=self.parent.vr.search_output_device(),
|
||||||
# command=self.optionmenu_input_speaker_device_callback,
|
# command=self.optionmenu_input_speaker_device_callback,
|
||||||
font=customtkinter.CTkFont(family=self.parent.FONT_FAMILY),
|
font=customtkinter.CTkFont(family=self.parent.FONT_FAMILY),
|
||||||
variable=customtkinter.StringVar(value=self.parent.CHOICE_SPEAKER_DEVICE),
|
variable=customtkinter.StringVar(value=self.parent.CHOICE_SPEAKER_DEVICE),
|
||||||
|
|||||||
Reference in New Issue
Block a user