diff --git a/AudioTranscriber.py b/AudioTranscriber.py
index 22c07dce..715e065b 100644
--- a/AudioTranscriber.py
+++ b/AudioTranscriber.py
@@ -1,14 +1,18 @@
-
+import os
+from io import BytesIO
+import tempfile
 import threading
+import wave
 import custom_speech_recognition as sr
 from datetime import timedelta
-from heapq import merge
+import pyaudiowpatch as pyaudio
 
 PHRASE_TIMEOUT = 3.05
-MAX_PHRASES = 10
+MAX_PHRASES = 5
 
 class AudioTranscriber:
-    def __init__(self, source, language):
+    def __init__(self, speaker, source, language):
+        self.speaker = speaker
         self.language = language
         self.transcript_data = []
         self.transcript_changed_event = threading.Event()
@@ -20,6 +24,7 @@ class AudioTranscriber:
             "last_sample": bytes(),
             "last_spoken": None,
             "new_phrase": True,
+            "process_data_func": self.process_speaker_data if speaker else self.process_mic_data
         }
 
     def transcribe_audio_queue(self, audio_queue):
@@ -29,12 +34,14 @@ class AudioTranscriber:
             text = ''
 
             try:
-                audio_data = self.process_data()
+                fd, path = tempfile.mkstemp(suffix=".wav")
+                os.close(fd)
+                audio_data = self.audio_sources["process_data_func"](path)
                 text = self.audio_recognizer.recognize_google(audio_data, language=self.language)
             except Exception as e:
                 pass
             finally:
-                pass
+                os.unlink(path)
 
             if text != '':
                 self.update_transcript(text)
@@ -50,11 +57,23 @@ class AudioTranscriber:
         source_info["last_sample"] += data
         source_info["last_spoken"] = time_spoken
 
-    def process_data(self):
-        print(self.audio_sources["last_sample"])
+    def process_mic_data(self, path=None):
+        """Wrap the buffered samples in sr.AudioData; `path` is accepted for the shared call signature and unused."""
         audio_data = sr.AudioData(self.audio_sources["last_sample"], self.audio_sources["sample_rate"], self.audio_sources["sample_width"])
         return audio_data
 
+    def process_speaker_data(self, path):
+        """Write the buffered samples to the WAV file at `path`, then return the audio recorded back from it."""
+        with wave.open(path, 'wb') as wf:
+            wf.setnchannels(self.audio_sources["channels"])
+            # Module-level helper: avoids creating (and never terminating) a PyAudio instance per call.
+            wf.setsampwidth(pyaudio.get_sample_size(pyaudio.paInt16))
+            wf.setframerate(self.audio_sources["sample_rate"])
+            wf.writeframes(self.audio_sources["last_sample"])
+        with sr.AudioFile(path) as source:
+            audio = self.audio_recognizer.record(source)
+        return audio
+
     def update_transcript(self, text):
         source_info = self.audio_sources
         transcript = self.transcript_data
diff --git a/test_main.py b/test_main.py
index 0b13baee..8821a081 100644
--- a/test_main.py
+++ b/test_main.py
@@ -6,12 +6,11 @@ import AudioRecorder
 import audio_utils
 
 mic_audio_queue = queue.Queue()
-
 mic_device = audio_utils.get_default_input_device()
 mic_audio_recorder = AudioRecorder.SelectedMicRecorder(mic_device)
 mic_audio_recorder.record_into_queue(mic_audio_queue)
 
-mic_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
+mic_transcriber = AudioTranscriber.AudioTranscriber(speaker=False, source=mic_audio_recorder.source, language="ja-JP")
 mic_transcribe = threading.Thread(target=mic_transcriber.transcribe_audio_queue, args=(mic_audio_queue,))
 mic_transcribe.daemon = True
 mic_transcribe.start()
@@ -23,7 +22,7 @@ spk_device = audio_utils.get_default_output_device()
 spk_audio_recorder = AudioRecorder.SelectedSpeakerRecorder(spk_device)
 spk_audio_recorder.record_into_queue(spk_audio_queue)
 
-spk_transcriber = AudioTranscriber.AudioTranscriber(source=mic_audio_recorder.source, language="ja-JP")
+spk_transcriber = AudioTranscriber.AudioTranscriber(speaker=True, source=spk_audio_recorder.source, language="ja-JP")
 spk_transcribe = threading.Thread(target=spk_transcriber.transcribe_audio_queue, args=(spk_audio_queue,))
 spk_transcribe.daemon = True
 spk_transcribe.start()
@@ -32,7 +31,7 @@ while True:
     text = mic_transcriber.get_transcript()
     if len(text) > 0:
         print("mic:", text)
-    # text = spk_transcriber.get_transcript()
-    # if len(text) > 0:
-    #     print("spk:", text)
+    text = spk_transcriber.get_transcript()
+    if len(text) > 0:
+        print("spk:", text)
     time.sleep(0.1)
\ No newline at end of file