From ba12e39bbc4187bad8831fa67d8e04350c3cd874 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Tue, 30 Jan 2024 02:15:05 +0900
Subject: [PATCH] =?UTF-8?q?[WIP/TEST]=20Model=20:=20faster-whisper?=
 =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../transcription_transcriber.py              | 28 +++++++++++++++++++
 requirements.txt                              |  3 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index bf78566e..fbea0e74 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -6,6 +6,10 @@ from datetime import timedelta
 from pyaudiowpatch import get_sample_size, paInt16
 from .transcription_languages import transcription_lang
 
+import torch
+import numpy as np
+from faster_whisper import WhisperModel
+
 PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
 
@@ -26,6 +30,7 @@ class AudioTranscriber:
                 "new_phrase": True,
                 "process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
         }
+        self.whisper_model = WhisperModel("base", device="cpu", device_index=0, compute_type="int8", cpu_threads=4, num_workers=1)
 
     def transcribeAudioQueue(self, audio_queue, language, country):
         # while True:
@@ -38,6 +43,29 @@ class AudioTranscriber:
             # os.close(fd)
             audio_data = self.audio_sources["process_data_func"]()
             text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country])
+
+            audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
+            if isinstance(audio_data, torch.Tensor):
+                audio_data = audio_data.detach().numpy()
+            segments, _ = self.whisper_model.transcribe(
+                audio_data,
+                beam_size=5,
+                temperature=0.0,
+                log_prob_threshold=-0.8,
+                no_speech_threshold=0.6,
+                language="ja",
+                word_timestamps=False,
+                without_timestamps=True,
+                task="transcribe",
+                vad_filter=False,
+                )
+            _text = ""
+            for s in segments:
+                if s.avg_logprob < -0.8 or s.no_speech_prob > 0.6:
+                    continue
+                _text += s.text
+            print(_text)
+
         except Exception:
             pass
         finally:
diff --git a/requirements.txt b/requirements.txt
index b6e14d85..68a6ce15 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,5 @@ CTkToolTip == 0.8
 pyinstaller==6.2.0
 transformers[torch]
 sentencepiece==0.1.99
-ctranslate2==3.21.0
\ No newline at end of file
+ctranslate2==3.21.0
+faster-whisper==0.10.0
\ No newline at end of file