From 593eed7988660d479fb111662456678ca4b80fd6 Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Wed, 19 Jun 2024 17:34:57 +0900
Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20Model=20:=20speaker?=
 =?UTF-8?q?=20channel=20audio=5Fsources["channels"]=20->=202=20=E3=81=AB?=
 =?UTF-8?q?=E5=9B=BA=E5=AE=9A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 models/transcription/transcription_transcriber.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index 82f1e981..3766b826 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -99,7 +99,7 @@ class AudioTranscriber:
     def processSpeakerData(self):
         temp_file = BytesIO()
         with wave.open(temp_file, 'wb') as wf:
-            wf.setnchannels(self.audio_sources["channels"])
+            wf.setnchannels(2)
             wf.setsampwidth(get_sample_size(paInt16))
             wf.setframerate(self.audio_sources["sample_rate"])
             wf.writeframes(self.audio_sources["last_sample"])

From 44b36801ed1a597a6b523990e603e0a03283d78e Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Thu, 20 Jun 2024 00:56:12 +0900
Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20Model=20:=20speaker?=
 =?UTF-8?q?=20channel=20audio=5Fsources["channels"]=20>=3D=202=20=E3=81=AE?=
 =?UTF-8?q?=E6=99=82sample=5Frate=E3=82=92=E8=AA=BF=E6=95=B4=E3=81=99?=
 =?UTF-8?q?=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E4=BF=AE=E6=AD=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 models/transcription/transcription_transcriber.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index 3766b826..c1856b34 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -97,11 +97,19 @@ class AudioTranscriber:
         return audio_data
 
     def processSpeakerData(self):
+        original_channels = self.audio_sources["channels"]
+        if original_channels <= 2:
+            channels = original_channels
+            sample_rate = self.audio_sources["sample_rate"]
+        else:
+            channels = 2
+            sample_rate = self.audio_sources["sample_rate"]*original_channels/2
+
         temp_file = BytesIO()
         with wave.open(temp_file, 'wb') as wf:
-            wf.setnchannels(2)
+            wf.setnchannels(channels)
             wf.setsampwidth(get_sample_size(paInt16))
-            wf.setframerate(self.audio_sources["sample_rate"])
+            wf.setframerate(sample_rate)
             wf.writeframes(self.audio_sources["last_sample"])
         temp_file.seek(0)
         with AudioFile(temp_file) as source:

From 5bb3152d02c60114a2a6396424be431d1df1984a Mon Sep 17 00:00:00 2001
From: misyaguziya <misyaguziya@gmail.com>
Date: Thu, 20 Jun 2024 16:59:59 +0900
Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20Model=20:=20speaker?=
 =?UTF-8?q?=E3=81=8C=E3=82=B5=E3=83=A9=E3=82=A6=E3=83=B3=E3=83=89=E3=83=87?=
 =?UTF-8?q?=E3=83=90=E3=82=A4=E3=82=B9=E3=81=AE=E5=A0=B4=E5=90=88=E3=81=AB?=
 =?UTF-8?q?=E9=9F=B3=E5=A3=B0=E3=81=8C=E6=96=87=E5=AD=97=E8=B5=B7=E3=81=93?=
 =?UTF-8?q?=E3=81=97=E3=81=95=E3=82=8C=E3=81=AA=E3=81=84=E5=95=8F=E9=A1=8C?=
 =?UTF-8?q?=E3=82=92=E4=BF=AE=E6=AD=A3=20#10?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../transcription_transcriber.py              | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/models/transcription/transcription_transcriber.py b/models/transcription/transcription_transcriber.py
index c1856b34..a535cd8a 100644
--- a/models/transcription/transcription_transcriber.py
+++ b/models/transcription/transcription_transcriber.py
@@ -10,6 +10,7 @@ from .transcription_whisper import getWhisperModel, checkWhisperWeight
 
 import torch
 import numpy as np
+from pydub import AudioSegment
 
 PHRASE_TIMEOUT = 3
 MAX_PHRASES = 10
@@ -97,21 +98,21 @@ class AudioTranscriber:
         return audio_data
 
     def processSpeakerData(self):
-        original_channels = self.audio_sources["channels"]
-        if original_channels <= 2:
-            channels = original_channels
-            sample_rate = self.audio_sources["sample_rate"]
-        else:
-            channels = 2
-            sample_rate = self.audio_sources["sample_rate"]*original_channels/2
-
         temp_file = BytesIO()
         with wave.open(temp_file, 'wb') as wf:
-            wf.setnchannels(channels)
+            wf.setnchannels(self.audio_sources["channels"])
             wf.setsampwidth(get_sample_size(paInt16))
-            wf.setframerate(sample_rate)
+            wf.setframerate(self.audio_sources["sample_rate"])
             wf.writeframes(self.audio_sources["last_sample"])
         temp_file.seek(0)
+
+        if self.audio_sources["channels"] > 2:
+            audio = AudioSegment.from_file(temp_file, format="wav")
+            mono_audio = audio.set_channels(1)
+            temp_file = BytesIO()
+            mono_audio.export(temp_file, format="wav")
+            temp_file.seek(0)
+
         with AudioFile(temp_file) as source:
             audio = self.audio_recognizer.record(source)
         return audio