👍️ [Update] pythonのメイン処理部分を移動/webui_mainloop.pyをビルドできるように修正
This commit is contained in:
730
src-python/models/transcription/transcription_languages.py
Normal file
730
src-python/models/transcription/transcription_languages.py
Normal file
@@ -0,0 +1,730 @@
|
||||
def _engine_codes(google, whisper):
    """Return the per-engine language codes for one country entry."""
    return {"Google": google, "Whisper": whisper}


# Lookup table: language name -> country/region name -> engine name -> code.
# The key order of the original table is preserved.
# Entries kept as comments below carry only a "Google" code in the original
# table (no "Whisper" code is given for them).
transcription_lang = {
    "Afrikaans": {"South Africa": _engine_codes("af-ZA", "af")},
    "Albanian": {"Albania": _engine_codes("sq-AL", "sq")},
    "Amharic": {"Ethiopia": _engine_codes("am-ET", "am")},
    "Arabic": {
        "Algeria": _engine_codes("ar-DZ", "ar"),
        "Bahrain": _engine_codes("ar-BH", "ar"),
        "Egypt": _engine_codes("ar-EG", "ar"),
        "Israel": _engine_codes("ar-IL", "ar"),
        "Iraq": _engine_codes("ar-IQ", "ar"),
        "Jordan": _engine_codes("ar-JO", "ar"),
        "Kuwait": _engine_codes("ar-KW", "ar"),
        "Lebanon": _engine_codes("ar-LB", "ar"),
        "Mauritania": _engine_codes("ar-MR", "ar"),
        "Morocco": _engine_codes("ar-MA", "ar"),
        "Oman": _engine_codes("ar-OM", "ar"),
        "Qatar": _engine_codes("ar-QA", "ar"),
        "Saudi Arabia": _engine_codes("ar-SA", "ar"),
        "Palestine": _engine_codes("ar-PS", "ar"),
        "Syria": _engine_codes("ar-SY", "ar"),
        "Tunisia": _engine_codes("ar-TN", "ar"),
        "United Arab Emirates": _engine_codes("ar-AE", "ar"),
        "Yemen": _engine_codes("ar-YE", "ar"),
    },
    "Armenian": {"Armenia": _engine_codes("hy-AM", "hy")},
    "Azerbaijani": {"Azerbaijan": _engine_codes("az-AZ", "az")},
    "Basque": {"Spain": _engine_codes("eu-ES", "eu")},
    "Bengali": {
        "Bangladesh": _engine_codes("bn-BD", "bn"),
        "India": _engine_codes("bn-IN", "bn"),
    },
    "Bosnian": {"Bosnia and Herzegovina": _engine_codes("bs-BA", "bs")},
    "Bulgarian": {"Bulgaria": _engine_codes("bg-BG", "bg")},
    "Burmese": {"Myanmar": _engine_codes("my-MM", "my")},
    "Catalan": {"Spain": _engine_codes("ca-ES", "ca")},
    "Chinese Simplified": {
        "China": _engine_codes("cmn-Hans-CN", "zh"),
        "Hong Kong": _engine_codes("cmn-Hans-HK", "zh"),
    },
    "Chinese Traditional": {
        "Taiwan": _engine_codes("cmn-Hant-TW", "zh"),
        "Hong Kong": _engine_codes("yue-Hant-HK", "yue"),
    },
    "Croatian": {"Croatia": _engine_codes("hr-HR", "hr")},
    "Czech": {"Czech Republic": _engine_codes("cs-CZ", "cs")},
    "Danish": {"Denmark": _engine_codes("da-DK", "da")},
    "Dutch": {
        "Belgium": _engine_codes("nl-BE", "nl"),
        "Netherlands": _engine_codes("nl-NL", "nl"),
    },
    "English": {
        "Australia": _engine_codes("en-AU", "en"),
        "Canada": _engine_codes("en-CA", "en"),
        "Ghana": _engine_codes("en-GH", "en"),
        "Hong Kong": _engine_codes("en-HK", "en"),
        "India": _engine_codes("en-IN", "en"),
        "Ireland": _engine_codes("en-IE", "en"),
        "Kenya": _engine_codes("en-KE", "en"),
        "New Zealand": _engine_codes("en-NZ", "en"),
        "Nigeria": _engine_codes("en-NG", "en"),
        "Philippines": _engine_codes("en-PH", "en"),
        "Singapore": _engine_codes("en-SG", "en"),
        "South Africa": _engine_codes("en-ZA", "en"),
        "Tanzania": _engine_codes("en-TZ", "en"),
        "United Kingdom": _engine_codes("en-GB", "en"),
        "United States": _engine_codes("en-US", "en"),
    },
    "Estonian": {"Estonia": _engine_codes("et-EE", "et")},
    "Filipino": {"Philippines": _engine_codes("fil-PH", "tl")},
    "Finnish": {"Finland": _engine_codes("fi-FI", "fi")},
    "French": {
        "Belgium": _engine_codes("fr-BE", "fr"),
        "Canada": _engine_codes("fr-CA", "fr"),
        "France": _engine_codes("fr-FR", "fr"),
        "Switzerland": _engine_codes("fr-CH", "fr"),
    },
    "Galician": {"Spain": _engine_codes("gl-ES", "gl")},
    "Georgian": {"Georgia": _engine_codes("ka-GE", "ka")},
    "German": {
        "Austria": _engine_codes("de-AT", "de"),
        "Germany": _engine_codes("de-DE", "de"),
        "Switzerland": _engine_codes("de-CH", "de"),
    },
    "Greek": {"Greece": _engine_codes("el-GR", "el")},
    "Gujarati": {"India": _engine_codes("gu-IN", "gu")},
    "Hebrew": {"Israel": _engine_codes("iw-IL", "he")},
    "Hindi": {"India": _engine_codes("hi-IN", "hi")},
    "Hungarian": {"Hungary": _engine_codes("hu-HU", "hu")},
    "Icelandic": {"Iceland": _engine_codes("is-IS", "is")},
    "Indonesian": {"Indonesia": _engine_codes("id-ID", "id")},
    "Italian": {
        "Italy": _engine_codes("it-IT", "it"),
        "Switzerland": _engine_codes("it-CH", "it"),
    },
    "Japanese": {"Japan": _engine_codes("ja-JP", "ja")},
    # "Javanese": {"Indonesia": {"Google": "jv-ID"}},
    "Kannada": {"India": _engine_codes("kn-IN", "kn")},
    "Kazakh": {"Kazakhstan": _engine_codes("kk-KZ", "kk")},
    "Khmer": {"Cambodia": _engine_codes("km-KH", "km")},
    # "Kinyarwanda": {"rwanda": {"Google": "rw-RW"}},
    "Korean": {"South Korea": _engine_codes("ko-KR", "ko")},
    "Lao": {"Laos": _engine_codes("lo-LA", "lo")},
    "Latvian": {"Latvia": _engine_codes("lv-LV", "lv")},
    "Lithuanian": {"Lithuania": _engine_codes("lt-LT", "lt")},
    "Macedonian": {"North Macedonia": _engine_codes("mk-MK", "mk")},
    "Malay": {"Malaysia": _engine_codes("ms-MY", "ms")},
    "Malayalam": {"India": _engine_codes("ml-IN", "ml")},
    "Mongolian": {"Mongolia": _engine_codes("mn-MN", "mn")},
    "Nepali": {"Nepal": _engine_codes("ne-NP", "ne")},
    "Norwegian": {"Norway": _engine_codes("no-NO", "no")},
    "Persian": {"Iran": _engine_codes("fa-IR", "fa")},
    "Polish": {"Poland": _engine_codes("pl-PL", "pl")},
    "Portuguese": {
        "Brazil": _engine_codes("pt-BR", "pt"),
        "Portugal": _engine_codes("pt-PT", "pt"),
    },
    # "Punjabi": {"India": {"Google": "pa-Guru-IN"}},
    "Romanian": {"Romania": _engine_codes("ro-RO", "ro")},
    "Russian": {"Russia": _engine_codes("ru-RU", "ru")},
    "Serbian": {"Serbia": _engine_codes("sr-RS", "sr")},
    "Sinhala": {"Sri Lanka": _engine_codes("si-LK", "si")},
    "Slovak": {"Slovakia": _engine_codes("sk-SK", "sk")},
    "Slovenian": {"Slovenia": _engine_codes("sl-SI", "sl")},
    # "Sesotho": {"South Africa": {"Google": "st-ZA"}},
    "Spanish": {
        "Argentina": _engine_codes("es-AR", "es"),
        "Bolivia": _engine_codes("es-BO", "es"),
        "Chile": _engine_codes("es-CL", "es"),
        "Colombia": _engine_codes("es-CO", "es"),
        "Costa Rica": _engine_codes("es-CR", "es"),
        "Dominican Republic": _engine_codes("es-DO", "es"),
        "Ecuador": _engine_codes("es-EC", "es"),
        "El Salvador": _engine_codes("es-SV", "es"),
        "Guatemala": _engine_codes("es-GT", "es"),
        "Honduras": _engine_codes("es-HN", "es"),
        "Mexico": _engine_codes("es-MX", "es"),
        "Nicaragua": _engine_codes("es-NI", "es"),
        "Panama": _engine_codes("es-PA", "es"),
        "Paraguay": _engine_codes("es-PY", "es"),
        "Peru": _engine_codes("es-PE", "es"),
        "Puerto Rico": _engine_codes("es-PR", "es"),
        "Spain": _engine_codes("es-ES", "es"),
        "United States": _engine_codes("es-US", "es"),
        "Uruguay": _engine_codes("es-UY", "es"),
        "Venezuela": _engine_codes("es-VE", "es"),
    },
    "Sundanese": {"Indonesia": _engine_codes("su-ID", "su")},
    "Swahili": {
        "Kenya": _engine_codes("sw-KE", "sw"),
        "Tanzania": _engine_codes("sw-TZ", "sw"),
    },
    # "Swazi": {"Eswatini": {"Google": "ss-Latn-ZA"}},
    "Swedish": {"Sweden": _engine_codes("sv-SE", "sv")},
    "Tamil": {
        "India": _engine_codes("ta-IN", "ta"),
        "malaysia": _engine_codes("ta-MY", "ta"),
        "Singapore": _engine_codes("ta-SG", "ta"),
        "Sri Lanka": _engine_codes("ta-LK", "ta"),
    },
    "Telugu": {"India": _engine_codes("te-IN", "te")},
    "Thai": {"Thailand": _engine_codes("th-TH", "th")},
    # "Tsonga": {"South Africa": {"Google": "ts-ZA"}},
    # "Setswana": {"South Africa": {"Google": "tn-Latn-ZA"}},
    "Turkish": {"Turkey": _engine_codes("tr-TR", "tr")},
    "Ukrainian": {"Ukraine": _engine_codes("uk-UA", "uk")},
    "Urdu": {
        "India": _engine_codes("ur-IN", "ur"),
        "Pakistan": _engine_codes("ur-PK", "ur"),
    },
    "Uzbek": {"Uzbekistan": _engine_codes("uz-UZ", "uz")},
    # "Venda": {"South Africa": {"Google": "ve-ZA"}},
    "Vietnamese": {"Vietnam": _engine_codes("vi-VN", "vi")},
    # "Xhosa": {"South Africa": {"Google": "xh-ZA"}},
    # "Zulu": {"South Africa": {"Google": "zu-ZA"}},
}
|
||||
142
src-python/models/transcription/transcription_recorder.py
Normal file
142
src-python/models/transcription/transcription_recorder.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from speech_recognition import Recognizer, Microphone
|
||||
from pyaudiowpatch import get_sample_size, paInt16
|
||||
from datetime import datetime
|
||||
from queue import Queue
|
||||
|
||||
class BaseRecorder:
    """Record phrases from an audio source in the background into a queue.

    Wraps a ``Recognizer`` configured with the given energy settings and
    exposes the handles returned by its background listener as ``stop``,
    ``pause`` and ``resume``.
    """

    def __init__(self, source, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Configure the recognizer for ``source``.

        Args:
            source: Audio source handed to the recognizer; must not be None.
            energy_threshold: Value assigned to ``Recognizer.energy_threshold``.
            dynamic_energy_threshold: Value assigned to
                ``Recognizer.dynamic_energy_threshold``.
            record_timeout: Passed as ``phrase_time_limit`` when listening.

        Raises:
            ValueError: If ``source`` is None.
        """
        # Validate before building anything so a bad call fails immediately.
        if source is None:
            raise ValueError("audio source can't be None")

        self.recorder = Recognizer()
        self.recorder.energy_threshold = energy_threshold
        self.recorder.dynamic_energy_threshold = dynamic_energy_threshold
        self.record_timeout = record_timeout
        self.source = source

        # Listener handles; all three are filled in by recordIntoQueue().
        # Initialised here so they can be inspected before recording starts.
        self.stop = None
        self.pause = None
        self.resume = None

    def adjustForNoise(self):
        """Calibrate the recognizer against ambient noise read from the source."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)

    def recordIntoQueue(self, audio_queue):
        """Start background listening and enqueue each captured phrase.

        Every item put on ``audio_queue`` is a ``(raw_bytes, timestamp)``
        tuple, where the timestamp is ``datetime.now()`` at callback time.
        """
        def record_callback(_, audio):
            audio_queue.put((audio.get_raw_data(), datetime.now()))

        # The recognizer used here returns three handles from
        # listen_in_background (presumably callables controlling the listener).
        self.stop, self.pause, self.resume = self.recorder.listen_in_background(
            self.source,
            record_callback,
            phrase_time_limit=self.record_timeout,
        )
|
||||
|
||||
class SelectedMicRecorder(BaseRecorder):
    """Phrase recorder bound to one microphone device."""

    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Open ``device`` as a ``Microphone`` and initialise the base recorder.

        ``device`` is a mapping providing ``'index'`` and
        ``"defaultSampleRate"``.
        """
        mic_options = {
            "device_index": device['index'],
            "sample_rate": int(device["defaultSampleRate"]),
        }
        super().__init__(
            source=Microphone(**mic_options),
            energy_threshold=energy_threshold,
            dynamic_energy_threshold=dynamic_energy_threshold,
            record_timeout=record_timeout,
        )
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
|
||||
class SelectedSpeakerRecorder(BaseRecorder):
    """Phrase recorder bound to one speaker (output) device."""

    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Open ``device`` with ``speaker=True`` and initialise the base recorder.

        ``device`` is a mapping providing ``"index"``, ``"defaultSampleRate"``
        and ``"maxInputChannels"``.
        """
        speaker_options = {
            "speaker": True,
            "device_index": device["index"],
            "sample_rate": int(device["defaultSampleRate"]),
            # Chunk size is set to the byte width of one paInt16 sample.
            "chunk_size": get_sample_size(paInt16),
            "channels": device["maxInputChannels"],
        }
        super().__init__(
            source=Microphone(**speaker_options),
            energy_threshold=energy_threshold,
            dynamic_energy_threshold=dynamic_energy_threshold,
            record_timeout=record_timeout,
        )
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
|
||||
class BaseEnergyRecorder:
    """Stream energy readings from an audio source into a queue.

    The recognizer is configured with a zero energy threshold and dynamic
    thresholding switched off.
    """

    def __init__(self, source):
        """Configure the recognizer for ``source``.

        Args:
            source: Audio source handed to the recognizer; must not be None.

        Raises:
            ValueError: If ``source`` is None.
        """
        # Validate before building anything so a bad call fails immediately.
        if source is None:
            raise ValueError("audio source can't be None")

        self.recorder = Recognizer()
        self.recorder.energy_threshold = 0
        self.recorder.dynamic_energy_threshold = False
        self.record_timeout = 0
        self.source = source

        # Listener handles; all three are filled in by recordIntoQueue().
        # Initialised here so they can be inspected before recording starts.
        self.stop = None
        self.pause = None
        self.resume = None

    def adjustForNoise(self):
        """Calibrate the recognizer against ambient noise read from the source."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)

    def recordIntoQueue(self, energy_queue):
        """Start background energy listening and enqueue each reading."""
        def recordCallback(_, energy):
            energy_queue.put(energy)

        # listen_energy_in_background is provided by the Recognizer this
        # project uses; it returns three handles that are stored as-is.
        self.stop, self.pause, self.resume = self.recorder.listen_energy_in_background(
            self.source,
            recordCallback,
        )
|
||||
|
||||
class SelectedMicEnergyRecorder(BaseEnergyRecorder):
    """Energy recorder bound to one microphone device."""

    def __init__(self, device):
        """Open ``device`` as a ``Microphone`` and initialise the base recorder.

        ``device`` is a mapping providing ``'index'`` and
        ``"defaultSampleRate"``.
        """
        mic_options = {
            "device_index": device['index'],
            "sample_rate": int(device["defaultSampleRate"]),
        }
        super().__init__(source=Microphone(**mic_options))
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
|
||||
class SelectedSpeakerEnergyRecorder(BaseEnergyRecorder):
    """Energy recorder bound to one speaker (output) device."""

    def __init__(self, device):
        """Open ``device`` with ``speaker=True`` and initialise the base recorder.

        ``device`` is a mapping providing ``"index"``, ``"defaultSampleRate"``
        and ``"maxInputChannels"``.
        """
        speaker_options = {
            "speaker": True,
            "device_index": device["index"],
            "sample_rate": int(device["defaultSampleRate"]),
            "channels": device["maxInputChannels"],
        }
        super().__init__(source=Microphone(**speaker_options))
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
|
||||
class BaseEnergyAndAudioRecorder:
    """Record phrases and, optionally, energy readings from one audio source.

    Combines phrase capture and energy reporting through a single
    background listener on the recognizer.
    """

    def __init__(self, source, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Configure the recognizer for ``source``.

        Args:
            source: Audio source handed to the recognizer; must not be None.
            energy_threshold: Value assigned to ``Recognizer.energy_threshold``.
            dynamic_energy_threshold: Value assigned to
                ``Recognizer.dynamic_energy_threshold``.
            record_timeout: Passed as ``phrase_time_limit`` when listening.

        Raises:
            ValueError: If ``source`` is None.
        """
        # Validate before building anything so a bad call fails immediately.
        if source is None:
            raise ValueError("audio source can't be None")

        self.recorder = Recognizer()
        self.recorder.energy_threshold = energy_threshold
        self.recorder.dynamic_energy_threshold = dynamic_energy_threshold
        self.record_timeout = record_timeout
        self.source = source

        # Listener handles; all three are filled in by recordIntoQueue().
        # Initialised here so they can be inspected before recording starts.
        self.stop = None
        self.pause = None
        self.resume = None

    def adjustForNoise(self):
        """Calibrate the recognizer against ambient noise read from the source."""
        with self.source:
            self.recorder.adjust_for_ambient_noise(self.source)

    def recordIntoQueue(self, audio_queue, energy_queue=None):
        """Start background listening.

        Args:
            audio_queue: Receives ``(raw_bytes, timestamp)`` tuples, one per
                captured phrase; the timestamp is ``datetime.now()``.
            energy_queue: Optional queue receiving energy readings. When it
                is None no energy callback is registered.
        """
        def audioRecordCallback(_, audio):
            audio_queue.put((audio.get_raw_data(), datetime.now()))

        def energyRecordCallback(energy):
            energy_queue.put(energy)

        # listen_energy_and_audio_in_background is provided by the Recognizer
        # this project uses; it returns three handles that are stored as-is.
        self.stop, self.pause, self.resume = self.recorder.listen_energy_and_audio_in_background(
            source=self.source,
            callback=audioRecordCallback,
            phrase_time_limit=self.record_timeout,
            callback_energy=energyRecordCallback if energy_queue is not None else None,
        )
|
||||
|
||||
class SelectedMicEnergyAndAudioRecorder(BaseEnergyAndAudioRecorder):
    """Phrase-and-energy recorder bound to one microphone device."""

    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Open ``device`` as a ``Microphone`` and initialise the base recorder.

        ``device`` is a mapping providing ``'index'`` and
        ``"defaultSampleRate"``.
        """
        mic_options = {
            "device_index": device['index'],
            "sample_rate": int(device["defaultSampleRate"]),
        }
        super().__init__(
            source=Microphone(**mic_options),
            energy_threshold=energy_threshold,
            dynamic_energy_threshold=dynamic_energy_threshold,
            record_timeout=record_timeout,
        )
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
|
||||
class SelectedSpeakerEnergyAndAudioRecorder(BaseEnergyAndAudioRecorder):
    """Phrase-and-energy recorder bound to one speaker (output) device."""

    def __init__(self, device, energy_threshold, dynamic_energy_threshold, record_timeout):
        """Open ``device`` with ``speaker=True`` and initialise the base recorder.

        ``device`` is a mapping providing ``"index"``, ``"defaultSampleRate"``
        and ``"maxInputChannels"``.
        """
        speaker_options = {
            "speaker": True,
            "device_index": device["index"],
            "sample_rate": int(device["defaultSampleRate"]),
            # Chunk size is set to the byte width of one paInt16 sample.
            "chunk_size": get_sample_size(paInt16),
            "channels": device["maxInputChannels"],
        }
        super().__init__(
            source=Microphone(**speaker_options),
            energy_threshold=energy_threshold,
            dynamic_energy_threshold=dynamic_energy_threshold,
            record_timeout=record_timeout,
        )
        # NOTE: ambient-noise calibration (adjustForNoise) is left disabled here.
|
||||
141
src-python/models/transcription/transcription_transcriber.py
Normal file
141
src-python/models/transcription/transcription_transcriber.py
Normal file
@@ -0,0 +1,141 @@
|
||||
import time
|
||||
from io import BytesIO
|
||||
from threading import Event
|
||||
import wave
|
||||
from speech_recognition import Recognizer, AudioData, AudioFile
|
||||
from datetime import timedelta
|
||||
from pyaudiowpatch import get_sample_size, paInt16
|
||||
from .transcription_languages import transcription_lang
|
||||
from .transcription_whisper import getWhisperModel, checkWhisperWeight
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
from pydub import AudioSegment
|
||||
|
||||
# Module-level constants. They are not referenced by the code in this module;
# presumably callers use them as the phrase_timeout / max_phrases arguments of
# AudioTranscriber -- confirm against the call sites.
PHRASE_TIMEOUT = 3

MAX_PHRASES = 10
|
||||
|
||||
class AudioTranscriber:
|
||||
def __init__(self, speaker, source, phrase_timeout, max_phrases, transcription_engine, root=None, whisper_weight_type=None):
|
||||
self.speaker = speaker
|
||||
self.phrase_timeout = phrase_timeout
|
||||
self.max_phrases = max_phrases
|
||||
self.transcript_data = []
|
||||
self.transcript_changed_event = Event()
|
||||
self.audio_recognizer = Recognizer()
|
||||
self.transcription_engine = "Google"
|
||||
self.whisper_model = None
|
||||
self.audio_sources = {
|
||||
"sample_rate": source.SAMPLE_RATE,
|
||||
"sample_width": source.SAMPLE_WIDTH,
|
||||
"channels": source.channels,
|
||||
"last_sample": bytes(),
|
||||
"last_spoken": None,
|
||||
"new_phrase": True,
|
||||
"process_data_func": self.processSpeakerData if speaker else self.processSpeakerData
|
||||
}
|
||||
|
||||
if transcription_engine == "Whisper" and checkWhisperWeight(root, whisper_weight_type) is True:
|
||||
self.whisper_model = getWhisperModel(root, whisper_weight_type)
|
||||
self.transcription_engine = "Whisper"
|
||||
|
||||
def transcribeAudioQueue(self, audio_queue, language, country, avg_logprob=-0.8, no_speech_prob=0.6):
|
||||
if audio_queue.empty():
|
||||
time.sleep(0.01)
|
||||
return False
|
||||
audio, time_spoken = audio_queue.get()
|
||||
self.updateLastSampleAndPhraseStatus(audio, time_spoken)
|
||||
|
||||
text = ''
|
||||
try:
|
||||
audio_data = self.audio_sources["process_data_func"]()
|
||||
match self.transcription_engine:
|
||||
case "Google":
|
||||
text = self.audio_recognizer.recognize_google(audio_data, language=transcription_lang[language][country][self.transcription_engine])
|
||||
case "Whisper":
|
||||
audio_data = np.frombuffer(audio_data.get_raw_data(convert_rate=16000, convert_width=2), np.int16).flatten().astype(np.float32) / 32768.0
|
||||
if isinstance(audio_data, torch.Tensor):
|
||||
audio_data = audio_data.detach().numpy()
|
||||
segments, _ = self.whisper_model.transcribe(
|
||||
audio_data,
|
||||
beam_size=5,
|
||||
temperature=0.0,
|
||||
log_prob_threshold=-0.8,
|
||||
no_speech_threshold=0.6,
|
||||
language=transcription_lang[language][country][self.transcription_engine],
|
||||
word_timestamps=False,
|
||||
without_timestamps=True,
|
||||
task="transcribe",
|
||||
vad_filter=False,
|
||||
)
|
||||
for s in segments:
|
||||
if s.avg_logprob < avg_logprob or s.no_speech_prob > no_speech_prob:
|
||||
continue
|
||||
text += s.text
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
pass
|
||||
|
||||
if text != '':
|
||||
self.updateTranscript(text)
|
||||
return True
|
||||
|
||||
def updateLastSampleAndPhraseStatus(self, data, time_spoken):
|
||||
source_info = self.audio_sources
|
||||
if source_info["last_spoken"] and time_spoken - source_info["last_spoken"] > timedelta(seconds=self.phrase_timeout):
|
||||
source_info["last_sample"] = bytes()
|
||||
source_info["new_phrase"] = True
|
||||
else:
|
||||
source_info["new_phrase"] = False
|
||||
|
||||
source_info["last_sample"] += data
|
||||
source_info["last_spoken"] = time_spoken
|
||||
|
||||
def processMicData(self):
|
||||
audio_data = AudioData(self.audio_sources["last_sample"], self.audio_sources["sample_rate"], self.audio_sources["sample_width"])
|
||||
return audio_data
|
||||
|
||||
def processSpeakerData(self):
|
||||
temp_file = BytesIO()
|
||||
with wave.open(temp_file, 'wb') as wf:
|
||||
wf.setnchannels(self.audio_sources["channels"])
|
||||
wf.setsampwidth(get_sample_size(paInt16))
|
||||
wf.setframerate(self.audio_sources["sample_rate"])
|
||||
wf.writeframes(self.audio_sources["last_sample"])
|
||||
temp_file.seek(0)
|
||||
|
||||
if self.audio_sources["channels"] > 2:
|
||||
audio = AudioSegment.from_file(temp_file, format="wav")
|
||||
mono_audio = audio.set_channels(1)
|
||||
temp_file = BytesIO()
|
||||
mono_audio.export(temp_file, format="wav")
|
||||
temp_file.seek(0)
|
||||
|
||||
with AudioFile(temp_file) as source:
|
||||
audio = self.audio_recognizer.record(source)
|
||||
return audio
|
||||
|
||||
def updateTranscript(self, text):
|
||||
source_info = self.audio_sources
|
||||
transcript = self.transcript_data
|
||||
|
||||
if source_info["new_phrase"] or len(transcript) == 0:
|
||||
if len(transcript) > self.max_phrases:
|
||||
transcript.pop(-1)
|
||||
transcript.insert(0, text)
|
||||
else:
|
||||
transcript[0] = text
|
||||
|
||||
def getTranscript(self):
|
||||
if len(self.transcript_data) > 0:
|
||||
text = self.transcript_data.pop(-1)
|
||||
else:
|
||||
text = ""
|
||||
return text
|
||||
|
||||
def clearTranscriptData(self):
|
||||
self.transcript_data.clear()
|
||||
self.audio_sources["last_sample"] = bytes()
|
||||
self.audio_sources["new_phrase"] = True
|
||||
70
src-python/models/transcription/transcription_utils.py
Normal file
70
src-python/models/transcription/transcription_utils.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from pyaudiowpatch import PyAudio, paWASAPI
|
||||
|
||||
def getInputDevices():
    """Return the non-loopback input devices grouped by host API name.

    Returns:
        A dict mapping each host API name to the list of its device-info
        dicts that have at least one input channel and are not loopback
        devices. When nothing qualifies, ``{"NoHost": [{"name": "NoDevice"}]}``.
    """
    grouped = {}
    with PyAudio() as p:
        for host_index in range(p.get_host_api_count()):
            host = p.get_host_api_info_by_index(host_index)
            for device_index in range(host['deviceCount']):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["maxInputChannels"] > 0 and device["isLoopbackDevice"] is False:
                    grouped.setdefault(host["name"], []).append(device)

    if not grouped:
        return {"NoHost": [{"name": "NoDevice"}]}
    return grouped
|
||||
|
||||
def getDefaultInputDevice():
    """Return the default input device together with its host API.

    The device index is taken from the default host API's
    ``"defaultInputDevice"`` field and searched across all hosts.

    Returns:
        ``{"host": host_info, "device": device_info}`` for the first device
        whose ``"index"`` matches, otherwise
        ``{"host": {"name": "NoHost"}, "device": {"name": "NoDevice"}}``.
    """
    with PyAudio() as p:
        wanted_index = p.get_default_host_api_info()["defaultInputDevice"]

        for host_index in range(p.get_host_api_count()):
            host = p.get_host_api_info_by_index(host_index)
            for device_index in range(host['deviceCount']):
                candidate = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if candidate["index"] == wanted_index:
                    return {"host": host, "device": candidate}

    return {"host": {"name": "NoHost"}, "device": {"name": "NoDevice"}}
|
||||
|
||||
def getOutputDevices():
    """Return the loopback devices that correspond to WASAPI output devices.

    For every non-loopback device of the WASAPI host, each loopback device
    whose name contains that device's name is collected. Duplicates are
    removed while keeping the order of first discovery, so the result is
    stable from run to run.

    Returns:
        A list of loopback device-info dicts, or ``[{"name": "NoDevice"}]``
        when none was found.
    """
    devices = []
    with PyAudio() as p:
        wasapi_info = p.get_host_api_info_by_type(paWASAPI)
        for host_index in range(p.get_host_api_count()):
            host = p.get_host_api_info_by_index(host_index)
            if host["name"] != wasapi_info["name"]:
                continue
            for device_index in range(host['deviceCount']):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                if device["isLoopbackDevice"]:
                    continue
                for loopback in p.get_loopback_device_info_generator():
                    if device["name"] in loopback["name"]:
                        devices.append(loopback)

    if not devices:
        return [{"name": "NoDevice"}]

    # A dict keyed by the item tuple removes duplicates but, unlike a set,
    # keeps insertion order.
    return list({tuple(d.items()): d for d in devices}.values())
|
||||
|
||||
def getDefaultOutputDevice():
    """Return the loopback device matching the default WASAPI output device.

    The device whose ``"index"`` equals the WASAPI host's
    ``"defaultOutputDevice"`` is located; if it is not itself a loopback
    device, the first loopback device whose name contains its name is
    returned.

    Returns:
        ``{"device": loopback_info}`` on success, otherwise
        ``{"device": {"name": "NoDevice"}}``.
    """
    with PyAudio() as p:
        target_index = p.get_host_api_info_by_type(paWASAPI)["defaultOutputDevice"]

        for host_index in range(p.get_host_api_count()):
            device_count = p.get_host_api_info_by_index(host_index)['deviceCount']
            for device_index in range(device_count):
                device = p.get_device_info_by_host_api_device_index(host_index, device_index)
                # Only the default output device, and only when it is not
                # already a loopback device, is matched against loopbacks.
                if device["index"] != target_index or device["isLoopbackDevice"]:
                    continue
                for loopback in p.get_loopback_device_info_generator():
                    if device["name"] in loopback["name"]:
                        return {"device": loopback}

    return {"device": {"name": "NoDevice"}}
|
||||
|
||||
if __name__ == "__main__":
    # Manual check: print what the output-device helpers find on this machine.
    for label, probe in (
        ("getOutputDevices()", getOutputDevices),
        ("getDefaultOutputDevice()", getDefaultOutputDevice),
    ):
        print(label, probe())
|
||||
98
src-python/models/transcription/transcription_whisper.py
Normal file
98
src-python/models/transcription/transcription_whisper.py
Normal file
@@ -0,0 +1,98 @@
|
||||
from os import path as os_path, makedirs as os_makedirs
|
||||
from requests import get as requests_get
|
||||
from typing import Callable
|
||||
import huggingface_hub
|
||||
from faster_whisper import WhisperModel
|
||||
import logging
|
||||
logger = logging.getLogger('faster_whisper')
|
||||
logger.setLevel(logging.CRITICAL)
|
||||
|
||||
_MODELS = {
|
||||
"tiny": "Systran/faster-whisper-tiny",
|
||||
"base": "Systran/faster-whisper-base",
|
||||
"small": "Systran/faster-whisper-small",
|
||||
"medium": "Systran/faster-whisper-medium",
|
||||
"large-v1": "Systran/faster-whisper-large-v1",
|
||||
"large-v2": "Systran/faster-whisper-large-v2",
|
||||
"large-v3": "Systran/faster-whisper-large-v3",
|
||||
}
|
||||
|
||||
_FILENAMES = [
|
||||
"config.json",
|
||||
"preprocessor_config.json",
|
||||
"model.bin",
|
||||
"tokenizer.json",
|
||||
"vocabulary.txt",
|
||||
"vocabulary.json",
|
||||
]
|
||||
|
||||
def downloadFile(url, path, func=None):
    """Stream ``url`` into the file at ``path``.

    Args:
        url: Address to download.
        path: Destination file path; the file is opened in binary write mode.
        func: Optional callable invoked after each chunk with the fraction
            downloaded so far (bytes written / ``content-length``). It is
            not called when the response carries no usable
            ``content-length``, since no fraction can be computed.

    Any exception is caught and reported with ``print``; nothing is raised.
    """
    try:
        # The context manager releases the connection even on failure.
        with requests_get(url, stream=True) as res:
            res.raise_for_status()
            file_size = int(res.headers.get('content-length', 0))
            # Guard against division by zero when the size is unknown;
            # without it the download aborted after the first chunk.
            report_progress = callable(func) and file_size > 0
            total_chunk = 0
            with open(path, 'wb') as file:
                for chunk in res.iter_content(chunk_size=1024*5):
                    file.write(chunk)
                    if report_progress:
                        total_chunk += len(chunk)
                        func(total_chunk/file_size)

    except Exception as e:
        print("error:downloadFile()", e)
|
||||
|
||||
def checkWhisperWeight(root, weight_type):
    """Report whether the local weights for ``weight_type`` can be loaded.

    The check builds a ``WhisperModel`` from
    ``<root>/weights/whisper/<weight_type>`` with ``local_files_only=True``.

    Returns:
        True if the model was constructed, False if construction raised.
    """
    weight_dir = os_path.join(root, "weights", "whisper", weight_type)
    try:
        WhisperModel(
            weight_dir,
            device="cpu",
            device_index=0,
            compute_type="int8",
            cpu_threads=4,
            num_workers=1,
            local_files_only=True,
        )
    except Exception:
        return False
    return True
|
||||
|
||||
def downloadWhisperWeight(root, weight_type, callbackFunc):
    """Download the weight files for ``weight_type`` unless already usable.

    The target directory ``<root>/weights/whisper/<weight_type>`` is created
    first. If checkWhisperWeight() then succeeds, nothing is downloaded;
    otherwise every name in ``_FILENAMES`` is fetched into that directory.

    Args:
        root: Base directory for the weights tree.
        weight_type: Key into ``_MODELS``.
        callbackFunc: Passed to downloadFile() as its ``func`` argument.
    """
    weight_dir = os_path.join(root, "weights", "whisper", weight_type)
    os_makedirs(weight_dir, exist_ok=True)

    already_present = checkWhisperWeight(root, weight_type) is True
    if not already_present:
        for filename in _FILENAMES:
            print("Downloading", filename, "...")
            destination = os_path.join(weight_dir, filename)
            source_url = huggingface_hub.hf_hub_url(_MODELS[weight_type], filename)
            downloadFile(source_url, destination, func=callbackFunc)
|
||||
|
||||
def getWhisperModel(root, weight_type):
    """Load the ``WhisperModel`` stored under ``<root>/weights/whisper/<weight_type>``.

    The model is created for CPU with int8 compute, four CPU threads, one
    worker, and ``local_files_only=True``.
    """
    load_options = {
        "device": "cpu",
        "device_index": 0,
        "compute_type": "int8",
        "cpu_threads": 4,
        "num_workers": 1,
        "local_files_only": True,
    }
    weight_dir = os_path.join(root, "weights", "whisper", weight_type)
    return WhisperModel(weight_dir, **load_options)
|
||||
|
||||
if __name__ == "__main__":
    def callback(value):
        """Print each progress value reported during a download."""
        print(value)

    # Manual run: fetch every known weight into the current directory.
    for weight_name in ("tiny", "base", "small", "medium", "large-v1", "large-v2", "large-v3"):
        downloadWhisperWeight("./", weight_name, callback)
|
||||
Reference in New Issue
Block a user