From 1f5a2e92cc7577d78f367fc8f0dc3c4392e62ad5 Mon Sep 17 00:00:00 2001
From: misyaguziya
Date: Wed, 15 Nov 2023 14:38:37 +0900
Subject: [PATCH] =?UTF-8?q?[Add]=20Model:=20ctranslate2=E3=81=AE=E3=83=86?=
 =?UTF-8?q?=E3=82=B9=E3=83=88=E3=82=B3=E3=83=BC=E3=83=89=E3=82=92=E8=BF=BD?=
 =?UTF-8?q?=E5=8A=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 model.py                                     |   4 +-
 models/translation/translation_languages.py  | 109 +++++++++++++++++++
 models/translation/translation_translator.py |  50 +++++++++
 requirements.txt                             |   5 +-
 4 files changed, 165 insertions(+), 3 deletions(-)

Reviewer notes (this area is ignored when the patch is applied):
- The file had lost its line breaks and indentation; the patch layout was
  rebuilt. Indentation and blank lines of unchanged context lines are
  reconstructed, so confirm them against the target tree (or apply with
  --ignore-whitespace) before relying on this patch.
- Changed versus the original submission: the absolute D:\ weight path became
  CTRANSLATE2_WEIGHT_DIR, the tokenizer id is read from TRANSLATE_MODELS, the
  debug print was removed, translate_ctranslate2 reports failures like the
  method before it (traceback to error.log, return False), Javanese maps to
  'jv', and the new requirement pins use the spacing of the existing entries.
- The post-image blob ids on the index lines of the three touched files
  predate these changes.
- Still open: model.py routes every request to translate_ctranslate2
  regardless of translator_name, and transformers[torch] is unpinned.

diff --git a/model.py b/model.py
index bff713f4..431898ad 100644
--- a/model.py
+++ b/model.py
@@ -161,7 +161,7 @@ class Model:
         elif target_language in ["Portuguese European", "Portuguese Brazilian"]:
             target_language = "Portuguese"
 
-        translation = self.translator.translate(
+        translation = self.translator.translate_ctranslate2(
             translator_name=translator_name,
             source_language=source_language,
             target_language=target_language,
@@ -192,7 +192,7 @@ class Model:
         elif target_language in ["Portuguese European", "Portuguese Brazilian"]:
             target_language = "Portuguese"
 
-        translation = self.translator.translate(
+        translation = self.translator.translate_ctranslate2(
             translator_name=translator_name,
             source_language=source_language,
             target_language=target_language,
diff --git a/models/translation/translation_languages.py b/models/translation/translation_languages.py
index ae57d4cc..d5645cd2 100644
--- a/models/translation/translation_languages.py
+++ b/models/translation/translation_languages.py
@@ -240,4 +240,113 @@ dict_bing_languages = {
 translation_lang["Bing"] = {
     "source":dict_bing_languages,
     "target":dict_bing_languages,
+}
+
+# Display name -> language code that Translator.translate_ctranslate2 hands to the
+# tokenizer (as src_lang and as the lang_code_to_token key).
+dict_ctranslate2_lang = {
+    'English': 'en',
+    'Chinese': 'zh',
+    'German': 'de',
+    'Spanish': 'es',
+    'Russian': 'ru',
+    'Korean': 'ko',
+    'French': 'fr',
+    'Japanese': 'ja',
+    'Portuguese': 'pt',
+    'Turkish': 'tr',
+    'Polish': 'pl',
+    'Catalan': 'ca',
+    'Dutch': 'nl',
+    'Arabic': 'ar',
+    'Swedish': 'sv',
+    'Italian': 'it',
+    'Indonesian': 'id',
+    'Hindi': 'hi',
+    'Finnish': 'fi',
+    'Vietnamese': 'vi',
+    'Hebrew': 'he',
+    'Ukrainian': 'uk',
+    'Greek': 'el',
+    'Malay': 'ms',
+    'Czech': 'cs',
+    'Romanian': 'ro',
+    'Danish': 'da',
+    'Hungarian': 'hu',
+    'Tamil': 'ta',
+    'Norwegian': 'no',
+    'Thai': 'th',
+    'Urdu': 'ur',
+    'Croatian': 'hr',
+    'Bulgarian': 'bg',
+    'Lithuanian': 'lt',
+    'Latin': 'la',
+    'Maori': 'mi',
+    'Malayalam': 'ml',
+    'Welsh': 'cy',
+    'Slovak': 'sk',
+    'Telugu': 'te',
+    'Persian': 'fa',
+    'Latvian': 'lv',
+    'Bengali': 'bn',
+    'Serbian': 'sr',
+    'Azerbaijani': 'az',
+    'Slovenian': 'sl',
+    'Kannada': 'kn',
+    'Estonian': 'et',
+    'Macedonian': 'mk',
+    'Breton': 'br',
+    'Basque': 'eu',
+    'Icelandic': 'is',
+    'Armenian': 'hy',
+    'Nepali': 'ne',
+    'Mongolian': 'mn',
+    'Bosnian': 'bs',
+    'Kazakh': 'kk',
+    'Albanian': 'sq',
+    'Swahili': 'sw',
+    'Galician': 'gl',
+    'Marathi': 'mr',
+    'Punjabi': 'pa',
+    'Sinhala': 'si',
+    'Khmer': 'km',
+    'Shona': 'sn',
+    'Yoruba': 'yo',
+    'Somali': 'so',
+    'Afrikaans': 'af',
+    'Occitan': 'oc',
+    'Georgian': 'ka',
+    'Belarusian': 'be',
+    'Tajik': 'tg',
+    'Sindhi': 'sd',
+    'Gujarati': 'gu',
+    'Amharic': 'am',
+    'Yiddish': 'yi',
+    'Lao': 'lo',
+    'Uzbek': 'uz',
+    'Faroese': 'fo',
+    'Haitian creole': 'ht',
+    'Pashto': 'ps',
+    'Turkmen': 'tk',
+    'Nynorsk': 'nn',
+    'Maltese': 'mt',
+    'Sanskrit': 'sa',
+    'Luxembourgish': 'lb',
+    'Myanmar': 'my',
+    'Tibetan': 'bo',
+    'Tagalog': 'tl',
+    'Malagasy': 'mg',
+    'Assamese': 'as',
+    'Tatar': 'tt',
+    'Hawaiian': 'haw',
+    'Lingala': 'ln',
+    'Hausa': 'ha',
+    'Bashkir': 'ba',
+    'Javanese': 'jv',
+    'Sundanese': 'su'
+}
+
+translation_lang["ctranslate2"] = {
+    "source":dict_ctranslate2_lang,
+    "target":dict_ctranslate2_lang,
 }
\ No newline at end of file
diff --git a/models/translation/translation_translator.py b/models/translation/translation_translator.py
index c3a5682b..d15a05c4 100644
--- a/models/translation/translation_translator.py
+++ b/models/translation/translation_translator.py
@@ -3,12 +3,34 @@
 from deepl_translate import translate as deepl_web_Translator
 from translators import translate_text as other_web_Translator
 from .translation_languages import translation_lang
+from ctranslate2.converters import TransformersConverter
+import ctranslate2
+import transformers
+import os
+
+# Hugging Face ids of the M2M100 checkpoints that can back the CTranslate2 translator.
+TRANSLATE_MODELS = {
+    "small": "facebook/m2m100_418M",
+    "large": "facebook/m2m100_1.2B"
+}
+
+# Directory with the converted CTranslate2 weights. Set VRCT_CTRANSLATE2_WEIGHT_DIR to
+# override it; the default is the "weight" directory that sits next to the "models"
+# package (presumably the repository root -- confirm against the project layout).
+_REPOSITORY_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+CTRANSLATE2_WEIGHT_DIR = os.environ.get("VRCT_CTRANSLATE2_WEIGHT_DIR", os.path.join(_REPOSITORY_ROOT, "weight"))
+
 # Translator
 class Translator():
     def __init__(self):
         pass
         self.translator_status = {}
 
+        # Load the model and tokenizer once; the weights are expected to be a conversion
+        # of the checkpoint the tokenizer comes from (TODO confirm for the shipped weights).
+        self.translator = ctranslate2.Translator(CTRANSLATE2_WEIGHT_DIR, device="cpu", device_index=0, compute_type="int8", inter_threads=1, intra_threads=4)
+        self.tokenizer = transformers.AutoTokenizer.from_pretrained(TRANSLATE_MODELS["small"])
+
     def authentication(self, translator_name, authkey=None):
         result = True
         match translator_name:
@@ -57,4 +79,32 @@ class Translator():
             with open('error.log', 'a') as f:
                 traceback.print_exc(file=f)
             result = False
+        return result
+
+    def translate_ctranslate2(self, translator_name, source_language, target_language, message):
+        """Translate message with the local CTranslate2 model.
+
+        translator_name is unused; it is kept so the call sites can pass the same
+        keyword arguments they pass to translate(). source_language and
+        target_language are display names from translation_lang["ctranslate2"].
+        Returns the translated text, or False after appending the traceback to
+        error.log when the lookup, tokenizer or model raises.
+        """
+        try:
+            source_code = translation_lang["ctranslate2"]["source"][source_language]
+            target_code = translation_lang["ctranslate2"]["target"][target_language]
+
+            self.tokenizer.src_lang = source_code
+            source = self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(message))
+            target_prefix = [self.tokenizer.lang_code_to_token[target_code]]
+            results = self.translator.translate_batch([source], target_prefix=[target_prefix])
+            # The hypothesis starts with the forced target-language token; drop it.
+            target = results[0].hypotheses[0][1:]
+
+            result = self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(target))
+        except Exception:
+            import traceback
+            with open('error.log', 'a') as f:
+                traceback.print_exc(file=f)
+            result = False
         return result
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 42f4be2c..ca48a2b1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,4 +6,7 @@ deepl == 1.15.0
 flashtext == 2.7
 pyyaml == 6.0.1
 python-i18n == 0.3.9
-CTkToolTip == 0.8
\ No newline at end of file
+CTkToolTip == 0.8
+transformers[torch]
+sentencepiece == 0.1.99
+ctranslate2 == 3.21.0
\ No newline at end of file