[Add] Model: ctranslate2のテストコードを追加

This commit is contained in:
misyaguziya
2023-11-15 14:38:37 +09:00
parent ad7080438f
commit 1f5a2e92cc
4 changed files with 140 additions and 3 deletions

View File

@@ -161,7 +161,7 @@ class Model:
elif target_language in ["Portuguese European", "Portuguese Brazilian"]:
target_language = "Portuguese"
translation = self.translator.translate(
translation = self.translator.translate_ctranslate2(
translator_name=translator_name,
source_language=source_language,
target_language=target_language,
@@ -192,7 +192,7 @@ class Model:
elif target_language in ["Portuguese European", "Portuguese Brazilian"]:
target_language = "Portuguese"
translation = self.translator.translate(
translation = self.translator.translate_ctranslate2(
translator_name=translator_name,
source_language=source_language,
target_language=target_language,

View File

@@ -241,3 +241,110 @@ translation_lang["Bing"] = {
"source":dict_bing_languages,
"target":dict_bing_languages,
}
dict_ctranslate2_lang = {
'English': 'en',
'Chinese': 'zh',
'German': 'de',
'Spanish': 'es',
'Russian': 'ru',
'Korean': 'ko',
'French': 'fr',
'Japanese': 'ja',
'Portuguese': 'pt',
'Turkish': 'tr',
'Polish': 'pl',
'Catalan': 'ca',
'Dutch': 'nl',
'Arabic': 'ar',
'Swedish': 'sv',
'Italian': 'it',
'Indonesian': 'id',
'Hindi': 'hi',
'Finnish': 'fi',
'Vietnamese': 'vi',
'Hebrew': 'he',
'Ukrainian': 'uk',
'Greek': 'el',
'Malay': 'ms',
'Czech': 'cs',
'Romanian': 'ro',
'Danish': 'da',
'Hungarian': 'hu',
'Tamil': 'ta',
'Norwegian': 'no',
'Thai': 'th',
'Urdu': 'ur',
'Croatian': 'hr',
'Bulgarian': 'bg',
'Lithuanian': 'lt',
'Latin': 'la',
'Maori': 'mi',
'Malayalam': 'ml',
'Welsh': 'cy',
'Slovak': 'sk',
'Telugu': 'te',
'Persian': 'fa',
'Latvian': 'lv',
'Bengali': 'bn',
'Serbian': 'sr',
'Azerbaijani': 'az',
'Slovenian': 'sl',
'Kannada': 'kn',
'Estonian': 'et',
'Macedonian': 'mk',
'Breton': 'br',
'Basque': 'eu',
'Icelandic': 'is',
'Armenian': 'hy',
'Nepali': 'ne',
'Mongolian': 'mn',
'Bosnian': 'bs',
'Kazakh': 'kk',
'Albanian': 'sq',
'Swahili': 'sw',
'Galician': 'gl',
'Marathi': 'mr',
'Punjabi': 'pa',
'Sinhala': 'si',
'Khmer': 'km',
'Shona': 'sn',
'Yoruba': 'yo',
'Somali': 'so',
'Afrikaans': 'af',
'Occitan': 'oc',
'Georgian': 'ka',
'Belarusian': 'be',
'Tajik': 'tg',
'Sindhi': 'sd',
'Gujarati': 'gu',
'Amharic': 'am',
'Yiddish': 'yi',
'Lao': 'lo',
'Uzbek': 'uz',
'Faroese': 'fo',
'Haitian creole': 'ht',
'Pashto': 'ps',
'Turkmen': 'tk',
'Nynorsk': 'nn',
'Maltese': 'mt',
'Sanskrit': 'sa',
'Luxembourgish': 'lb',
'Myanmar': 'my',
'Tibetan': 'bo',
'Tagalog': 'tl',
'Malagasy': 'mg',
'Assamese': 'as',
'Tatar': 'tt',
'Hawaiian': 'haw',
'Lingala': 'ln',
'Hausa': 'ha',
'Bashkir': 'ba',
'Javanese': 'jw',
'Sundanese': 'su'
}
# Register the ctranslate2 translator: the same name -> code table is used
# for both the source and the target side.
translation_lang["ctranslate2"] = dict(
    source=dict_ctranslate2_lang,
    target=dict_ctranslate2_lang,
)

View File

@@ -3,12 +3,24 @@ from deepl_translate import translate as deepl_web_Translator
from translators import translate_text as other_web_Translator
from .translation_languages import translation_lang
from ctranslate2.converters import TransformersConverter
import ctranslate2
import transformers
# Pretrained model names keyed by a size label.
TRANSLATE_MODELS = dict(
    small="facebook/m2m100_418M",
    large="facebook/m2m100_1.2B",
)
# Translator
class Translator():
def __init__(self, weight_path="D:\\WORKSPACE\\WORK\\VRChatProject\\VRCT\\weight", model_name=None):
    """Load the CTranslate2 translator and its matching tokenizer.

    Args:
        weight_path: Directory passed to ``ctranslate2.Translator`` as the
            model location. The default keeps the previously hard-coded
            value so existing ``Translator()`` calls behave the same.
            NOTE(review): that default is an absolute path on one
            developer machine -- callers should pass a real location.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``.
            ``None`` selects ``TRANSLATE_MODELS["small"]``, which is the
            name that used to be written inline here.
    """
    self.translator_status = {}
    if model_name is None:
        model_name = TRANSLATE_MODELS["small"]
    self.translator = ctranslate2.Translator(
        weight_path,
        device="cpu",
        device_index=0,
        compute_type="int8",
        inter_threads=1,
        intra_threads=4,
    )
    self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
def authentication(self, translator_name, authkey=None):
result = True
match translator_name:
@@ -58,3 +70,18 @@ class Translator():
traceback.print_exc(file=f)
result = False
return result
def translate_ctranslate2(self, translator_name, source_language, target_language, message):
    """Translate ``message`` with the local CTranslate2 model.

    Args:
        translator_name: Accepted so the call matches how the other
            translate entry point is invoked; not used in this method.
        source_language: Display name of the source language, looked up
            in ``translation_lang["ctranslate2"]["source"]``.
        target_language: Display name of the target language, looked up
            in ``translation_lang["ctranslate2"]["target"]``.
        message: Text to translate.

    Returns:
        The decoded translation produced from the first hypothesis.

    Raises:
        KeyError: If a language name is missing from the lookup table
            (presumably also if the tokenizer has no token for the
            target code -- verify against the tokenizer).
    """
    languages = translation_lang["ctranslate2"]
    source_code = languages["source"][source_language]
    target_code = languages["target"][target_language]

    # The tokenizer needs the source language set before encoding.
    self.tokenizer.src_lang = source_code
    source_tokens = self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(message))

    # Force the output language by prefixing the target-language token.
    target_prefix = [self.tokenizer.lang_code_to_token[target_code]]
    results = self.translator.translate_batch([source_tokens], target_prefix=[target_prefix])

    # Skip the first output token, which is the forced language prefix.
    target_tokens = results[0].hypotheses[0][1:]
    return self.tokenizer.decode(self.tokenizer.convert_tokens_to_ids(target_tokens))

View File

@@ -7,3 +7,6 @@ flashtext == 2.7
pyyaml == 6.0.1
python-i18n == 0.3.9
CTkToolTip == 0.8
transformers[torch]
sentencepiece==0.1.99
ctranslate2==3.21.0