From 86e43371e6701c2ea2a0f6ed6b68657656f1a96c Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Mon, 29 Jan 2024 16:38:13 +0900 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B[bugfix]=20Model=20:=20tokenize?= =?UTF-8?q?r=E3=82=92=E3=83=AD=E3=83=BC=E3=82=AB=E3=83=AB=E3=81=AE?= =?UTF-8?q?=E6=8C=87=E5=AE=9A=E3=83=91=E3=82=B9=E3=81=AB=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E3=81=99=E3=82=8B=E3=82=88=E3=81=86=E3=81=AB=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/translation/translation_translator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/models/translation/translation_translator.py b/models/translation/translation_translator.py index f3d3c99e..d73bb0fb 100644 --- a/models/translation/translation_translator.py +++ b/models/translation/translation_translator.py @@ -28,6 +28,7 @@ class Translator(): directory_name = ctranslate2_weights[model_type]["directory_name"] tokenizer = ctranslate2_weights[model_type]["tokenizer"] weight_path = os.path.join(path, "weight", directory_name) + tokenizer_path = os.path.join(path, "weight", directory_name, "tokenizer") self.ctranslate2_translator = ctranslate2.Translator( weight_path, device="cpu", @@ -36,7 +37,7 @@ class Translator(): inter_threads=1, intra_threads=4 ) - self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer) + self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) @staticmethod def getLanguageCode(translator_name, target_country, source_language, target_language): From 82eab0db3c8aaa9bd87afab142b72ab71440a6f0 Mon Sep 17 00:00:00 2001 From: misyaguziya Date: Mon, 29 Jan 2024 23:27:33 +0900 Subject: [PATCH 2/2] =?UTF-8?q?[bugfix]=20Model=20:=20AutoTokenizer.from?= =?UTF-8?q?=5Fpretrained=E3=81=AF=E9=9D=9EASCII=E6=96=87=E5=AD=97=E3=81=AB?= =?UTF-8?q?=E5=AF=BE=E5=BF=9C=E3=81=97=E3=81=A6=E3=81=84=E3=81=AA=E3=81=84?= 
=?UTF-8?q?=E3=81=9F=E3=82=81=E3=80=81=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ユーザー名に非ASCII文字が含まれる場合、絶対パスでは失敗するので相対パスで対応 --- models/translation/translation_translator.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/models/translation/translation_translator.py b/models/translation/translation_translator.py index d73bb0fb..ea02e490 100644 --- a/models/translation/translation_translator.py +++ b/models/translation/translation_translator.py @@ -37,7 +37,12 @@ class Translator(): inter_threads=1, intra_threads=4 ) - self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) + try: + self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) + except Exception as e: + print("Error: changeCTranslate2Model()", e) + tokenizer_path = os.path.join("./weight", directory_name, "tokenizer") + self.ctranslate2_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer, cache_dir=tokenizer_path) @staticmethod def getLanguageCode(translator_name, target_country, source_language, target_language):