スレッドセーフなトークナイザーアクセスのためにロックを追加。トークナイザーの呼び出し時に発生する可能性のあるRuntimeErrorを防ぐために、アクセスを直列化。

This commit is contained in:
misyaguziya
2025-10-09 18:43:12 +09:00
parent 944577eaf4
commit 35e8d7dda9

View File

@@ -1,6 +1,7 @@
from sudachipy import tokenizer
from sudachipy import dictionary
from typing import List, Dict, Any
import threading
try:
from .transliteration_kana_to_hepburn import katakana_to_hepburn
except ImportError:
@@ -14,6 +15,9 @@ class Transliterator:
def __init__(self) -> None:
    """Create the Sudachi tokenizer, pick its split mode, and set up a lock.

    The tokenizer is built from the "full" dictionary and split mode C is
    stored alongside it. A lock is also created so that tokenizer calls
    can be serialized.
    """
    full_dictionary = dictionary.Dictionary(dict_type="full")
    split_modes = tokenizer.Tokenizer.SplitMode

    self.tokenizer_obj = full_dictionary.create()
    self.mode = split_modes.C

    # The sudachipy tokenizer may rely on Rust/PyO3 borrow semantics
    # internally and raise "Already borrowed" when used concurrently;
    # this lock exists to guard access to it.
    self._tokenizer_lock = threading.Lock()
@staticmethod
def is_kanji(ch: str) -> bool:
@@ -132,7 +136,10 @@ class Transliterator:
results.
"""
tokens = self.tokenizer_obj.tokenize(text, self.mode)
# The tokenizer may raise 'RuntimeError: Already borrowed' when it is
# called concurrently, so hold the lock to serialize access to it.
with self._tokenizer_lock:
tokens = self.tokenizer_obj.tokenize(text, self.mode)
results: List[Dict[str, Any]] = []
for t in tokens: