スレッドセーフなトークナイザーアクセスのためにロックを追加。トークナイザーの呼び出し時に発生する可能性のあるRuntimeErrorを防ぐために、アクセスを直列化。

This commit is contained in:
misyaguziya
2025-10-09 18:43:12 +09:00
parent 944577eaf4
commit 35e8d7dda9

View File

@@ -1,6 +1,7 @@
from sudachipy import tokenizer from sudachipy import tokenizer
from sudachipy import dictionary from sudachipy import dictionary
from typing import List, Dict, Any from typing import List, Dict, Any
import threading
try: try:
from .transliteration_kana_to_hepburn import katakana_to_hepburn from .transliteration_kana_to_hepburn import katakana_to_hepburn
except ImportError: except ImportError:
@@ -14,6 +15,9 @@ class Transliterator:
def __init__(self) -> None: def __init__(self) -> None:
self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create() self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
self.mode = tokenizer.Tokenizer.SplitMode.C self.mode = tokenizer.Tokenizer.SplitMode.C
# Lock to prevent concurrent access to the sudachipy tokenizer, which may
# internally use Rust/PyO3 borrow semantics and raise "Already borrowed".
self._tokenizer_lock = threading.Lock()
@staticmethod @staticmethod
def is_kanji(ch: str) -> bool: def is_kanji(ch: str) -> bool:
@@ -132,7 +136,10 @@ class Transliterator:
results. results.
""" """
tokens = self.tokenizer_obj.tokenize(text, self.mode) # Tokenizer may raise RuntimeError: Already borrowed when called
# concurrently. Protect the call with a lock to serialize access.
with self._tokenizer_lock:
tokens = self.tokenizer_obj.tokenize(text, self.mode)
results: List[Dict[str, Any]] = [] results: List[Dict[str, Any]] = []
for t in tokens: for t in tokens: