スレッドセーフなトークナイザーアクセスのためにロックを追加。トークナイザーの呼び出し時に発生する可能性のあるRuntimeErrorを防ぐために、アクセスを直列化。
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
from sudachipy import tokenizer
|
from sudachipy import tokenizer
|
||||||
from sudachipy import dictionary
|
from sudachipy import dictionary
|
||||||
from typing import List, Dict, Any
|
from typing import List, Dict, Any
|
||||||
|
import threading
|
||||||
try:
|
try:
|
||||||
from .transliteration_kana_to_hepburn import katakana_to_hepburn
|
from .transliteration_kana_to_hepburn import katakana_to_hepburn
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@@ -14,6 +15,9 @@ class Transliterator:
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
|
self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
|
||||||
self.mode = tokenizer.Tokenizer.SplitMode.C
|
self.mode = tokenizer.Tokenizer.SplitMode.C
|
||||||
|
# Lock to prevent concurrent access to sudachipy tokenizer which may
|
||||||
|
# internally use Rust/PyO3 borrow semantics and raise "Already borrowed".
|
||||||
|
self._tokenizer_lock = threading.Lock()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_kanji(ch: str) -> bool:
|
def is_kanji(ch: str) -> bool:
|
||||||
@@ -132,7 +136,10 @@ class Transliterator:
|
|||||||
results.
|
results.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
# Tokenizer may raise RuntimeError: Already borrowed when called
|
||||||
|
# concurrently. Protect the call with a lock to serialize access.
|
||||||
|
with self._tokenizer_lock:
|
||||||
|
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
||||||
|
|
||||||
results: List[Dict[str, Any]] = []
|
results: List[Dict[str, Any]] = []
|
||||||
for t in tokens:
|
for t in tokens:
|
||||||
|
|||||||
Reference in New Issue
Block a user