Files
VRCT/src-python/models/transliteration/transliteration_transliterator.py

158 lines
6.2 KiB
Python

from sudachipy import tokenizer
from sudachipy import dictionary
try:
from .transliteration_kana_to_hepburn import katakana_to_hepburn
except ImportError:
from transliteration_kana_to_hepburn import katakana_to_hepburn
class Transliterator:
    """Tokenize Japanese text with SudachiPy and attach readings to each piece.

    Every piece produced by this class is a dict with four string keys:
    ``orig`` (surface text), ``kana`` (katakana reading), ``hira`` (the same
    reading in hiragana) and ``hepburn`` (the result of
    ``katakana_to_hepburn`` for that reading).
    """

    # Inclusive character ranges that count as kanji when a surface form is
    # split into kanji / non-kanji runs.  The iteration mark "々" is included
    # because it is read as part of the kanji word it follows (e.g. 時々).
    _KANJI_RANGES = (
        ("\u3005", "\u3007"),  # 々 〆 〇
        ("\u3400", "\u4dbf"),  # CJK Unified Ideographs Extension A
        ("\u4e00", "\u9fff"),  # CJK Unified Ideographs
        ("\uf900", "\ufaff"),  # CJK Compatibility Ideographs
    )

    # Katakana U+30A1..U+30F6 (ァ..ヶ) sit exactly 0x60 above their hiragana
    # counterparts U+3041..U+3096 (ぁ..ゖ).
    _KATA_TO_HIRA = {cp: cp - 0x60 for cp in range(0x30A1, 0x30F7)}
    _HIRA_TO_KATA = {cp: cp + 0x60 for cp in range(0x3041, 0x3097)}

    # Top-level part-of-speech tags whose tokens are passed through verbatim
    # (their dictionary reading is replaced by the surface form).
    _VERBATIM_POS = ("記号", "補助記号", "空白")

    def __init__(self):
        """Create a Sudachi tokenizer on the "full" dictionary, split mode C."""
        self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
        self.mode = tokenizer.Tokenizer.SplitMode.C

    @staticmethod
    def is_kanji(ch: str) -> bool:
        """Return True if *ch* falls in one of the kanji ranges."""
        return any(lo <= ch <= hi for lo, hi in Transliterator._KANJI_RANGES)

    @staticmethod
    def kata_to_hira(text: str) -> str:
        """Convert katakana in *text* to hiragana, leaving other characters as-is.

        Characters without a hiragana counterpart (for example the long-vowel
        mark "ー") are returned unchanged.
        """
        return text.translate(Transliterator._KATA_TO_HIRA)

    @staticmethod
    def _hira_to_kata(text: str) -> str:
        """Convert hiragana in *text* to katakana, leaving other characters as-is."""
        return text.translate(Transliterator._HIRA_TO_KATA)

    @staticmethod
    def _make_entry(orig: str, kana: str, use_macron: bool) -> dict:
        """Build one result dict for surface *orig* with katakana reading *kana*."""
        return {
            "orig": orig,
            "kana": kana,
            "hira": Transliterator.kata_to_hira(kana),
            "hepburn": katakana_to_hepburn(kana, use_macron=use_macron),
        }

    @staticmethod
    def _split_blocks(surface: str) -> list:
        """Split *surface* into maximal runs of kanji / non-kanji characters.

        Returns a list of ``(is_kanji, text)`` tuples in surface order; kanji
        and non-kanji runs therefore strictly alternate.
        """
        blocks = []
        run = ""
        run_is_kanji = None
        for ch in surface:
            ch_is_kanji = Transliterator.is_kanji(ch)
            if run and ch_is_kanji != run_is_kanji:
                blocks.append((run_is_kanji, run))
                run = ""
            run += ch
            run_is_kanji = ch_is_kanji
        if run:
            blocks.append((run_is_kanji, run))
        return blocks

    @staticmethod
    def _leading_kanji_reading(blocks: list, index: int, kana_left: str) -> str:
        """Pick the reading of the kanji block at *index* (not the last block).

        *kana_left* is the part of the reading not yet consumed by earlier
        blocks.
        """
        part = blocks[index][1]
        later_blocks = blocks[index + 1:]

        # When more kanji follow, the okurigana right after this block is
        # used as an anchor: the reading of this block ends where that
        # okurigana starts.  Searching from offset 1 guarantees the kanji
        # block receives at least one kana.
        if any(is_k for is_k, _ in later_blocks):
            anchor = Transliterator._hira_to_kata(later_blocks[0][1])
            cut = kana_left.find(anchor, 1)
            if cut > 0:
                return kana_left[:cut]

        # Length-based fallback: reserve one kana per trailing non-kanji
        # character and give the rest to this block.
        trailing = sum(len(p) for is_k, p in later_blocks if not is_k)
        if trailing > 0 and len(kana_left) > trailing:
            return kana_left[:-trailing]
        # Reading too short to reserve anything: one kana per kanji character.
        return kana_left[:max(len(part), len(kana_left) - trailing)]

    @staticmethod
    def _distribute_reading(blocks: list, reading_kana: str) -> list:
        """Assign a slice of *reading_kana* to every block, left to right.

        Returns a list of ``(text, kana)`` tuples, one per block.
        """
        pairs = []
        kana_left = reading_kana
        last_index = len(blocks) - 1
        for i, (is_kan, part) in enumerate(blocks):
            if not is_kan:
                # Non-kanji part (okurigana etc.): one kana per character.
                kana = kana_left[:len(part)]
            elif i == last_index:
                # Final kanji block takes whatever reading is left.
                kana = kana_left
            else:
                kana = Transliterator._leading_kanji_reading(blocks, i, kana_left)
            pairs.append((part, kana))
            kana_left = kana_left[len(kana):]
        return pairs

    @staticmethod
    def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
        """Split one word into kanji / non-kanji parts with their readings.

        Args:
            surface: Surface form of a single word.
            reading_kana: Katakana reading of the whole word.
            use_macron: Forwarded to ``katakana_to_hepburn``.

        Returns:
            A list of ``{"orig", "kana", "hira", "hepburn"}`` dicts, one per
            kanji or non-kanji run of *surface*, in surface order.  An empty
            *surface* yields an empty list.
        """
        blocks = Transliterator._split_blocks(surface)
        return [
            Transliterator._make_entry(part, kana, use_macron)
            for part, kana in Transliterator._distribute_reading(blocks, reading_kana)
        ]

    def analyze(self, text: str, use_macron: bool = False):
        """Tokenize *text* and return the reading entries of every token.

        Args:
            text: Text to analyse.
            use_macron: Forwarded to ``katakana_to_hepburn``.

        Returns:
            A flat list of ``{"orig", "kana", "hira", "hepburn"}`` dicts in
            text order.
        """
        results = []
        for token in self.tokenizer_obj.tokenize(text, self.mode):
            surface = token.surface()
            reading = token.reading_form()
            pos = token.part_of_speech()

            # Symbols and whitespace keep their surface form as the reading.
            if pos and pos[0] in self._VERBATIM_POS:
                reading = surface

            if surface == reading:
                # NOTE(review): any token whose surface already equals its
                # reading (presumably also pure-katakana words) is returned
                # verbatim and is therefore not romanized - confirm intended.
                results.append({
                    "orig": surface,
                    "kana": reading,
                    "hira": surface,
                    "hepburn": surface,
                })
            elif len(surface) == 1:
                # A single character cannot be split any further.
                results.append(self._make_entry(surface, reading, use_macron))
            else:
                results.extend(
                    self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
                )
        return results
# --- Manual smoke test ---
if __name__ == "__main__":
    # Sample inputs: kanji with okurigana, katakana loanwords, mixed
    # Latin/Japanese text, and runs of punctuation, symbols and emoji
    # (each symbol separated by a space).
    test_cases = [
        "美しい花を見る",
        "東京に行く",
        "漢字とカタカナの混在",
        "パーティーに行く",
        "コンピューターを使う",
        "シェアハウスに住む",
        "ヴァイオリンを弾く",
        "ギュウニュウを飲む",
        "ニューヨークに行く",
        "ラーメンを食べる",
        "チョコレートが好き",
        "SessionIDを取得する",
        "取り敢えず検索してみる",
        "見知らぬ土地で冒険する",
        "彼は優れたエンジニアです",
        # Backslash and double quote are escaped explicitly so the literals
        # parse without invalid-escape warnings or premature termination.
        " ".join("[]<>!@#$%^&*()_+-={}|\\;:'\",.<>/?`~"),
        " ".join("「」<>!@#$%^&*()_+-={}|\\;:'\",./?`~"),
        " ".join("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏"),
        " ".join("😀😃😄😁😆😅😂🤣😊😇🙂"),
    ]
    transliterator = Transliterator()
    for case in test_cases:
        print(transliterator.analyze(case))