[Add] Implement Transliterator class for kanji and okurigana analysis; update requirements and spec files
This commit is contained in:
187
src-python/models/transliterate/transliterate_transliterator.py
Normal file
187
src-python/models/transliterate/transliterate_transliterator.py
Normal file
@@ -0,0 +1,187 @@
|
||||
from sudachipy import tokenizer
|
||||
from sudachipy import dictionary
|
||||
try:
|
||||
from .transliterate_kana_to_hepburn import katakana_to_hepburn
|
||||
except ImportError:
|
||||
from transliterate_kana_to_hepburn import katakana_to_hepburn
|
||||
|
||||
class Transliterator:
|
||||
def __init__(self):
|
||||
self.tokenizer_obj = dictionary.Dictionary().create()
|
||||
self.mode = tokenizer.Tokenizer.SplitMode.A
|
||||
|
||||
@staticmethod
|
||||
def is_kanji(ch: str) -> bool:
|
||||
return '\u4e00' <= ch <= '\u9fff'
|
||||
|
||||
@staticmethod
|
||||
def kata_to_hira(text: str) -> str:
|
||||
return "".join(
|
||||
chr(ord(c) - 0x60) if 'ァ' <= c <= 'ン' else c
|
||||
for c in text
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def split_kanji_okurigana(surface: str, reading_kana: str):
|
||||
"""
|
||||
1語の表層形(surface)と読み(reading_kana)を
|
||||
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
||||
"""
|
||||
result = []
|
||||
|
||||
# 表層を「漢字ブロック」と「非漢字ブロック」に分割
|
||||
buf = ""
|
||||
prev_is_kanji = None
|
||||
blocks = []
|
||||
for ch in surface:
|
||||
now_is_kanji = Transliterator.is_kanji(ch)
|
||||
if prev_is_kanji is None or now_is_kanji == prev_is_kanji:
|
||||
buf += ch
|
||||
else:
|
||||
blocks.append((prev_is_kanji, buf))
|
||||
buf = ch
|
||||
prev_is_kanji = now_is_kanji
|
||||
if buf:
|
||||
blocks.append((prev_is_kanji, buf))
|
||||
|
||||
# 読みを分配
|
||||
kana_left = reading_kana
|
||||
for i, (is_kan, part) in enumerate(blocks):
|
||||
if is_kan:
|
||||
# 漢字ブロックの処理
|
||||
if len(blocks) == 1:
|
||||
# 単一ブロック(全て漢字)の場合
|
||||
kana_for_kan = kana_left
|
||||
elif i == len(blocks) - 1:
|
||||
# 最後のブロック(漢字)の場合
|
||||
kana_for_kan = kana_left
|
||||
else:
|
||||
# 中間の漢字ブロックの場合
|
||||
# 後続の非漢字ブロックの文字数を計算
|
||||
remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k)
|
||||
if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji:
|
||||
kana_for_kan = kana_left[:-remaining_non_kanji]
|
||||
else:
|
||||
# 漢字1文字あたり最低1文字の読みを割り当て
|
||||
min_kana = len(part)
|
||||
kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)]
|
||||
|
||||
# 空の読みを避ける
|
||||
if not kana_for_kan and kana_left:
|
||||
kana_for_kan = kana_left[:1]
|
||||
|
||||
result.append(
|
||||
{
|
||||
"orig": part,
|
||||
"kana": kana_for_kan,
|
||||
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
||||
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True)
|
||||
}
|
||||
)
|
||||
kana_left = kana_left[len(kana_for_kan):]
|
||||
else:
|
||||
# 非漢字部分(送り仮名など)
|
||||
kana_for_okuri = kana_left[:len(part)]
|
||||
result.append(
|
||||
{
|
||||
"orig": part,
|
||||
"kana": kana_for_okuri,
|
||||
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
||||
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True)
|
||||
}
|
||||
)
|
||||
kana_left = kana_left[len(kana_for_okuri):]
|
||||
|
||||
return result
|
||||
|
||||
def analyze(self, text: str, use_macron: bool = True):
|
||||
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
||||
|
||||
results = []
|
||||
for t in tokens:
|
||||
surface = t.surface()
|
||||
reading = t.reading_form()
|
||||
|
||||
# 単純に1文字ずつ処理
|
||||
if len(surface) == 1:
|
||||
# 1文字の場合はそのまま
|
||||
results.append({
|
||||
"orig": surface,
|
||||
"kana": reading,
|
||||
"hira": self.kata_to_hira(reading),
|
||||
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
|
||||
})
|
||||
else:
|
||||
# 複数文字の場合は文字種別で分割
|
||||
i = 0
|
||||
reading_pos = 0
|
||||
|
||||
while i < len(surface):
|
||||
char = surface[i]
|
||||
|
||||
if self.is_kanji(char):
|
||||
# 漢字の場合、連続する漢字をまとめて処理
|
||||
kanji_block = ""
|
||||
while i < len(surface) and self.is_kanji(surface[i]):
|
||||
kanji_block += surface[i]
|
||||
i += 1
|
||||
|
||||
# 漢字ブロックの読みを推定
|
||||
if i < len(surface):
|
||||
# 後に文字がある場合、送り仮名を考慮
|
||||
remaining_chars = len(surface) - i
|
||||
kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
|
||||
else:
|
||||
# 最後の漢字ブロックの場合
|
||||
kanji_reading = reading[reading_pos:]
|
||||
|
||||
results.append({
|
||||
"orig": kanji_block,
|
||||
"kana": kanji_reading,
|
||||
"hira": self.kata_to_hira(kanji_reading),
|
||||
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
||||
})
|
||||
reading_pos += len(kanji_reading)
|
||||
else:
|
||||
# 非漢字の場合
|
||||
non_kanji_block = ""
|
||||
while i < len(surface) and not self.is_kanji(surface[i]):
|
||||
non_kanji_block += surface[i]
|
||||
i += 1
|
||||
|
||||
# 非漢字部分の読み(通常は文字数分)
|
||||
non_kanji_reading = reading[reading_pos:reading_pos + len(non_kanji_block)]
|
||||
|
||||
results.append({
|
||||
"orig": non_kanji_block,
|
||||
"kana": non_kanji_reading,
|
||||
"hira": self.kata_to_hira(non_kanji_reading),
|
||||
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
||||
})
|
||||
reading_pos += len(non_kanji_reading)
|
||||
|
||||
return results
|
||||
|
||||
# --- テスト ---
|
||||
if __name__ == "__main__":
|
||||
test_cases = [
|
||||
"美しい花を見る",
|
||||
"東京に行く",
|
||||
"漢字とカタカナの混在",
|
||||
"パーティーに行く",
|
||||
"コンピューターを使う",
|
||||
"シェアハウスに住む",
|
||||
"ヴァイオリンを弾く",
|
||||
"ギュウニュウを飲む",
|
||||
"ニューヨークに行く",
|
||||
"ラーメンを食べる",
|
||||
"チョコレートが好き",
|
||||
"SessionIDを取得する",
|
||||
"取り敢えず検索してみる",
|
||||
"見知らぬ土地で冒険する",
|
||||
"彼は優れたエンジニアです",
|
||||
]
|
||||
|
||||
transliterator = Transliterator()
|
||||
for case in test_cases:
|
||||
print(transliterator.analyze(case))
|
||||
Reference in New Issue
Block a user