220 lines
9.4 KiB
Python
220 lines
9.4 KiB
Python
from sudachipy import tokenizer
|
|
from sudachipy import dictionary
|
|
try:
|
|
from .transliteration_kana_to_hepburn import katakana_to_hepburn
|
|
except ImportError:
|
|
from transliteration_kana_to_hepburn import katakana_to_hepburn
|
|
|
|
class Transliterator:
|
|
def __init__(self):
|
|
self.tokenizer_obj = dictionary.Dictionary().create()
|
|
self.mode = tokenizer.Tokenizer.SplitMode.C
|
|
|
|
@staticmethod
|
|
def is_kanji(ch: str) -> bool:
|
|
return '\u4e00' <= ch <= '\u9fff'
|
|
|
|
@staticmethod
|
|
def kata_to_hira(text: str) -> str:
|
|
return "".join(
|
|
chr(ord(c) - 0x60) if 'ァ' <= c <= 'ン' else c
|
|
for c in text
|
|
)
|
|
|
|
@staticmethod
|
|
def split_kanji_okurigana(surface: str, reading_kana: str):
|
|
"""
|
|
1語の表層形(surface)と読み(reading_kana)を
|
|
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
|
"""
|
|
result = []
|
|
|
|
# 表層を「漢字ブロック」と「非漢字ブロック」に分割
|
|
buf = ""
|
|
prev_is_kanji = None
|
|
blocks = []
|
|
for ch in surface:
|
|
now_is_kanji = Transliterator.is_kanji(ch)
|
|
if prev_is_kanji is None or now_is_kanji == prev_is_kanji:
|
|
buf += ch
|
|
else:
|
|
blocks.append((prev_is_kanji, buf))
|
|
buf = ch
|
|
prev_is_kanji = now_is_kanji
|
|
if buf:
|
|
blocks.append((prev_is_kanji, buf))
|
|
|
|
# 読みを分配
|
|
kana_left = reading_kana
|
|
for i, (is_kan, part) in enumerate(blocks):
|
|
if is_kan:
|
|
# 漢字ブロックの処理
|
|
if len(blocks) == 1:
|
|
# 単一ブロック(全て漢字)の場合
|
|
kana_for_kan = kana_left
|
|
elif i == len(blocks) - 1:
|
|
# 最後のブロック(漢字)の場合
|
|
kana_for_kan = kana_left
|
|
else:
|
|
# 中間の漢字ブロックの場合
|
|
# 後続の非漢字ブロックの文字数を計算
|
|
remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k)
|
|
if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji:
|
|
kana_for_kan = kana_left[:-remaining_non_kanji]
|
|
else:
|
|
# 漢字1文字あたり最低1文字の読みを割り当て
|
|
min_kana = len(part)
|
|
kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)]
|
|
|
|
# 空の読みを避ける
|
|
if not kana_for_kan and kana_left:
|
|
kana_for_kan = kana_left[:1]
|
|
|
|
result.append(
|
|
{
|
|
"orig": part,
|
|
"kana": kana_for_kan,
|
|
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
|
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True)
|
|
}
|
|
)
|
|
kana_left = kana_left[len(kana_for_kan):]
|
|
else:
|
|
# 非漢字部分(送り仮名など)
|
|
kana_for_okuri = kana_left[:len(part)]
|
|
result.append(
|
|
{
|
|
"orig": part,
|
|
"kana": kana_for_okuri,
|
|
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
|
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True)
|
|
}
|
|
)
|
|
kana_left = kana_left[len(kana_for_okuri):]
|
|
|
|
return result
|
|
|
|
def analyze(self, text: str, use_macron: bool = True):
|
|
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
|
|
|
results = []
|
|
for t in tokens:
|
|
surface = t.surface()
|
|
reading = t.reading_form()
|
|
pos = t.part_of_speech()
|
|
|
|
if pos and pos[0] in ["記号", "補助記号"]:
|
|
reading = surface
|
|
|
|
if surface == reading:
|
|
results.append({
|
|
"orig": surface,
|
|
"kana": reading,
|
|
"hira": surface,
|
|
"hepburn": surface,
|
|
})
|
|
continue
|
|
|
|
# 単純に1文字ずつ処理
|
|
if len(surface) == 1:
|
|
# 1文字の場合はそのまま
|
|
results.append({
|
|
"orig": surface,
|
|
"kana": reading,
|
|
"hira": self.kata_to_hira(reading),
|
|
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
|
|
})
|
|
else:
|
|
# 複数文字の場合は文字種別で分割
|
|
i = 0
|
|
reading_pos = 0
|
|
|
|
while i < len(surface):
|
|
char = surface[i]
|
|
|
|
if self.is_kanji(char):
|
|
# 漢字の場合、連続する漢字をまとめて処理
|
|
kanji_block = ""
|
|
while i < len(surface) and self.is_kanji(surface[i]):
|
|
kanji_block += surface[i]
|
|
i += 1
|
|
|
|
# 漢字ブロックの読みを推定
|
|
if i < len(surface):
|
|
# 後に文字がある場合、送り仮名を考慮
|
|
remaining_chars = len(surface) - i
|
|
kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
|
|
else:
|
|
# 最後の漢字ブロックの場合
|
|
kanji_reading = reading[reading_pos:]
|
|
|
|
# 空の読みを避ける
|
|
if not kanji_reading and reading_pos < len(reading):
|
|
kanji_reading = reading[reading_pos:]
|
|
if not kanji_reading and kanji_block:
|
|
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
|
|
kanji_reading = reading[reading_pos:]
|
|
|
|
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
|
|
len_allocated_reading = len(kanji_reading)
|
|
if reading_pos + len_allocated_reading > len(reading):
|
|
len_allocated_reading = len(reading) - reading_pos
|
|
|
|
results.append({
|
|
"orig": kanji_block,
|
|
"kana": kanji_reading,
|
|
"hira": self.kata_to_hira(kanji_reading),
|
|
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
|
})
|
|
reading_pos += len_allocated_reading
|
|
else:
|
|
# 非漢字の場合
|
|
non_kanji_block = ""
|
|
while i < len(surface) and not self.is_kanji(surface[i]):
|
|
non_kanji_block += surface[i]
|
|
i += 1
|
|
|
|
# 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
|
|
len_block = len(non_kanji_block)
|
|
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
|
|
|
|
# 割り当てられた読みの長さ
|
|
len_allocated_reading = len(non_kanji_reading)
|
|
|
|
results.append({
|
|
"orig": non_kanji_block,
|
|
"kana": non_kanji_reading,
|
|
"hira": self.kata_to_hira(non_kanji_reading),
|
|
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
|
})
|
|
reading_pos += len_allocated_reading
|
|
|
|
return results
|
|
|
|
# --- テスト ---
|
|
if __name__ == "__main__":
|
|
test_cases = [
|
|
"美しい花を見る",
|
|
"東京に行く",
|
|
"漢字とカタカナの混在",
|
|
"パーティーに行く",
|
|
"コンピューターを使う",
|
|
"シェアハウスに住む",
|
|
"ヴァイオリンを弾く",
|
|
"ギュウニュウを飲む",
|
|
"ニューヨークに行く",
|
|
"ラーメンを食べる",
|
|
"チョコレートが好き",
|
|
"SessionIDを取得する",
|
|
"取り敢えず検索してみる",
|
|
"見知らぬ土地で冒険する",
|
|
"彼は優れたエンジニアです",
|
|
" ".join(list("[]<>!@#$%^&*()_+-={}|\;:'\",.<>/?`~")),
|
|
" ".join(list("「」<>!@#$%^&*()_+-={}|\;:'",./?`~")),
|
|
" ".join(list("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏")),
|
|
" ".join(list("😀😃😄😁😆😅😂🤣😊😇🙂"))
|
|
]
|
|
|
|
transliterator = Transliterator()
|
|
for case in test_cases:
|
|
print(transliterator.analyze(case)) |