From e32e5c1b5f8c0079434a8ef9e238f46a86818ca1 Mon Sep 17 00:00:00 2001 From: misyaguziya <53165965+misyaguziya@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:15:03 +0900 Subject: [PATCH] [Update] Transliterator: Change tokenizer mode to SplitMode.C and enhance analyze method for better reading assignment --- .../transliteration_transliterator.py | 54 +++++++++++++++---- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/src-python/models/transliteration/transliteration_transliterator.py b/src-python/models/transliteration/transliteration_transliterator.py index f2a9780f..9d395a13 100644 --- a/src-python/models/transliteration/transliteration_transliterator.py +++ b/src-python/models/transliteration/transliteration_transliterator.py @@ -8,7 +8,7 @@ except ImportError: class Transliterator: def __init__(self): self.tokenizer_obj = dictionary.Dictionary().create() - self.mode = tokenizer.Tokenizer.SplitMode.A + self.mode = tokenizer.Tokenizer.SplitMode.C @staticmethod def is_kanji(ch: str) -> bool: @@ -101,7 +101,21 @@ class Transliterator: for t in tokens: surface = t.surface() reading = t.reading_form() - + pos = t.part_of_speech() + print("surface:", surface, " reading:", reading, " pos:", pos) + + if pos and pos[0] in ["記号", "補助記号"]: + reading = surface + + if surface == reading: + results.append({ + "orig": surface, + "kana": reading, + "hira": surface, + "hepburn": surface, + }) + continue + # 単純に1文字ずつ処理 if len(surface) == 1: # 1文字の場合はそのまま @@ -134,32 +148,48 @@ class Transliterator: else: # 最後の漢字ブロックの場合 kanji_reading = reading[reading_pos:] - + + # 空の読みを避ける + if not kanji_reading and reading_pos < len(reading): + kanji_reading = reading[reading_pos:] + if not kanji_reading and kanji_block: + # 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる + kanji_reading = reading[reading_pos:] + + # reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック + len_allocated_reading = len(kanji_reading) + if reading_pos + len_allocated_reading > len(reading): + len_allocated_reading = len(reading) - reading_pos + results.append({ "orig": kanji_block, "kana": kanji_reading, "hira": self.kata_to_hira(kanji_reading), "hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron) }) - reading_pos += len(kanji_reading) + reading_pos += len_allocated_reading else: # 非漢字の場合 non_kanji_block = "" while i < len(surface) and not self.is_kanji(surface[i]): non_kanji_block += surface[i] i += 1 - - # 非漢字部分の読み(通常は文字数分) - non_kanji_reading = reading[reading_pos:reading_pos + len(non_kanji_block)] - + + # 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ) + len_block = len(non_kanji_block) + non_kanji_reading = reading[reading_pos:reading_pos + len_block] + + # 割り当てられた読みの長さ + len_allocated_reading = len(non_kanji_reading) + results.append({ "orig": non_kanji_block, "kana": non_kanji_reading, "hira": self.kata_to_hira(non_kanji_reading), "hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron) }) - reading_pos += len(non_kanji_reading) - + reading_pos += len_allocated_reading + return results # --- テスト --- @@ -180,6 +210,10 @@ if __name__ == "__main__": "取り敢えず検索してみる", "見知らぬ土地で冒険する", "彼は優れたエンジニアです", + " ".join(list("[]<>!@#$%^&*()_+-={}|\;:'\",.<>/?`~")), + " ".join(list("「」<>!@#$%^&*()_+-={}|\;:'",./?`~")), + " ".join(list("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏")), + " ".join(list("😀😃😄😁😆😅😂🤣😊😇🙂")) ] transliterator = Transliterator()