[Update] Transliterator: Change tokenizer mode to SplitMode.C and enhance analyze method for better reading assignment

2025-09-29 10:15:03 +09:00
parent c5a5fc62f7
commit e32e5c1b5f
1 changed file with 44 additions and 10 deletions
--- a/src-python/models/transliteration/transliteration_transliterator.py
+++ b/src-python/models/transliteration/transliteration_transliterator.py
@@ -8,7 +8,7 @@ except ImportError:
 class Transliterator:
    def __init__(self):
        self.tokenizer_obj = dictionary.Dictionary().create()
-        self.mode = tokenizer.Tokenizer.SplitMode.A
+        self.mode = tokenizer.Tokenizer.SplitMode.C

    @staticmethod
    def is_kanji(ch: str) -> bool:
@@ -101,6 +101,20 @@ class Transliterator:
        for t in tokens:
            surface = t.surface()
            reading = t.reading_form()
+            pos = t.part_of_speech()
+            print("surface:", surface, " reading:", reading, " pos:", pos)
+
+            if pos and pos[0] in ["記号", "補助記号"]:
+                reading = surface
+
+            if surface == reading:
+                results.append({
+                    "orig": surface,
+                    "kana": reading,
+                    "hira": surface,
+                    "hepburn": surface,
+                })
+                continue

            # 単純に1文字ずつ処理
            if len(surface) == 1:
@@ -135,13 +149,25 @@ class Transliterator:
                            # 最後の漢字ブロックの場合
                            kanji_reading = reading[reading_pos:]

+                        # 空の読みを避ける
+                        if not kanji_reading and reading_pos < len(reading):
+                            kanji_reading = reading[reading_pos:]
+                        if not kanji_reading and kanji_block:
+                            # 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
+                            kanji_reading = reading[reading_pos:]
+
+                        # reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
+                        len_allocated_reading = len(kanji_reading)
+                        if reading_pos + len_allocated_reading > len(reading):
+                            len_allocated_reading = len(reading) - reading_pos
+
                        results.append({
                            "orig": kanji_block,
                            "kana": kanji_reading,
                            "hira": self.kata_to_hira(kanji_reading),
                            "hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
                        })
-                        reading_pos += len(kanji_reading)
+                        reading_pos += len_allocated_reading
                    else:
                        # 非漢字の場合
                        non_kanji_block = ""
@@ -149,8 +175,12 @@ class Transliterator:
                            non_kanji_block += surface[i]
                            i += 1

-                        # 非漢字部分の読み（通常は文字数分）
-                        non_kanji_reading = reading[reading_pos:reading_pos + len(non_kanji_block)]
+                        # 非漢字部分の読み（通常は文字数分、または残りの読みの分だけ）
+                        len_block = len(non_kanji_block)
+                        non_kanji_reading = reading[reading_pos:reading_pos + len_block]
+
+                        # 割り当てられた読みの長さ
+                        len_allocated_reading = len(non_kanji_reading)

                        results.append({
                            "orig": non_kanji_block,
@@ -158,7 +188,7 @@ class Transliterator:
                            "hira": self.kata_to_hira(non_kanji_reading),
                            "hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
                        })
-                        reading_pos += len(non_kanji_reading)
+                        reading_pos += len_allocated_reading

        return results

@@ -180,6 +210,10 @@ if __name__ == "__main__":
        "取り敢えず検索してみる",
        "見知らぬ土地で冒険する",
        "彼は優れたエンジニアです",
+        " ".join(list("[]<>!@#$%^&*()_+-={}|\;:'\",.<>/?`~")),
+        " ".join(list("「」＜＞！＠＃＄％＾＆＊（）＿＋－＝｛｝｜＼；：＇＂，．／？｀～")),
+        " ".join(list("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏")),
+        " ".join(list("😀😃😄😁😆😅😂🤣😊😇🙂"))
    ]

    transliterator = Transliterator()