From 7b1e9136ee61f6386d224bf1d3c3879ed126cd3c Mon Sep 17 00:00:00 2001
From: misyaguziya <53165965+misyaguziya@users.noreply.github.com>
Date: Sat, 4 Oct 2025 22:25:55 +0900
Subject: [PATCH 1/3] [Update] Transliterator: Enhance transliteration control
 and improve tokenizer initialization

---
 src-python/controller.py                      | 13 +++
 src-python/model.py                           | 13 ++-
 .../transliteration_transliterator.py         | 92 +++----------------
 3 files changed, 40 insertions(+), 78 deletions(-)

diff --git a/src-python/controller.py b/src-python/controller.py
index 16856ddc..5c360a91 100644
--- a/src-python/controller.py
+++ b/src-python/controller.py
@@ -915,12 +915,16 @@ class Controller:
     @staticmethod
     def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict:
         if config.CONVERT_MESSAGE_TO_ROMAJI is False:
+            if config.CONVERT_MESSAGE_TO_HIRAGANA is False:  # hiragana conversion is off too, so make sure a transliterator exists
+                model.startTransliteration()
             config.CONVERT_MESSAGE_TO_ROMAJI = True
         return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
 
     @staticmethod
     def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict:
         if config.CONVERT_MESSAGE_TO_ROMAJI is True:
+            if config.CONVERT_MESSAGE_TO_HIRAGANA is False:  # hiragana conversion is off too, so nothing else needs the transliterator
+                model.stopTransliteration()
             config.CONVERT_MESSAGE_TO_ROMAJI = False
         return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
 
@@ -931,12 +935,16 @@ class Controller:
     @staticmethod
     def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict:
         if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
+            if config.CONVERT_MESSAGE_TO_ROMAJI is False:  # romaji conversion is off too, so make sure a transliterator exists
+                model.startTransliteration()
             config.CONVERT_MESSAGE_TO_HIRAGANA = True
         return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
 
     @staticmethod
     def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict:
         if config.CONVERT_MESSAGE_TO_HIRAGANA is True:
+            if config.CONVERT_MESSAGE_TO_ROMAJI is False:  # romaji conversion is off too, so nothing else needs the transliterator
+                model.stopTransliteration()
             config.CONVERT_MESSAGE_TO_HIRAGANA = False
         return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
 
@@ -2466,6 +2474,11 @@ class Controller:
         self.updateDownloadedWhisperModelWeight()
         self.updateTranscriptionEngine()
 
+        # create the transliterator during initialization when romaji or hiragana conversion is enabled
+        printLog("Set Transliteration")
+        if config.CONVERT_MESSAGE_TO_ROMAJI is True or config.CONVERT_MESSAGE_TO_HIRAGANA is True:
+            model.startTransliteration()
+
         self.initializationProgress(3)
 
         # set word filter
diff --git a/src-python/model.py b/src-python/model.py
index 9d29c2d0..6048c630 100644
--- a/src-python/model.py
+++ b/src-python/model.py
@@ -99,7 +99,7 @@ class Model:
         self.overlay_image = OverlayImage(config.PATH_LOCAL)
         self.mic_audio_queue = None
         self.mic_mute_status = None
-        self.transliterator = Transliterator()
+        self.transliterator = None  # created on demand by startTransliteration()
         self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL)
         self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT)
         self.websocket_server = None
@@ -277,6 +277,14 @@ class Model:
         self.previous_receive_message = message
         return repeat_flag
 
+    def startTransliteration(self):
+        # Create the Transliterator only when none exists; otherwise keep the current instance.
+        self.transliterator = Transliterator() if self.transliterator is None else self.transliterator
+
+    def stopTransliteration(self):
+        # Drop the reference; assigning None when it is already None changes nothing.
+        self.transliterator = None
+
     def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str:
         if hiragana is False and romaji is False:
             return message
@@ -287,6 +295,9 @@ class Model:
         if romaji:
             keys_to_keep.add("hepburn")
 
+        if self.transliterator is None:  # not started yet (or stopped): create it on demand
+            self.startTransliteration()
+
         data_list = self.transliterator.analyze(message, use_macron=False)
         filtered_list = [
             {key: value for key, value in item.items() if key in keys_to_keep}
diff --git a/src-python/models/transliteration/transliteration_transliterator.py b/src-python/models/transliteration/transliteration_transliterator.py
index 7c85ebee..b8e64f7d 100644
--- a/src-python/models/transliteration/transliteration_transliterator.py
+++ b/src-python/models/transliteration/transliteration_transliterator.py
@@ -7,7 +7,7 @@ except ImportError:
 
 class Transliterator:
     def __init__(self):
-        self.tokenizer_obj = dictionary.Dictionary().create()
+        self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()  # NOTE(review): presumably needs the "full" Sudachi dictionary package installed - confirm packaging
         self.mode = tokenizer.Tokenizer.SplitMode.C
 
     @staticmethod
@@ -22,7 +22,7 @@ class Transliterator:
         )
 
     @staticmethod
-    def split_kanji_okurigana(surface: str, reading_kana: str):
+    def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
         """
         1語の表層形(surface)と読み(reading_kana)を
         [ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
@@ -69,15 +69,13 @@ class Transliterator:
                 # 空の読みを避ける
                 if not kana_for_kan and kana_left:
                     kana_for_kan = kana_left[:1]
-                
-                result.append(
-                    {
-                        "orig": part,
-                        "kana": kana_for_kan,
-                        "hira": Transliterator.kata_to_hira(kana_for_kan),
-                        "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True)
-                    }
-                )
+
+                result.append({
+                    "orig": part,
+                    "kana": kana_for_kan,
+                    "hira": Transliterator.kata_to_hira(kana_for_kan),
+                    "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
+                })
                 kana_left = kana_left[len(kana_for_kan):]
             else:
                 # 非漢字部分（送り仮名など）
@@ -87,14 +85,14 @@ class Transliterator:
                         "orig": part,
                         "kana": kana_for_okuri,
                         "hira": Transliterator.kata_to_hira(kana_for_okuri),
-                        "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True)
+                        "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
                     }
                 )
                 kana_left = kana_left[len(kana_for_okuri):]
 
         return result
 
-    def analyze(self, text: str, use_macron: bool = True):
+    def analyze(self, text: str, use_macron: bool = False):
         tokens = self.tokenizer_obj.tokenize(text, self.mode)
 
         results = []
@@ -103,7 +101,7 @@ class Transliterator:
             reading = t.reading_form()
             pos = t.part_of_speech()
 
-            if pos and pos[0] in ["記号", "補助記号"]:
+            if pos and pos[0] in ["記号", "補助記号", "空白"]:  # symbol / supplementary symbol / whitespace tokens: the surface is used as the reading
                 reading = surface
 
             if surface == reading:
@@ -125,69 +123,9 @@ class Transliterator:
                     "hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
                 })
             else:
-                # 複数文字の場合は文字種別で分割
-                i = 0
-                reading_pos = 0
-                
-                while i < len(surface):
-                    char = surface[i]
-                    
-                    if self.is_kanji(char):
-                        # 漢字の場合、連続する漢字をまとめて処理
-                        kanji_block = ""
-                        while i < len(surface) and self.is_kanji(surface[i]):
-                            kanji_block += surface[i]
-                            i += 1
-                        
-                        # 漢字ブロックの読みを推定
-                        if i < len(surface):
-                            # 後に文字がある場合、送り仮名を考慮
-                            remaining_chars = len(surface) - i
-                            kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
-                        else:
-                            # 最後の漢字ブロックの場合
-                            kanji_reading = reading[reading_pos:]
-
-                        # 空の読みを避ける
-                        if not kanji_reading and reading_pos < len(reading):
-                            kanji_reading = reading[reading_pos:]
-                        if not kanji_reading and kanji_block:
-                            # 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
-                            kanji_reading = reading[reading_pos:]
-
-                        # reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
-                        len_allocated_reading = len(kanji_reading)
-                        if reading_pos + len_allocated_reading > len(reading):
-                            len_allocated_reading = len(reading) - reading_pos
-
-                        results.append({
-                            "orig": kanji_block,
-                            "kana": kanji_reading,
-                            "hira": self.kata_to_hira(kanji_reading),
-                            "hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
-                        })
-                        reading_pos += len_allocated_reading
-                    else:
-                        # 非漢字の場合
-                        non_kanji_block = ""
-                        while i < len(surface) and not self.is_kanji(surface[i]):
-                            non_kanji_block += surface[i]
-                            i += 1
-
-                        # 非漢字部分の読み（通常は文字数分、または残りの読みの分だけ）
-                        len_block = len(non_kanji_block)
-                        non_kanji_reading = reading[reading_pos:reading_pos + len_block]
-
-                        # 割り当てられた読みの長さ
-                        len_allocated_reading = len(non_kanji_reading)
-
-                        results.append({
-                            "orig": non_kanji_block,
-                            "kana": non_kanji_reading,
-                            "hira": self.kata_to_hira(non_kanji_reading),
-                            "hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
-                        })
-                        reading_pos += len_allocated_reading
+                # multi-character token: split it with the existing split_kanji_okurigana helper
+                parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
+                results.extend(parts)
 
         return results
 

From 3ee724622457d48b7ca45b6dd95770744dfd1447 Mon Sep 17 00:00:00 2001
From: misyaguziya <53165965+misyaguziya@users.noreply.github.com>
Date: Sun, 5 Oct 2025 16:18:58 +0900
Subject: [PATCH 2/3] [Feature] Transliterator: Implement contextual
 transliteration rules and integrate with analysis method

---
 .../transliteration_context_rules.py          | 134 +++++++++++++++
 .../transliteration_transliterator.py         | 156 ++++++++++++------
 2 files changed, 244 insertions(+), 46 deletions(-)
 create mode 100644 src-python/models/transliteration/transliteration_context_rules.py

diff --git a/src-python/models/transliteration/transliteration_context_rules.py b/src-python/models/transliteration/transliteration_context_rules.py
new file mode 100644
index 00000000..d0b5d339
--- /dev/null
+++ b/src-python/models/transliteration/transliteration_context_rules.py
@@ -0,0 +1,148 @@
+"""Contextual transliteration rules for tokenized results.
+
+This module provides a compact rule engine that can modify token
+readings (kana) based on neighboring tokens. Rules are embedded in
+``DEFAULT_RULES`` to simplify packaging (no external JSON required).
+
+Key points
+- Rules are applied in descending ``priority`` order.
+- Supported match modes: ``equals`` (exact match) and ``regex``.
+- ``direction`` chooses whether to inspect the next or previous token.
+- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
+  ``hira``/``hepburn``; callers should recompute them after rules run.
+
+The engine mutates the provided ``results`` list in-place and also
+returns it for convenience.
+"""
+from typing import List, Dict
+import re
+
+# Built-in rule table. Keys read by the engine:
+#   target     - string compared with a token's ``orig`` ("equals" mode)
+#   pattern    - regular expression searched in ``orig`` ("regex" mode)
+#   match_mode - "equals" (default) or "regex"
+#   direction  - "next" (default) or "prev": which neighbor to inspect
+#   kana_set   - kana the neighbor's reading is tested to start with
+#   on_true / on_false - action used when that test passes / fails;
+#                only an action containing ``kana`` has an effect
+#   priority   - higher values are tried first (default 0)
+DEFAULT_RULES = {
+    "rules": [
+        {
+            "name": "nan_next_tdna",
+            "target": "何",
+            "match_mode": "equals",
+            "direction": "next",
+            "kana_set": list("タチツテトダヂヅデドナニヌネノ"),
+            "on_true": {"kana": "ナン"},
+            "on_false": {"kana": "ナニ"}
+        }
+    ]
+}
+
+
+def _prepare_rules(raw_rules: List[Dict]) -> List[Dict]:
+    """Return engine-ready copies of ``raw_rules``, highest priority first.
+
+    Regexes are compiled and ``kana_set`` is converted to a set once per call
+    (not once per token), and the rule dicts passed in are left unmodified.
+    """
+    prepared = []
+    for rule in sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True):
+        mode = rule.get("match_mode", "equals")
+        compiled = None
+        if mode == "regex" and rule.get("pattern"):
+            try:
+                compiled = re.compile(rule["pattern"])
+            except (re.error, TypeError):
+                # an unusable pattern disables the rule instead of aborting
+                compiled = None
+        prepared.append({
+            "target": rule.get("target"),
+            "mode": mode,
+            "regex": compiled,
+            "direction": rule.get("direction", "next"),
+            "kana_set": frozenset(rule.get("kana_set", [])),
+            "on_true": rule.get("on_true", {}),
+            "on_false": rule.get("on_false", {}),
+        })
+    return prepared
+
+
+def _rule_matches(rule: Dict, orig: str) -> bool:
+    """Tell whether prepared ``rule`` targets a token whose surface is ``orig``."""
+    if rule["mode"] == "equals":
+        return orig == rule["target"]
+    if rule["mode"] == "regex":
+        return bool(rule["regex"] and rule["regex"].search(orig))
+    return False
+
+
+def _find_neighbor(results: List[Dict], index: int, direction: str):
+    """Return the nearest entry with a non-empty ``orig`` beside ``index``.
+
+    ``direction`` is "next" or "prev"; any other value yields ``None``.
+    """
+    if direction == "next":
+        candidates = range(index + 1, len(results))
+    elif direction == "prev":
+        candidates = range(index - 1, -1, -1)
+    else:
+        return None
+    return next((results[j] for j in candidates if results[j].get("orig")), None)
+
+
+def _neighbor_starts_with(neighbor, kana_set) -> bool:
+    """Check whether ``neighbor``'s reading begins with a kana in ``kana_set``.
+
+    Uses the first character of ``kana``; when ``kana`` is empty it falls back
+    to the first character of ``orig`` if that is katakana. A missing neighbor
+    never satisfies the condition.
+    """
+    if not neighbor:
+        return False
+    kana = neighbor.get("kana", "")
+    if kana:
+        return kana[0] in kana_set
+    first = neighbor.get("orig", "")[:1]
+    return bool(first) and 'ァ' <= first <= 'ン' and first in kana_set
+
+
+def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[Dict]:
+    """Apply contextual rewrite rules to `results`.
+
+    Parameters
+    - results: list of token dicts produced by Transliterator.split_kanji_okurigana
+        where each entry contains at least the keys: 'orig', 'kana', 'hira', 'hepburn'.
+    - use_macron: passed through for compatibility; rules themselves don't use it
+
+    Returns
+    - The (possibly modified) `results` list. The list is also modified in-place.
+
+    For each token with a non-empty `orig`, the first matching rule (highest
+    priority first) whose selected action contains `kana` overwrites `kana`
+    and clears `hira`/`hepburn`; the caller must recompute them afterwards.
+    """
+    rules = _prepare_rules(DEFAULT_RULES.get("rules", []))
+
+    for index, entry in enumerate(results):
+        orig = entry.get("orig", "")
+        # tokens with an empty orig are never rewritten
+        if not orig:
+            continue
+
+        for rule in rules:
+            if not _rule_matches(rule, orig):
+                continue
+
+            neighbor = _find_neighbor(results, index, rule["direction"])
+            hit = _neighbor_starts_with(neighbor, rule["kana_set"])
+            action = rule["on_true"] if hit else rule["on_false"]
+            if "kana" in action:
+                entry["kana"] = action["kana"]
+                entry["hira"] = ""
+                entry["hepburn"] = ""
+                # once a rule applied, do not apply further rules to this token
+                break
+
+    return results
diff --git a/src-python/models/transliteration/transliteration_transliterator.py b/src-python/models/transliteration/transliteration_transliterator.py
index b8e64f7d..e25b3be4 100644
--- a/src-python/models/transliteration/transliteration_transliterator.py
+++ b/src-python/models/transliteration/transliteration_transliterator.py
@@ -4,6 +4,10 @@ try:
     from .transliteration_kana_to_hepburn import katakana_to_hepburn
 except ImportError:
     from transliteration_kana_to_hepburn import katakana_to_hepburn
+try:
+    from .transliteration_context_rules import apply_context_rules
+except ImportError:
+    from transliteration_context_rules import apply_context_rules
 
 class Transliterator:
     def __init__(self):
@@ -23,10 +27,24 @@ class Transliterator:
 
     @staticmethod
     def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
+        """Split a single surface word and its kana reading into parts.
+
+        Inputs:
+        - surface: the surface form (may contain kanji + kana)
+        - reading_kana: the katakana reading for the whole surface
+        - use_macron: forwarded to ``katakana_to_hepburn`` for each part
+
+        Output:
+        - a list of dicts: [{"orig": str, "kana": str, "hira": str, "hepburn": str}, ...]
+
+        Notes:
+        - ``reading_kana`` is sliced heuristically across the contiguous
+          kanji/non-kanji blocks of ``surface``: each block starts with one
+          kana per character and any remainder is handed out left-to-right,
+          kanji blocks first. Nothing outside the returned list is modified.
+        - Blocks may receive an empty reading when ``reading_kana`` is short.
         """
-        1語の表層形(surface)と読み(reading_kana)を
-        [ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
-        """
+
         result = []
 
         # 表層を「漢字ブロック」と「非漢字ブロック」に分割
@@ -46,53 +64,73 @@ class Transliterator:
 
         # 読みを分配
         kana_left = reading_kana
-        for i, (is_kan, part) in enumerate(blocks):
-            if is_kan:
-                # 漢字ブロックの処理
-                if len(blocks) == 1:
-                    # 単一ブロック（全て漢字）の場合
-                    kana_for_kan = kana_left
-                elif i == len(blocks) - 1:
-                    # 最後のブロック（漢字）の場合
-                    kana_for_kan = kana_left
-                else:
-                    # 中間の漢字ブロックの場合
-                    # 後続の非漢字ブロックの文字数を計算
-                    remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k)
-                    if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji:
-                        kana_for_kan = kana_left[:-remaining_non_kanji]
-                    else:
-                        # 漢字1文字あたり最低1文字の読みを割り当て
-                        min_kana = len(part)
-                        kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)]
-                
-                # 空の読みを避ける
-                if not kana_for_kan and kana_left:
-                    kana_for_kan = kana_left[:1]
+        # We'll allocate kana to each block by initial guess = len(part) (characters)
+        # and distribute any remaining kana left-to-right preferring kanji blocks.
+        kana_len = len(kana_left)
 
-                result.append({
-                    "orig": part,
-                    "kana": kana_for_kan,
-                    "hira": Transliterator.kata_to_hira(kana_for_kan),
-                    "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
-                })
-                kana_left = kana_left[len(kana_for_kan):]
-            else:
-                # 非漢字部分（送り仮名など）
-                kana_for_okuri = kana_left[:len(part)]
-                result.append(
-                    {
-                        "orig": part,
-                        "kana": kana_for_okuri,
-                        "hira": Transliterator.kata_to_hira(kana_for_okuri),
-                        "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
-                    }
-                )
-                kana_left = kana_left[len(kana_for_okuri):]
+        # initial allocation per block: one kana per surface character
+        allocs = [len(part) for _, part in blocks]
+        allocated = sum(allocs)
+        remaining = kana_len - allocated
+
+        # Reading longer than the surface: give one extra kana to each kanji
+        # block (left-to-right), then spread what is left round-robin over the
+        # blocks that are not pure kana. A kana block reads as itself, so it
+        # must not absorb extras (承る / ウケタマワル -> ウケタマワ + ル); only
+        # when every block is pure kana do we fall back to all blocks.
+        if remaining > 0:
+            for idx, (is_kan, _) in enumerate(blocks):
+                if is_kan and remaining > 0:
+                    allocs[idx] += 1
+                    remaining -= 1
+            elastic = [i for i, (_, p) in enumerate(blocks)
+                       if not all("\u3040" <= ch <= "\u30ff" for ch in p)]
+            elastic = elastic or list(range(len(blocks)))
+            for k in range(remaining if elastic else 0):
+                allocs[elastic[k % len(elastic)]] += 1
+
+        # Reading shorter than the surface: shrink from the right, keeping at least one kana per block
+        if remaining < 0:
+            need = -remaining
+            idx = len(blocks) - 1
+            while need > 0 and idx >= 0:
+                take = min(allocs[idx] - 1, need) if allocs[idx] > 1 else 0
+                allocs[idx] -= take
+                need -= take
+                idx -= 1
+
+        # now slice kana_left according to allocs
+        pos = 0
+        for (is_kan, part), cnt in zip(blocks, allocs):
+            kana_for_part = kana_left[pos:pos+cnt]
+            pos += cnt
+            result.append({
+                "orig": part,
+                "kana": kana_for_part,
+                "hira": Transliterator.kata_to_hira(kana_for_part),
+                "hepburn": katakana_to_hepburn(kana_for_part, use_macron=use_macron)
+            })
 
         return result
 
     def analyze(self, text: str, use_macron: bool = False):
+        """Tokenize ``text`` and produce per-subunit reading information.
+
+        Returns a list of dicts for each token/sub-part with keys:
+        - orig: original surface string (one or more characters)
+        - kana: katakana reading for this part (may be adapted by context rules)
+        - hira: hiragana reading (derived from kana)
+        - hepburn: Latin transcription (derived from kana)
+
+        Notes:
+        - ``apply_context_rules(results, use_macron=...)`` mutates ``results``
+          in-place and returns it; afterwards ``hira`` and ``hepburn`` are
+          recomputed for every entry whose ``kana`` is non-empty.
+        - If rule application raises, the exception is swallowed and the
+          results built before the rules ran are still returned.
+        - ``use_macron`` is forwarded to ``katakana_to_hepburn``.
+        """
+
         tokens = self.tokenizer_obj.tokenize(text, self.mode)
 
         results = []
@@ -127,11 +165,37 @@ class Transliterator:
                 parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
                 results.extend(parts)
 
+        # apply the contextual rules (defined in transliteration_context_rules)
+        try:
+            results = apply_context_rules(results, use_macron=use_macron) or results
+        except Exception:
+            # even if applying the rules fails, still return the analysis results
+            pass
+
+        # apply_context_rules may have rewritten kana, so recompute hira and hepburn
+        for entry in results:
+            kana = entry.get("kana", "")
+            if kana:
+                entry["hira"] = self.kata_to_hira(kana)
+                entry["hepburn"] = katakana_to_hepburn(kana, use_macron=use_macron)
+
         return results
 
 # --- テスト ---
 if __name__ == "__main__":
+    import pprint
     test_cases = [
+        "向こうへ行く",
+        "行事を行う",
+        "上がる",
+        "上る",
+        "入り込む",
+        "何",  # this and the following 何... sentences exercise the ナン/ナニ context rule
+        "何が好き？",
+        "何色が好き？",
+        "何色ありますか？",
+        "何語ですか？",
+        "テーブルに色鉛筆は何色ありますか？",
         "美しい花を見る",
         "東京に行く",
         "漢字とカタカナの混在",
@@ -155,4 +219,4 @@ if __name__ == "__main__":
 
     transliterator = Transliterator()
     for case in test_cases:
-        print(transliterator.analyze(case))
\ No newline at end of file
+        pprint.pprint(transliterator.analyze(case), sort_dicts=False)
\ No newline at end of file

From ca07aef201ac23e15f17511b934dfab51026530f Mon Sep 17 00:00:00 2001
From: misyaguziya <53165965+misyaguziya@users.noreply.github.com>
Date: Sun, 5 Oct 2025 17:15:25 +0900
Subject: [PATCH 3/3] [Update] Translator: Add check to return original message
 if source and target languages are the same

---
 src-python/models/translation/translation_translator.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src-python/models/translation/translation_translator.py b/src-python/models/translation/translation_translator.py
index 897fcd1b..a9a1a56a 100644
--- a/src-python/models/translation/translation_translator.py
+++ b/src-python/models/translation/translation_translator.py
@@ -100,6 +100,9 @@ class Translator():
 
     def translate(self, translator_name, source_language, target_language, target_country, message):
         try:
+            if source_language == target_language:  # same language on both sides: hand the message back untranslated
+                return message
+
             result = ""
             source_language, target_language = self.getLanguageCode(translator_name, target_country, source_language, target_language)
             match translator_name: