# VRCT/src-python/models/transliteration/transliteration_context_rules.py

import re
from typing import Any, Dict, List, Optional

"""Contextual transliteration rules for tokenized results.

This module provides a compact rule engine that can modify token
readings (kana) based on neighboring tokens. Rules are embedded in
``DEFAULT_RULES`` to simplify packaging (no external JSON required).

Key points
- Rules are applied in descending ``priority`` order.
- Supported match modes: ``equals`` (exact match) and ``regex``.
- ``direction`` chooses whether to inspect the next or previous token.
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
  ``hira``/``hepburn``; callers should recompute them after rules run.

The engine mutates the provided ``results`` list in-place and also
returns it for convenience.
"""
# Rule table bundled with the module and consumed by ``apply_context_rules``.
#
# Keys the engine reads from each rule:
#   target      - surface form ('orig') the rule applies to in "equals" mode
#   match_mode  - "equals" or "regex" (regex rules supply a "pattern" key)
#   direction   - "next" or "prev": which neighboring token is inspected
#   kana_set    - kana that the neighbor's reading may start with
#   on_true     - action used when the neighbor's first kana is in kana_set
#   on_false    - action used otherwise
#   priority    - optional; higher values are tried first (defaults to 0)
# "name" is a label only; the engine does not read it.
DEFAULT_RULES = {
    "rules": [
        # 何 reads ナン when the following token's reading starts with a
        # t-/d-/n-row kana, and ナニ in every other case.
        dict(
            name="nan_next_tdna",
            target="何",
            match_mode="equals",
            direction="next",
            kana_set=[*"タチツテトダヂヅデドナニヌネノ"],
            on_true=dict(kana="ナン"),
            on_false=dict(kana="ナニ"),
        ),
    ],
}


def _prepare_rules(raw_rules: List[Dict[str, Any]]) -> List[tuple]:
    """Sort rules by descending priority and precompute per-rule invariants.

    Returns a list of ``(rule, compiled_regex_or_None, kana_frozenset)``
    tuples. The rule dicts themselves are never modified, so a shared rule
    table (such as ``DEFAULT_RULES``) stays untouched between calls.
    """
    prepared = []
    # sorted() is stable, so rules with equal priority keep their given order.
    for rule in sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True):
        compiled = None
        if rule.get("match_mode") == "regex" and rule.get("pattern"):
            try:
                compiled = re.compile(rule["pattern"])
            except (re.error, TypeError, OverflowError, RecursionError):
                # Best effort: a pattern that cannot be compiled disables
                # the rule (it never matches) instead of aborting the run.
                compiled = None
        prepared.append((rule, compiled, frozenset(rule.get("kana_set") or ())))
    return prepared


def _find_neighbor(results: List[Dict[str, Any]], index: int, direction: str) -> Optional[Dict[str, Any]]:
    """Return the nearest token with a non-empty 'orig' before/after `index`.

    `direction` is "next" or "prev"; any other value yields ``None``.
    ``None`` is also returned when no such token exists.
    """
    if direction == "next":
        candidates = range(index + 1, len(results))
    elif direction == "prev":
        candidates = range(index - 1, -1, -1)
    else:
        return None
    for j in candidates:
        # tokens with an empty 'orig' are skipped, matching the main loop
        if results[j].get("orig"):
            return results[j]
    return None


def _neighbor_starts_with(neighbor: Optional[Dict[str, Any]], kana_set: frozenset) -> bool:
    """Tell whether the neighbor's reading begins with a kana in `kana_set`."""
    if not neighbor:
        return False
    reading = neighbor.get("kana", "")
    if reading:
        return reading[0] in kana_set
    # No reading available: fall back to the first character of the surface
    # form, accepted only when it lies in the 'ァ'..'ン' code point range.
    first = neighbor.get("orig", "")[:1]
    return bool(first) and 'ァ' <= first <= 'ン' and first in kana_set


def apply_context_rules(
    results: List[Dict[str, Any]],
    use_macron: bool = False,
    *,
    rules: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """Apply contextual rewrite rules to `results`.

    For every token with a non-empty 'orig', rules are tried in descending
    ``priority`` order. A rule matches when 'orig' equals its ``target``
    (``match_mode == "equals"``, the default) or when its ``pattern`` is
    found in 'orig' (``match_mode == "regex"``). For a matching rule the
    nearest neighbor with a non-empty 'orig' in the rule's ``direction`` is
    inspected: if the first kana of its reading is in the rule's
    ``kana_set`` the ``on_true`` action is chosen, otherwise ``on_false``
    (also used when there is no neighbor). The first matching rule whose
    chosen action contains ``kana`` is applied and ends processing for that
    token; a matching rule without such an action falls through to the next.

    Parameters
    - results: list of token dicts; the keys read are 'orig' and 'kana', the
        keys written are 'kana', 'hira' and 'hepburn'.
    - use_macron: accepted for compatibility; not used by this function.
    - rules: optional list of rule dicts to use instead of
        ``DEFAULT_RULES["rules"]``. Rule dicts are not modified.

    Returns
    - The same `results` list object. Tokens rewritten by a rule have 'kana'
      overwritten and 'hira'/'hepburn' set to "" so the caller can recompute
      them.
    """
    raw_rules = DEFAULT_RULES.get("rules", []) if rules is None else rules
    prepared = _prepare_rules(raw_rules)

    for index, entry in enumerate(results):
        orig = entry.get("orig", "")
        # skip tokens with empty orig (symbols, whitespace, etc.)
        if not orig:
            continue

        for rule, compiled, kana_set in prepared:
            mode = rule.get("match_mode", "equals")
            if mode == "equals":
                matched = orig == rule.get("target")
            elif mode == "regex":
                matched = compiled is not None and compiled.search(orig) is not None
            else:
                # unknown match modes never match
                matched = False
            if not matched:
                continue

            neighbor = _find_neighbor(results, index, rule.get("direction", "next"))
            if _neighbor_starts_with(neighbor, kana_set):
                action = rule.get("on_true") or {}
            else:
                action = rule.get("on_false") or {}

            if "kana" in action:
                entry["kana"] = action["kana"]
                # derived readings are now stale; the caller recomputes them
                entry["hira"] = ""
                entry["hepburn"] = ""
                # once a rule applied, do not apply further rules to this token
                break

    # returned for convenience; the list was modified in-place
    return results