Files
VRCT/src-python/models/transliteration/transliteration_context_rules.py

135 lines
4.7 KiB
Python

from typing import List, Dict, Any
import re
"""Contextual transliteration rules for tokenized results.
This module provides a compact rule engine that can modify token
readings (kana) based on neighboring tokens. Rules are embedded in
``DEFAULT_RULES`` to simplify packaging (no external JSON required).
Key points
- Rules are applied in descending ``priority`` order.
- Supported match modes: ``equals`` (exact match) and ``regex``.
- ``direction`` chooses whether to inspect the next or previous token.
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
``hira``/``hepburn``; callers should recompute them after rules run.
The engine mutates the provided ``results`` list in-place and also
returns it for convenience.
"""
DEFAULT_RULES = {
"rules": [
{
"name": "nan_next_tdna",
"target": "",
"match_mode": "equals",
"direction": "next",
"kana_set": list("タチツテトダヂヅデドナニヌネノ"),
"on_true": {"kana": "ナン"},
"on_false": {"kana": "ナニ"}
}
]
}
def apply_context_rules(results: List[Dict[str, Any]], use_macron: bool = False) -> List[Dict[str, Any]]:
"""Apply contextual rewrite rules to `results`.
Parameters
- results: list of token dicts produced by Transliterator.split_kanji_okurigana
where each entry contains at least the keys: 'orig', 'kana', 'hira', 'hepburn'.
- use_macron: passed through for compatibility; rules themselves don't use it
Returns
- The (possibly modified) `results` list. The list is also modified in-place.
The engine supports 'equals' and 'regex' match modes, next/prev neighbor
inspection, and simple actions that overwrite `kana` (caller must recalc
`hira`/`hepburn` afterwards).
"""
# prepare rules: sort by priority (desc) and precompile regex where provided
raw_rules: List[Dict[str, Any]] = DEFAULT_RULES.get("rules", [])
rules = sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True)
for r in rules:
if r.get("match_mode") == "regex" and r.get("pattern"):
try:
r["_re"] = re.compile(r["pattern"])
except Exception:
r["_re"] = None
i = 0
n = len(results)
while i < n:
entry = results[i]
orig = entry.get("orig", "")
# skip tokens with empty orig (symbols, whitespace, etc.)
if not orig:
i += 1
continue
for rule in rules:
target = rule.get("target")
mode = rule.get("match_mode", "equals")
direction = rule.get("direction", "next")
kana_set = set(rule.get("kana_set", []))
on_true = rule.get("on_true", {})
on_false = rule.get("on_false", {})
matched = False
if mode == "equals" and orig == target:
matched = True
elif mode == "regex":
cre = rule.get("_re")
if cre and cre.search(orig):
matched = True
# regex or other modes can be added later
if not matched:
continue
# decide neighbor token based on direction
neighbor_entry = None
if direction == "next":
j = i + 1
while j < n:
if results[j].get("orig"):
neighbor_entry = results[j]
break
j += 1
elif direction == "prev":
j = i - 1
while j >= 0:
if results[j].get("orig"):
neighbor_entry = results[j]
break
j -= 1
condition = False
if neighbor_entry:
nk = neighbor_entry.get("kana", "")
if nk:
first = nk[0]
if first in kana_set:
condition = True
else:
# fallback to orig-first-char check
fo = neighbor_entry.get("orig", "")[:1]
if fo and '' <= fo <= '' and fo in kana_set:
condition = True
# Apply action: simple overwrite of kana/hira/hepburn for the matched token
action = on_true if condition else on_false
if "kana" in action:
entry["kana"] = action["kana"]
entry["hira"] = ""
entry["hepburn"] = ""
# once a rule applied, do not apply further rules to this token
break
i += 1
# return the (possibly modified) results for convenience/pure-function style usage
return results