135 lines
4.7 KiB
Python
135 lines
4.7 KiB
Python
import re
from typing import Any, Dict, List, Optional

"""Contextual transliteration rules for tokenized results.

This module provides a compact rule engine that can modify token
readings (kana) based on neighboring tokens. Rules are embedded in
``DEFAULT_RULES`` to simplify packaging (no external JSON is required).

Key points
- Rules are applied in descending ``priority`` order.
- Supported match modes: ``equals`` (exact match) and ``regex``.
- ``direction`` chooses whether to inspect the next or the previous token.
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
  ``hira``/``hepburn``; callers should recompute them after the rules run.

The engine mutates the provided ``results`` list in-place and also
returns it for convenience.
"""

# Built-in rule table read by the rule engine. Each rule names the token it
# targets, how to match it, which neighbor to inspect, and the reading to
# assign depending on whether that neighbor starts with a kana in ``kana_set``.
DEFAULT_RULES = {
    "rules": [
        {
            # Reading of 何: "ナン" when the next token starts with one of
            # the listed katakana, "ナニ" otherwise.
            "name": "nan_next_tdna",
            "target": "何",
            "match_mode": "equals",
            "direction": "next",
            "kana_set": [
                *"タチツテト",  # ta-row
                *"ダヂヅデド",  # da-row
                *"ナニヌネノ",  # na-row
            ],
            "on_true": {"kana": "ナン"},
            "on_false": {"kana": "ナニ"},
        },
    ],
}
|
|
|
|
|
|
|
|
def _prepare_rules(raw_rules: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Return priority-sorted working copies of ``raw_rules``.

    Each copy carries two derived fields: ``_re`` (the compiled pattern, or
    ``None``) and ``_kana_set`` (a frozenset for fast membership tests).
    Working on copies means nothing is written back into the caller's or the
    module's rule dicts.
    """
    prepared = []
    # sorted() is stable, so rules sharing a priority keep their listed order.
    for raw in sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True):
        rule = dict(raw)
        rule["_kana_set"] = frozenset(raw.get("kana_set", []))
        rule["_re"] = None
        if raw.get("match_mode") == "regex" and raw.get("pattern"):
            try:
                rule["_re"] = re.compile(raw["pattern"])
            except (re.error, TypeError):
                # Best effort: a rule whose pattern cannot be compiled stays
                # disabled (it never matches) instead of aborting the pass.
                pass
        prepared.append(rule)
    return prepared


def _rule_matches(rule: Dict[str, Any], orig: str) -> bool:
    """Tell whether a prepared ``rule`` selects the token text ``orig``."""
    mode = rule.get("match_mode", "equals")
    if mode == "equals":
        return orig == rule.get("target")
    if mode == "regex":
        compiled = rule["_re"]
        return compiled is not None and compiled.search(orig) is not None
    # Unknown match modes never match.
    return False


def _find_neighbor(
    results: List[Dict[str, Any]], index: int, direction: str
) -> Optional[Dict[str, Any]]:
    """Return the nearest token with a non-empty ``orig`` before/after ``index``.

    ``direction`` is ``"next"`` or ``"prev"``; any other value yields ``None``.
    Tokens whose ``orig`` is empty are skipped over.
    """
    if direction == "next":
        positions = range(index + 1, len(results))
    elif direction == "prev":
        positions = range(index - 1, -1, -1)
    else:
        return None
    for pos in positions:
        if results[pos].get("orig"):
            return results[pos]
    return None


def _neighbor_starts_with(neighbor: Optional[Dict[str, Any]], kana_set: frozenset) -> bool:
    """Tell whether ``neighbor`` begins with one of the kana in ``kana_set``."""
    if not neighbor:
        return False
    reading = neighbor.get("kana", "")
    if reading:
        return reading[0] in kana_set
    # No reading available: fall back to the surface form, but only when its
    # first character lies in the katakana range 'ァ'..'ン'.
    first = neighbor.get("orig", "")[:1]
    return bool(first) and "ァ" <= first <= "ン" and first in kana_set


def apply_context_rules(
    results: List[Dict[str, Any]],
    use_macron: bool = False,
    *,
    rules: Optional[List[Dict[str, Any]]] = None,
) -> List[Dict[str, Any]]:
    """Apply contextual rewrite rules to `results`.

    Parameters
    - results: list of token dicts; the engine reads 'orig' and 'kana' and
      writes 'kana', 'hira' and 'hepburn'. Tokens with an empty 'orig' are
      never rewritten and are skipped when looking for a neighbor.
    - use_macron: accepted for compatibility; the rules do not use it.
    - rules: optional keyword-only list of rule dicts to use instead of
      ``DEFAULT_RULES["rules"]``. The dicts are copied before use and are
      never modified.

    Returns
    - The (possibly modified) `results` list. The list is also modified in-place.

    Rule keys
    - 'priority' (default 0): higher values are tried first.
    - 'match_mode': 'equals' (default; compares 'orig' with 'target') or
      'regex' (searches 'pattern' in 'orig'; an uncompilable pattern
      disables the rule).
    - 'direction': 'next' (default) or 'prev' neighbor to inspect.
    - 'kana_set': kana that the neighbor's reading must start with for
      'on_true' to be chosen; otherwise, or when there is no neighbor,
      'on_false' is chosen.
    - 'on_true' / 'on_false': actions; an action with a 'kana' key overwrites
      the token's 'kana' and clears 'hira'/'hepburn' (the caller must
      recompute them afterwards).

    Only the first matching rule is considered for each token, even when the
    action it selects is empty.
    """
    active_rules = _prepare_rules(
        DEFAULT_RULES.get("rules", []) if rules is None else rules
    )

    for index, entry in enumerate(results):
        orig = entry.get("orig", "")
        if not orig:
            # Symbols, whitespace, etc. carry no surface text to match on.
            continue

        for rule in active_rules:
            if not _rule_matches(rule, orig):
                continue

            neighbor = _find_neighbor(results, index, rule.get("direction", "next"))
            if _neighbor_starts_with(neighbor, rule["_kana_set"]):
                action = rule.get("on_true", {})
            else:
                action = rule.get("on_false", {})

            if "kana" in action:
                entry["kana"] = action["kana"]
                # Derived readings are now stale; leave them blank for the
                # caller to recompute.
                entry["hira"] = ""
                entry["hepburn"] = ""
            # The first matching rule is final for this token.
            break

    return results
|