[Feature] Transliterator: Implement contextual transliteration rules and integrate with analysis method

This commit is contained in:
misyaguziya
2025-10-05 16:18:58 +09:00
parent 7b1e9136ee
commit 3ee7246224
2 changed files with 244 additions and 46 deletions

View File

@@ -0,0 +1,134 @@
from typing import List, Dict
import re
"""Contextual transliteration rules for tokenized results.
This module provides a compact rule engine that can modify token
readings (kana) based on neighboring tokens. Rules are embedded in
``DEFAULT_RULES`` to simplify packaging (no external JSON required).
Key points
- Rules are applied in descending ``priority`` order.
- Supported match modes: ``equals`` (exact match) and ``regex``.
- ``direction`` chooses whether to inspect the next or previous token.
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
``hira``/``hepburn``; callers should recompute them after rules run.
The engine mutates the provided ``results`` list in-place and also
returns it for convenience.
"""
# Built-in contextual rules consumed by ``apply_context_rules``.
#
# Keys read by the engine for each rule:
#   target     - surface string compared with a token's ``orig`` ("equals" mode)
#   match_mode - "equals" or "regex" (regex rules supply a ``pattern`` key)
#   direction  - "next" or "prev": which neighbouring token is inspected
#   kana_set   - characters tested against the first kana of that neighbour
#   on_true    - action applied when the neighbour test passes
#   on_false   - action applied when it fails (or no neighbour exists)
#   priority   - optional; higher values are tried first (defaults to 0)
# ``name`` is a label for humans only; the engine does not read it.
DEFAULT_RULES = {
    "rules": [
        {
            # Rule of thumb: 何 is read "ナン" when the following token starts
            # with a t-/d-/n-row sound and "ナニ" otherwise.
            "name": "nan_next_tdna",
            # The target must be non-empty: the engine skips tokens whose
            # ``orig`` is empty, so an empty target could never match.
            "target": "何",
            "match_mode": "equals",
            "direction": "next",
            "kana_set": list("タチツテトダヂヅデドナニヌネノ"),
            "on_true": {"kana": "ナン"},
            "on_false": {"kana": "ナニ"},
        }
    ]
}
def _compile_context_rules(raw_rules):
    """Return ``(rule, compiled_regex, kana_set)`` tuples, highest priority first."""
    prepared = []
    # sorted() is stable, so rules with equal priority keep their listed order.
    for rule in sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True):
        compiled = None
        if rule.get("match_mode") == "regex" and rule.get("pattern"):
            try:
                compiled = re.compile(rule["pattern"])
            except (re.error, TypeError):
                # Best effort: a rule with an unusable pattern never matches.
                compiled = None
        # The compiled regex is kept beside the rule instead of being stored
        # in it, so shared rule dicts are never modified.
        prepared.append((rule, compiled, frozenset(rule.get("kana_set", []))))
    return prepared


def _nearest_token(results, index, direction):
    """Return the closest entry with a non-empty ``orig`` in ``direction``, or None."""
    if direction == "next":
        candidates = range(index + 1, len(results))
    elif direction == "prev":
        candidates = range(index - 1, -1, -1)
    else:
        # Unknown direction: there is no neighbour to inspect.
        return None
    for j in candidates:
        if results[j].get("orig"):
            return results[j]
    return None


def apply_context_rules(results: List[Dict], use_macron: bool = False, rules=None) -> List[Dict]:
    """Apply contextual rewrite rules to ``results``.

    Parameters
    - results: list of token dicts; the keys read here are ``orig`` and
      ``kana``, and the keys written are ``kana``, ``hira`` and ``hepburn``.
    - use_macron: accepted for call compatibility; not used by the rules.
    - rules: optional list of rule dicts to apply instead of the built-in
      ``DEFAULT_RULES["rules"]``. The rule dicts are never modified.

    Returns
    - The same ``results`` list, which is also modified in place.

    Behaviour
    - Rules are tried in descending ``priority`` order (default 0). The first
      rule that matches a token is the only one applied to that token.
    - ``match_mode`` "equals" compares ``orig`` with the rule's ``target``;
      "regex" searches ``orig`` with the rule's ``pattern``.
    - The neighbour is the nearest token with a non-empty ``orig`` in the
      rule's ``direction`` ("next" or "prev"). The rule condition is true when
      the first character of the neighbour's ``kana`` (or of its ``orig`` when
      ``kana`` is empty) is in the rule's ``kana_set``.
    - The ``on_true`` / ``on_false`` action is then applied. If it contains
      ``kana``, the token's ``kana`` is overwritten and ``hira`` / ``hepburn``
      are cleared; the caller must recompute them.
    """
    raw_rules = DEFAULT_RULES.get("rules", []) if rules is None else rules
    prepared = _compile_context_rules(raw_rules)

    for index, entry in enumerate(results):
        orig = entry.get("orig", "")
        # Skip tokens with an empty orig (symbols, whitespace, etc.).
        if not orig:
            continue

        for rule, compiled, kana_set in prepared:
            mode = rule.get("match_mode", "equals")
            if mode == "equals":
                matched = orig == rule.get("target")
            elif mode == "regex":
                matched = compiled is not None and compiled.search(orig) is not None
            else:
                matched = False
            if not matched:
                continue

            neighbor = _nearest_token(results, index, rule.get("direction", "next"))
            condition = False
            if neighbor:
                reading = neighbor.get("kana", "")
                # Prefer the neighbour's reading; fall back to the first
                # character of its surface form when no reading is available.
                first = reading[:1] if reading else neighbor.get("orig", "")[:1]
                condition = bool(first) and first in kana_set

            action = rule.get("on_true", {}) if condition else rule.get("on_false", {})
            if "kana" in action:
                entry["kana"] = action["kana"]
                entry["hira"] = ""
                entry["hepburn"] = ""
            # Once a rule has matched, no further rules apply to this token.
            break

    return results

View File

@@ -4,6 +4,10 @@ try:
from .transliteration_kana_to_hepburn import katakana_to_hepburn from .transliteration_kana_to_hepburn import katakana_to_hepburn
except ImportError: except ImportError:
from transliteration_kana_to_hepburn import katakana_to_hepburn from transliteration_kana_to_hepburn import katakana_to_hepburn
try:
from .transliteration_context_rules import apply_context_rules
except ImportError:
from transliteration_context_rules import apply_context_rules
class Transliterator: class Transliterator:
def __init__(self): def __init__(self):
@@ -23,10 +27,24 @@ class Transliterator:
@staticmethod @staticmethod
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True): def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
"""Split a single surface word and its kana reading into parts.
Inputs:
- surface: the surface form (may contain kanji + kana)
- reading_kana: the katakana reading for the whole surface
Output:
- a list of dicts: [{"orig": str, "kana": str, "hira": str, "hepburn": str}, ...]
Notes:
- The function allocates portions of ``reading_kana`` to each contiguous
kanji/non-kanji block in ``surface``. Allocation is heuristic: an
initial allocation based on block length is used and any remainder is
distributed left-to-right preferring kanji blocks.
- This function is pure (no external side effects) and returns the
constructed list.
""" """
1語の表層形(surface)と読み(reading_kana)を
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
"""
result = [] result = []
# 表層を「漢字ブロック」と「非漢字ブロック」に分割 # 表層を「漢字ブロック」と「非漢字ブロック」に分割
@@ -46,53 +64,73 @@ class Transliterator:
# 読みを分配 # 読みを分配
kana_left = reading_kana kana_left = reading_kana
for i, (is_kan, part) in enumerate(blocks): # We'll allocate kana to each block by initial guess = len(part) (characters)
if is_kan: # and distribute any remaining kana left-to-right preferring kanji blocks.
# 漢字ブロックの処理 kana_len = len(kana_left)
if len(blocks) == 1:
# 単一ブロック(全て漢字)の場合
kana_for_kan = kana_left
elif i == len(blocks) - 1:
# 最後のブロック(漢字)の場合
kana_for_kan = kana_left
else:
# 中間の漢字ブロックの場合
# 後続の非漢字ブロックの文字数を計算
remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k)
if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji:
kana_for_kan = kana_left[:-remaining_non_kanji]
else:
# 漢字1文字あたり最低1文字の読みを割り当て
min_kana = len(part)
kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)]
# 空の読みを避ける # initial allocation per block
if not kana_for_kan and kana_left: allocs = [len(part) for _, part in blocks]
kana_for_kan = kana_left[:1] allocated = sum(allocs)
remaining = kana_len - allocated
result.append({ # distribute extra kana to kanji blocks first (left-to-right)
"orig": part, if remaining > 0:
"kana": kana_for_kan, for idx, (is_kan, _) in enumerate(blocks):
"hira": Transliterator.kata_to_hira(kana_for_kan), if remaining <= 0:
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron) break
}) if is_kan:
kana_left = kana_left[len(kana_for_kan):] allocs[idx] += 1
else: remaining -= 1
# 非漢字部分(送り仮名など) # if still remaining, distribute to all blocks left-to-right
kana_for_okuri = kana_left[:len(part)] idx = 0
result.append( while remaining > 0 and len(blocks) > 0:
{ allocs[idx] += 1
"orig": part, remaining -= 1
"kana": kana_for_okuri, idx = (idx + 1) % len(blocks)
"hira": Transliterator.kata_to_hira(kana_for_okuri),
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron) # if remaining < 0 (reading shorter than base), shrink allocations from right
} if remaining < 0:
) # remove from rightmost blocks as needed
kana_left = kana_left[len(kana_for_okuri):] need = -remaining
idx = len(blocks) - 1
while need > 0 and idx >= 0:
take = min(allocs[idx] - 1, need) if allocs[idx] > 1 else 0
allocs[idx] -= take
need -= take
idx -= 1
# now slice kana_left according to allocs
pos = 0
for (is_kan, part), cnt in zip(blocks, allocs):
kana_for_part = kana_left[pos:pos+cnt]
pos += cnt
result.append({
"orig": part,
"kana": kana_for_part,
"hira": Transliterator.kata_to_hira(kana_for_part),
"hepburn": katakana_to_hepburn(kana_for_part, use_macron=use_macron)
})
return result return result
def analyze(self, text: str, use_macron: bool = False): def analyze(self, text: str, use_macron: bool = False):
"""Tokenize ``text`` and produce per-subunit reading information.
Returns a list of dicts for each token/sub-part with keys:
- orig: original surface string (one or more characters)
- kana: katakana reading for this part (may be adapted by context rules)
- hira: hiragana reading (derived from kana)
- hepburn: Latin transcription (derived from kana)
Side-effects / notes:
- The function calls ``apply_context_rules(results, use_macron=...)``
which both mutates ``results`` in-place and returns it. This method
safely accepts the returned list and then recalculates ``hira`` and
``hepburn`` for entries whose ``kana`` was changed.
- If rule application fails, analysis still returns the best-effort
results.
"""
tokens = self.tokenizer_obj.tokenize(text, self.mode) tokens = self.tokenizer_obj.tokenize(text, self.mode)
results = [] results = []
@@ -127,11 +165,37 @@ class Transliterator:
parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron) parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
results.extend(parts) results.extend(parts)
# 文脈ルールを適用(別ファイル)
try:
results = apply_context_rules(results, use_macron=use_macron) or results
except Exception:
# ルール適用で失敗しても解析結果は返す
pass
# apply_context_rules が kana を書き換えた場合、hira と hepburn を再計算
for entry in results:
kana = entry.get("kana", "")
if kana:
entry["hira"] = self.kata_to_hira(kana)
entry["hepburn"] = katakana_to_hepburn(kana, use_macron=use_macron)
return results return results
# --- テスト --- # --- テスト ---
if __name__ == "__main__": if __name__ == "__main__":
import pprint
test_cases = [ test_cases = [
"向こうへ行く",
"行事を行う",
"上がる",
"上る",
"入り込む",
"",
"何が好き?",
"何色が好き?",
"何色ありますか?",
"何語ですか?",
"テーブルに色鉛筆は何色ありますか?"
"美しい花を見る", "美しい花を見る",
"東京に行く", "東京に行く",
"漢字とカタカナの混在", "漢字とカタカナの混在",
@@ -155,4 +219,4 @@ if __name__ == "__main__":
transliterator = Transliterator() transliterator = Transliterator()
for case in test_cases: for case in test_cases:
print(transliterator.analyze(case)) pprint.pprint(transliterator.analyze(case), sort_dicts=False)