"""Contextual transliteration rules for tokenized results.

This module provides a compact rule engine that can modify token
readings (kana) based on neighboring tokens. Rules are embedded in
``DEFAULT_RULES`` to simplify packaging (no external JSON required).

Key points
- Rules are applied in descending ``priority`` order (default priority 0).
- Supported match modes: ``equals`` (exact match) and ``regex``.
- ``direction`` chooses whether to inspect the next or previous token.
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
  ``hira``/``hepburn``; callers should recompute them after rules run.

The engine mutates the provided ``results`` list in-place and also
returns it for convenience. Rule definitions themselves are never
modified.
"""

import logging
import re
from typing import Dict, List, Optional

logger = logging.getLogger(__name__)

DEFAULT_RULES = {
    "rules": [
        {
            "name": "nan_next_tdna",
            "target": "何",
            "match_mode": "equals",
            "direction": "next",
            "kana_set": list("タチツテトダヂヅデドナニヌネノ"),
            "on_true": {"kana": "ナン"},
            "on_false": {"kana": "ナニ"},
        }
    ]
}


def _prepare_rules(raw_rules: List[Dict]) -> List[Dict]:
    """Return priority-sorted private copies of ``raw_rules`` ready for matching.

    Each copy carries two extra keys: ``_re`` (compiled pattern or ``None``)
    and ``_kana_set`` (a frozenset built once instead of once per token).
    Working on copies keeps the caller's rule dicts free of compiled objects.
    """
    prepared = []
    ordered = sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True)
    for rule in ordered:
        compiled = None
        if rule.get("match_mode") == "regex" and rule.get("pattern"):
            try:
                compiled = re.compile(rule["pattern"])
            except (re.error, TypeError) as exc:
                # Best effort: a broken pattern disables only this rule.
                logger.warning(
                    "Ignoring context rule %r: invalid pattern %r (%s)",
                    rule.get("name"), rule.get("pattern"), exc,
                )
        ready = dict(rule)
        ready["_re"] = compiled
        ready["_kana_set"] = frozenset(rule.get("kana_set") or ())
        prepared.append(ready)
    return prepared


def _rule_matches(rule: Dict, orig: str) -> bool:
    """Tell whether a prepared ``rule`` targets the token surface ``orig``."""
    mode = rule.get("match_mode", "equals")
    if mode == "equals":
        return orig == rule.get("target")
    if mode == "regex":
        compiled = rule["_re"]
        return compiled is not None and compiled.search(orig) is not None
    # Unknown match modes never match.
    return False


def _find_neighbor(results: List[Dict], index: int, direction: str) -> Optional[Dict]:
    """Return the nearest entry with a non-empty ``orig`` in ``direction``."""
    if direction == "next":
        indices = range(index + 1, len(results))
    elif direction == "prev":
        indices = range(index - 1, -1, -1)
    else:
        return None
    return next((results[j] for j in indices if results[j].get("orig")), None)


def _neighbor_starts_with(neighbor: Optional[Dict], kana_set: frozenset) -> bool:
    """Tell whether the neighbor's reading begins with a kana from ``kana_set``."""
    if not neighbor:
        return False
    kana = neighbor.get("kana", "")
    if kana:
        return kana[0] in kana_set
    # No reading available: fall back to the surface form, but only when its
    # first character is itself katakana.
    head = neighbor.get("orig", "")[:1]
    return bool(head) and "ァ" <= head <= "ン" and head in kana_set


def apply_context_rules(
    results: List[Dict],
    use_macron: bool = False,
    *,
    rules: Optional[List[Dict]] = None,
) -> List[Dict]:
    """Apply contextual rewrite rules to ``results``.

    Parameters
    - results: list of token dicts; each entry is read through the keys
      'orig' and 'kana', and 'kana', 'hira', 'hepburn' may be overwritten.
    - use_macron: accepted for call compatibility; the rules do not use it.
    - rules: optional list of rule dicts to use instead of
      ``DEFAULT_RULES["rules"]``. The dicts are not modified.

    Returns
    - The same ``results`` list object, modified in-place.

    For each token with a non-empty 'orig', rules are tried in descending
    priority. A matching rule inspects the nearest neighbor (next/prev) that
    has a non-empty 'orig' and selects ``on_true`` when that neighbor's
    reading starts with a kana in the rule's ``kana_set``, otherwise
    ``on_false``. If the selected action has a 'kana' key, the token's
    'kana' is overwritten, 'hira' and 'hepburn' are cleared (the caller
    must recompute them), and no further rules are tried for that token.
    """
    raw_rules = DEFAULT_RULES.get("rules", []) if rules is None else rules
    prepared = _prepare_rules(raw_rules)

    for index, entry in enumerate(results):
        orig = entry.get("orig", "")
        # Skip tokens with empty orig (symbols, whitespace, etc.).
        if not orig:
            continue

        for rule in prepared:
            if not _rule_matches(rule, orig):
                continue

            neighbor = _find_neighbor(results, index, rule.get("direction", "next"))
            hit = _neighbor_starts_with(neighbor, rule["_kana_set"])
            action = rule.get("on_true", {}) if hit else rule.get("on_false", {})

            if "kana" in action:
                entry["kana"] = action["kana"]
                entry["hira"] = ""
                entry["hepburn"] = ""
                # Once a rule has rewritten this token, stop.
                break

    return results
] に分割 - """ + result = [] # 表層を「漢字ブロック」と「非漢字ブロック」に分割 @@ -46,53 +64,73 @@ class Transliterator: # 読みを分配 kana_left = reading_kana - for i, (is_kan, part) in enumerate(blocks): - if is_kan: - # 漢字ブロックの処理 - if len(blocks) == 1: - # 単一ブロック(全て漢字)の場合 - kana_for_kan = kana_left - elif i == len(blocks) - 1: - # 最後のブロック(漢字)の場合 - kana_for_kan = kana_left - else: - # 中間の漢字ブロックの場合 - # 後続の非漢字ブロックの文字数を計算 - remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k) - if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji: - kana_for_kan = kana_left[:-remaining_non_kanji] - else: - # 漢字1文字あたり最低1文字の読みを割り当て - min_kana = len(part) - kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)] - - # 空の読みを避ける - if not kana_for_kan and kana_left: - kana_for_kan = kana_left[:1] + # We'll allocate kana to each block by initial guess = len(part) (characters) + # and distribute any remaining kana left-to-right preferring kanji blocks. + kana_len = len(kana_left) - result.append({ - "orig": part, - "kana": kana_for_kan, - "hira": Transliterator.kata_to_hira(kana_for_kan), - "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron) - }) - kana_left = kana_left[len(kana_for_kan):] - else: - # 非漢字部分(送り仮名など) - kana_for_okuri = kana_left[:len(part)] - result.append( - { - "orig": part, - "kana": kana_for_okuri, - "hira": Transliterator.kata_to_hira(kana_for_okuri), - "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron) - } - ) - kana_left = kana_left[len(kana_for_okuri):] + # initial allocation per block + allocs = [len(part) for _, part in blocks] + allocated = sum(allocs) + remaining = kana_len - allocated + + # distribute extra kana to kanji blocks first (left-to-right) + if remaining > 0: + for idx, (is_kan, _) in enumerate(blocks): + if remaining <= 0: + break + if is_kan: + allocs[idx] += 1 + remaining -= 1 + # if still remaining, distribute to all blocks left-to-right + idx = 0 + while remaining > 0 and len(blocks) > 
0: + allocs[idx] += 1 + remaining -= 1 + idx = (idx + 1) % len(blocks) + + # if remaining < 0 (reading shorter than base), shrink allocations from right + if remaining < 0: + # remove from rightmost blocks as needed + need = -remaining + idx = len(blocks) - 1 + while need > 0 and idx >= 0: + take = min(allocs[idx] - 1, need) if allocs[idx] > 1 else 0 + allocs[idx] -= take + need -= take + idx -= 1 + + # now slice kana_left according to allocs + pos = 0 + for (is_kan, part), cnt in zip(blocks, allocs): + kana_for_part = kana_left[pos:pos+cnt] + pos += cnt + result.append({ + "orig": part, + "kana": kana_for_part, + "hira": Transliterator.kata_to_hira(kana_for_part), + "hepburn": katakana_to_hepburn(kana_for_part, use_macron=use_macron) + }) return result def analyze(self, text: str, use_macron: bool = False): + """Tokenize ``text`` and produce per-subunit reading information. + + Returns a list of dicts for each token/sub-part with keys: + - orig: original surface string (one or more characters) + - kana: katakana reading for this part (may be adapted by context rules) + - hira: hiragana reading (derived from kana) + - hepburn: Latin transcription (derived from kana) + + Side-effects / notes: + - The function calls ``apply_context_rules(results, use_macron=...)`` + which both mutates ``results`` in-place and returns it. This method + safely accepts the returned list and then recalculates ``hira`` and + ``hepburn`` for entries whose ``kana`` was changed. + - If rule application fails, analysis still returns the best-effort + results. 
+ """ + tokens = self.tokenizer_obj.tokenize(text, self.mode) results = [] @@ -127,11 +165,37 @@ class Transliterator: parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron) results.extend(parts) + # 文脈ルールを適用(別ファイル) + try: + results = apply_context_rules(results, use_macron=use_macron) or results + except Exception: + # ルール適用で失敗しても解析結果は返す + pass + + # apply_context_rules が kana を書き換えた場合、hira と hepburn を再計算 + for entry in results: + kana = entry.get("kana", "") + if kana: + entry["hira"] = self.kata_to_hira(kana) + entry["hepburn"] = katakana_to_hepburn(kana, use_macron=use_macron) + return results # --- テスト --- if __name__ == "__main__": + import pprint test_cases = [ + "向こうへ行く", + "行事を行う", + "上がる", + "上る", + "入り込む", + "何", + "何が好き?", + "何色が好き?", + "何色ありますか?", + "何語ですか?", + "テーブルに色鉛筆は何色ありますか?" "美しい花を見る", "東京に行く", "漢字とカタカナの混在", @@ -155,4 +219,4 @@ if __name__ == "__main__": transliterator = Transliterator() for case in test_cases: - print(transliterator.analyze(case)) \ No newline at end of file + pprint.pprint(transliterator.analyze(case), sort_dicts=False) \ No newline at end of file