[Feature] Transliterator: Implement contextual transliteration rules and integrate with analysis method
This commit is contained in:
@@ -0,0 +1,134 @@
|
|||||||
|
from typing import List, Dict
|
||||||
|
import re
|
||||||
|
|
||||||
|
"""Contextual transliteration rules for tokenized results.
|
||||||
|
|
||||||
|
This module provides a compact rule engine that can modify token
|
||||||
|
readings (kana) based on neighboring tokens. Rules are embedded in
|
||||||
|
``DEFAULT_RULES`` to simplify packaging (no external JSON required).
|
||||||
|
|
||||||
|
Key points
|
||||||
|
- Rules are applied in descending ``priority`` order.
|
||||||
|
- Supported match modes: ``equals`` (exact match) and ``regex``.
|
||||||
|
- ``direction`` chooses whether to inspect the next or previous token.
|
||||||
|
- When a rule sets ``kana``, the engine overwrites ``kana`` and clears
|
||||||
|
``hira``/``hepburn``; callers should recompute them after rules run.
|
||||||
|
|
||||||
|
The engine mutates the provided ``results`` list in-place and also
|
||||||
|
returns it for convenience.
|
||||||
|
"""
|
||||||
|
# Embedded rule table. Each rule is a dict with these keys:
#   name        - label for the rule (not read by the engine)
#   target      - surface string compared with a token's ``orig`` ("equals" mode)
#   pattern     - regular expression searched in ``orig`` ("regex" mode)
#   match_mode  - "equals" (default) or "regex"
#   direction   - "next" (default) or "prev": which neighbor token to inspect
#   kana_set    - characters that make the neighbor condition true
#   on_true     - action used when the neighbor condition holds
#   on_false    - action used otherwise (including when there is no neighbor)
#   priority    - optional number; higher priorities are tried first (default 0)
# The single built-in rule sets the reading of a standalone "何" token to "ナン"
# when the following token starts with one of the listed ta/da/na-row kana,
# and to "ナニ" otherwise.
DEFAULT_RULES = {
    "rules": [
        {
            "name": "nan_next_tdna",
            "target": "何",
            "match_mode": "equals",
            "direction": "next",
            "kana_set": list("タチツテトダヂヅデドナニヌネノ"),
            "on_true": {"kana": "ナン"},
            "on_false": {"kana": "ナニ"},
        },
    ],
}


def _prepare_rules(raw_rules):
    """Return ``(rule, compiled_regex, kana_set)`` tuples sorted by priority.

    The derived values (compiled pattern, kana lookup set) are kept beside the
    rule instead of being written into it, so the shared rule dicts are never
    modified. ``compiled_regex`` is ``None`` for non-regex rules and for rules
    whose pattern cannot be compiled; such a regex rule simply never matches.
    """
    prepared = []
    # sorted() is stable, so rules with equal priority keep their listed order.
    for rule in sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True):
        compiled = None
        if rule.get("match_mode") == "regex" and rule.get("pattern"):
            try:
                compiled = re.compile(rule["pattern"])
            except (re.error, TypeError):
                # An unusable pattern disables the rule rather than aborting.
                compiled = None
        prepared.append((rule, compiled, frozenset(rule.get("kana_set", []))))
    return prepared


def _nearest_neighbor(results, index, direction):
    """Return the closest entry with a non-empty ``orig`` in ``direction``.

    Entries with an empty ``orig`` are skipped. Returns ``None`` when no such
    entry exists or when ``direction`` is neither "next" nor "prev".
    """
    if direction == "next":
        candidates = range(index + 1, len(results))
    elif direction == "prev":
        candidates = range(index - 1, -1, -1)
    else:
        return None
    for j in candidates:
        if results[j].get("orig"):
            return results[j]
    return None


def _neighbor_triggers(neighbor, kana_set):
    """Return True when the neighbor's first kana is in ``kana_set``.

    The neighbor's ``kana`` reading is checked first. Only when that reading
    is empty does the check fall back to the first character of ``orig``,
    which must lie in the katakana range 'ァ'..'ン'.
    """
    if not neighbor:
        return False
    reading = neighbor.get("kana", "")
    if reading:
        return reading[0] in kana_set
    first = neighbor.get("orig", "")[:1]
    return bool(first) and "ァ" <= first <= "ン" and first in kana_set


def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[Dict]:
    """Apply the contextual rewrite rules in ``DEFAULT_RULES`` to ``results``.

    Parameters
    - results: list of token dicts; the engine reads 'orig' and 'kana' and may
      overwrite 'kana', 'hira' and 'hepburn'.
    - use_macron: accepted for compatibility; not used by the rules.

    Returns
    - The same ``results`` list, which is also modified in-place.

    For every token with a non-empty ``orig`` the rules are tried in descending
    ``priority`` order. The first rule whose ``target`` equals ``orig``
    ("equals" mode) or whose ``pattern`` is found in ``orig`` ("regex" mode)
    wins; no further rules are tried for that token. The winning rule inspects
    the nearest neighbor with a non-empty ``orig`` in its ``direction`` and
    picks ``on_true`` or ``on_false``. If the chosen action has a ``kana``
    key, the token's ``kana`` is overwritten and ``hira``/``hepburn`` are
    cleared to empty strings; the caller must recompute them.

    ``DEFAULT_RULES`` itself is never modified by this function.
    """
    rules = _prepare_rules(DEFAULT_RULES.get("rules", []))

    for index, entry in enumerate(results):
        orig = entry.get("orig", "")
        # Tokens with an empty orig are neither rewritten nor used as neighbors.
        if not orig:
            continue

        for rule, compiled, kana_set in rules:
            mode = rule.get("match_mode", "equals")
            if mode == "equals":
                matched = orig == rule.get("target")
            elif mode == "regex":
                matched = compiled is not None and compiled.search(orig) is not None
            else:
                matched = False
            if not matched:
                continue

            neighbor = _nearest_neighbor(results, index, rule.get("direction", "next"))
            if _neighbor_triggers(neighbor, kana_set):
                action = rule.get("on_true", {})
            else:
                action = rule.get("on_false", {})

            if "kana" in action:
                entry["kana"] = action["kana"]
                # Derived readings are now stale; the caller recomputes them.
                entry["hira"] = ""
                entry["hepburn"] = ""
            # Only the first matching rule is applied to a token.
            break

    return results
|
||||||
@@ -4,6 +4,10 @@ try:
|
|||||||
from .transliteration_kana_to_hepburn import katakana_to_hepburn
|
from .transliteration_kana_to_hepburn import katakana_to_hepburn
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from transliteration_kana_to_hepburn import katakana_to_hepburn
|
from transliteration_kana_to_hepburn import katakana_to_hepburn
|
||||||
|
try:
|
||||||
|
from .transliteration_context_rules import apply_context_rules
|
||||||
|
except ImportError:
|
||||||
|
from transliteration_context_rules import apply_context_rules
|
||||||
|
|
||||||
class Transliterator:
|
class Transliterator:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -23,10 +27,24 @@ class Transliterator:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
|
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
|
||||||
|
"""Split a single surface word and its kana reading into parts.
|
||||||
|
|
||||||
|
Inputs:
|
||||||
|
- surface: the surface form (may contain kanji + kana)
|
||||||
|
- reading_kana: the katakana reading for the whole surface
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- a list of dicts: [{"orig": str, "kana": str, "hira": str, "hepburn": str}, ...]
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- The function allocates portions of ``reading_kana`` to each contiguous
|
||||||
|
kanji/non-kanji block in ``surface``. Allocation is heuristic: an
|
||||||
|
initial allocation based on block length is used and any remainder is
|
||||||
|
distributed left-to-right preferring kanji blocks.
|
||||||
|
- This function is pure (no external side effects) and returns the
|
||||||
|
constructed list.
|
||||||
"""
|
"""
|
||||||
1語の表層形(surface)と読み(reading_kana)を
|
|
||||||
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
|
||||||
"""
|
|
||||||
result = []
|
result = []
|
||||||
|
|
||||||
# 表層を「漢字ブロック」と「非漢字ブロック」に分割
|
# 表層を「漢字ブロック」と「非漢字ブロック」に分割
|
||||||
@@ -46,53 +64,73 @@ class Transliterator:
|
|||||||
|
|
||||||
# 読みを分配
|
# 読みを分配
|
||||||
kana_left = reading_kana
|
kana_left = reading_kana
|
||||||
for i, (is_kan, part) in enumerate(blocks):
|
# We'll allocate kana to each block by initial guess = len(part) (characters)
|
||||||
if is_kan:
|
# and distribute any remaining kana left-to-right preferring kanji blocks.
|
||||||
# 漢字ブロックの処理
|
kana_len = len(kana_left)
|
||||||
if len(blocks) == 1:
|
|
||||||
# 単一ブロック(全て漢字)の場合
|
|
||||||
kana_for_kan = kana_left
|
|
||||||
elif i == len(blocks) - 1:
|
|
||||||
# 最後のブロック(漢字)の場合
|
|
||||||
kana_for_kan = kana_left
|
|
||||||
else:
|
|
||||||
# 中間の漢字ブロックの場合
|
|
||||||
# 後続の非漢字ブロックの文字数を計算
|
|
||||||
remaining_non_kanji = sum(len(p) for is_k, p in blocks[i+1:] if not is_k)
|
|
||||||
if remaining_non_kanji > 0 and len(kana_left) > remaining_non_kanji:
|
|
||||||
kana_for_kan = kana_left[:-remaining_non_kanji]
|
|
||||||
else:
|
|
||||||
# 漢字1文字あたり最低1文字の読みを割り当て
|
|
||||||
min_kana = len(part)
|
|
||||||
kana_for_kan = kana_left[:max(min_kana, len(kana_left) - remaining_non_kanji)]
|
|
||||||
|
|
||||||
# 空の読みを避ける
|
|
||||||
if not kana_for_kan and kana_left:
|
|
||||||
kana_for_kan = kana_left[:1]
|
|
||||||
|
|
||||||
result.append({
|
# initial allocation per block
|
||||||
"orig": part,
|
allocs = [len(part) for _, part in blocks]
|
||||||
"kana": kana_for_kan,
|
allocated = sum(allocs)
|
||||||
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
remaining = kana_len - allocated
|
||||||
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
|
|
||||||
})
|
# distribute extra kana to kanji blocks first (left-to-right)
|
||||||
kana_left = kana_left[len(kana_for_kan):]
|
if remaining > 0:
|
||||||
else:
|
for idx, (is_kan, _) in enumerate(blocks):
|
||||||
# 非漢字部分(送り仮名など)
|
if remaining <= 0:
|
||||||
kana_for_okuri = kana_left[:len(part)]
|
break
|
||||||
result.append(
|
if is_kan:
|
||||||
{
|
allocs[idx] += 1
|
||||||
"orig": part,
|
remaining -= 1
|
||||||
"kana": kana_for_okuri,
|
# if still remaining, distribute to all blocks left-to-right
|
||||||
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
idx = 0
|
||||||
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
|
while remaining > 0 and len(blocks) > 0:
|
||||||
}
|
allocs[idx] += 1
|
||||||
)
|
remaining -= 1
|
||||||
kana_left = kana_left[len(kana_for_okuri):]
|
idx = (idx + 1) % len(blocks)
|
||||||
|
|
||||||
|
# if remaining < 0 (reading shorter than base), shrink allocations from right
|
||||||
|
if remaining < 0:
|
||||||
|
# remove from rightmost blocks as needed
|
||||||
|
need = -remaining
|
||||||
|
idx = len(blocks) - 1
|
||||||
|
while need > 0 and idx >= 0:
|
||||||
|
take = min(allocs[idx] - 1, need) if allocs[idx] > 1 else 0
|
||||||
|
allocs[idx] -= take
|
||||||
|
need -= take
|
||||||
|
idx -= 1
|
||||||
|
|
||||||
|
# now slice kana_left according to allocs
|
||||||
|
pos = 0
|
||||||
|
for (is_kan, part), cnt in zip(blocks, allocs):
|
||||||
|
kana_for_part = kana_left[pos:pos+cnt]
|
||||||
|
pos += cnt
|
||||||
|
result.append({
|
||||||
|
"orig": part,
|
||||||
|
"kana": kana_for_part,
|
||||||
|
"hira": Transliterator.kata_to_hira(kana_for_part),
|
||||||
|
"hepburn": katakana_to_hepburn(kana_for_part, use_macron=use_macron)
|
||||||
|
})
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def analyze(self, text: str, use_macron: bool = False):
|
def analyze(self, text: str, use_macron: bool = False):
|
||||||
|
"""Tokenize ``text`` and produce per-subunit reading information.
|
||||||
|
|
||||||
|
Returns a list of dicts for each token/sub-part with keys:
|
||||||
|
- orig: original surface string (one or more characters)
|
||||||
|
- kana: katakana reading for this part (may be adapted by context rules)
|
||||||
|
- hira: hiragana reading (derived from kana)
|
||||||
|
- hepburn: Latin transcription (derived from kana)
|
||||||
|
|
||||||
|
Side-effects / notes:
|
||||||
|
- The function calls ``apply_context_rules(results, use_macron=...)``
|
||||||
|
which both mutates ``results`` in-place and returns it. This method
|
||||||
|
safely accepts the returned list and then recalculates ``hira`` and
|
||||||
|
``hepburn`` for entries whose ``kana`` was changed.
|
||||||
|
- If rule application fails, analysis still returns the best-effort
|
||||||
|
results.
|
||||||
|
"""
|
||||||
|
|
||||||
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
@@ -127,11 +165,37 @@ class Transliterator:
|
|||||||
parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
|
parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
|
||||||
results.extend(parts)
|
results.extend(parts)
|
||||||
|
|
||||||
|
# 文脈ルールを適用(別ファイル)
|
||||||
|
try:
|
||||||
|
results = apply_context_rules(results, use_macron=use_macron) or results
|
||||||
|
except Exception:
|
||||||
|
# ルール適用で失敗しても解析結果は返す
|
||||||
|
pass
|
||||||
|
|
||||||
|
# apply_context_rules が kana を書き換えた場合、hira と hepburn を再計算
|
||||||
|
for entry in results:
|
||||||
|
kana = entry.get("kana", "")
|
||||||
|
if kana:
|
||||||
|
entry["hira"] = self.kata_to_hira(kana)
|
||||||
|
entry["hepburn"] = katakana_to_hepburn(kana, use_macron=use_macron)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# --- テスト ---
|
# --- テスト ---
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
import pprint
|
||||||
test_cases = [
|
test_cases = [
|
||||||
|
"向こうへ行く",
|
||||||
|
"行事を行う",
|
||||||
|
"上がる",
|
||||||
|
"上る",
|
||||||
|
"入り込む",
|
||||||
|
"何",
|
||||||
|
"何が好き?",
|
||||||
|
"何色が好き?",
|
||||||
|
"何色ありますか?",
|
||||||
|
"何語ですか?",
|
||||||
|
"テーブルに色鉛筆は何色ありますか?"
|
||||||
"美しい花を見る",
|
"美しい花を見る",
|
||||||
"東京に行く",
|
"東京に行く",
|
||||||
"漢字とカタカナの混在",
|
"漢字とカタカナの混在",
|
||||||
@@ -155,4 +219,4 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
transliterator = Transliterator()
|
transliterator = Transliterator()
|
||||||
for case in test_cases:
|
for case in test_cases:
|
||||||
print(transliterator.analyze(case))
|
pprint.pprint(transliterator.analyze(case), sort_dicts=False)
|
||||||
Reference in New Issue
Block a user