型注釈を追加し、関数の戻り値を明示化。コードの可読性と型安全性を向上。

This commit is contained in:
misyaguziya
2025-10-09 17:07:21 +09:00
parent 7255722b67
commit 7d24b3839c
3 changed files with 16 additions and 13 deletions

View File

@@ -1,4 +1,4 @@
from typing import List, Dict from typing import List, Dict, Any
import re import re
"""Contextual transliteration rules for tokenized results. """Contextual transliteration rules for tokenized results.
@@ -33,7 +33,7 @@ DEFAULT_RULES = {
def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[Dict]: def apply_context_rules(results: List[Dict[str, Any]], use_macron: bool = False) -> List[Dict[str, Any]]:
"""Apply contextual rewrite rules to `results`. """Apply contextual rewrite rules to `results`.
Parameters Parameters
@@ -50,7 +50,7 @@ def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[D
""" """
# prepare rules: sort by priority (desc) and precompile regex where provided # prepare rules: sort by priority (desc) and precompile regex where provided
raw_rules = DEFAULT_RULES.get("rules", []) raw_rules: List[Dict[str, Any]] = DEFAULT_RULES.get("rules", [])
rules = sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True) rules = sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True)
for r in rules: for r in rules:
if r.get("match_mode") == "regex" and r.get("pattern"): if r.get("match_mode") == "regex" and r.get("pattern"):

View File

@@ -1,5 +1,7 @@
# katakana_to_hepburn.py # katakana_to_hepburn.py
# カタカナ -> ヘボン式ローマ字(パッケージ不要) # カタカナ -> ヘボン式ローマ字(パッケージ不要)
from typing import List
def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str:
""" """
@@ -8,7 +10,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str:
use_macron=False のときは単純に連続母音を残す(例: ou, oo use_macron=False のときは単純に連続母音を残す(例: ou, oo
""" """
# 基本音の対応(主要なカタカナ) # 基本音の対応(主要なカタカナ)
base = { base: dict = {
'':'a','':'i','':'u','':'e','':'o', '':'a','':'i','':'u','':'e','':'o',
'':'ka','':'ki','':'ku','':'ke','':'ko', '':'ka','':'ki','':'ku','':'ke','':'ko',
'':'sa','':'shi','':'su','':'se','':'so', '':'sa','':'shi','':'su','':'se','':'so',
@@ -31,7 +33,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str:
} }
# 拡張:子音 + 小ャユョ の組合せ(主要なもの) # 拡張:子音 + 小ャユョ の組合せ(主要なもの)
digraphs = { digraphs: dict = {
('',''):'kya', ('',''):'kyu', ('',''):'kyo', ('',''):'kya', ('',''):'kyu', ('',''):'kyo',
('',''):'gya', ('',''):'gyu', ('',''):'gyo', ('',''):'gya', ('',''):'gyu', ('',''):'gyo',
('',''):'sha', ('',''):'shu', ('',''):'sho', ('',''):'sha', ('',''):'shu', ('',''):'sho',
@@ -49,8 +51,8 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str:
# F-sounds (ファ フィ フェ フォ) # F-sounds (ファ フィ フェ フォ)
('',''):'fa', ('',''):'fi', ('',''):'fe', ('',''):'fo', ('',''):'fa', ('',''):'fi', ('',''):'fe', ('',''):'fo',
# シェ チェ ティ etc. # シェ チェ ティ etc.
('',''):'she', ('',''):'che', ('',''):'she', ('',''):'che',
('',''):'ti', ('',''):'tu', ('',''):'du', ('',''):'ti',
('',''):'wa', ('',''):'wi', ('',''):'we', ('',''):'wo', ('',''):'wa', ('',''):'wi', ('',''):'we', ('',''):'wo',
# その他外来語によくある組合せ # その他外来語によくある組合せ
('',''):'si', ('',''):'zi', ('',''):'tsa', ('',''):'tsi', ('',''):'tse', ('',''):'tso', ('',''):'si', ('',''):'zi', ('',''):'tsa', ('',''):'tsi', ('',''):'tse', ('',''):'tso',
@@ -78,7 +80,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str:
return rom # 母音がないなら全部 return rom # 母音がないなら全部
# 変換メイン # 変換メイン
res = [] res: List[str] = []
i = 0 i = 0
kata = kata.strip() kata = kata.strip()
length = len(kata) length = len(kata)

View File

@@ -1,5 +1,6 @@
from sudachipy import tokenizer from sudachipy import tokenizer
from sudachipy import dictionary from sudachipy import dictionary
from typing import List, Dict, Any
try: try:
from .transliteration_kana_to_hepburn import katakana_to_hepburn from .transliteration_kana_to_hepburn import katakana_to_hepburn
except ImportError: except ImportError:
@@ -10,7 +11,7 @@ except ImportError:
from transliteration_context_rules import apply_context_rules from transliteration_context_rules import apply_context_rules
class Transliterator: class Transliterator:
def __init__(self): def __init__(self) -> None:
self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create() self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
self.mode = tokenizer.Tokenizer.SplitMode.C self.mode = tokenizer.Tokenizer.SplitMode.C
@@ -26,7 +27,7 @@ class Transliterator:
) )
@staticmethod @staticmethod
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True): def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True) -> List[Dict[str, str]]:
"""Split a single surface word and its kana reading into parts. """Split a single surface word and its kana reading into parts.
Inputs: Inputs:
@@ -45,7 +46,7 @@ class Transliterator:
constructed list. constructed list.
""" """
result = [] result: List[Dict[str, str]] = []
# 表層を「漢字ブロック」と「非漢字ブロック」に分割 # 表層を「漢字ブロック」と「非漢字ブロック」に分割
buf = "" buf = ""
@@ -113,7 +114,7 @@ class Transliterator:
return result return result
def analyze(self, text: str, use_macron: bool = False): def analyze(self, text: str, use_macron: bool = False) -> List[Dict[str, Any]]:
"""Tokenize ``text`` and produce per-subunit reading information. """Tokenize ``text`` and produce per-subunit reading information.
Returns a list of dicts for each token/sub-part with keys: Returns a list of dicts for each token/sub-part with keys:
@@ -133,7 +134,7 @@ class Transliterator:
tokens = self.tokenizer_obj.tokenize(text, self.mode) tokens = self.tokenizer_obj.tokenize(text, self.mode)
results = [] results: List[Dict[str, Any]] = []
for t in tokens: for t in tokens:
surface = t.surface() surface = t.surface()
reading = t.reading_form() reading = t.reading_form()