diff --git a/src-python/models/transliteration/transliteration_context_rules.py b/src-python/models/transliteration/transliteration_context_rules.py index d0b5d339..35c25ec1 100644 --- a/src-python/models/transliteration/transliteration_context_rules.py +++ b/src-python/models/transliteration/transliteration_context_rules.py @@ -1,4 +1,4 @@ -from typing import List, Dict +from typing import List, Dict, Any import re """Contextual transliteration rules for tokenized results. @@ -33,7 +33,7 @@ DEFAULT_RULES = { -def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[Dict]: +def apply_context_rules(results: List[Dict[str, Any]], use_macron: bool = False) -> List[Dict[str, Any]]: """Apply contextual rewrite rules to `results`. Parameters @@ -50,7 +50,7 @@ def apply_context_rules(results: List[Dict], use_macron: bool = False) -> List[D """ # prepare rules: sort by priority (desc) and precompile regex where provided - raw_rules = DEFAULT_RULES.get("rules", []) + raw_rules: List[Dict[str, Any]] = DEFAULT_RULES.get("rules", []) rules = sorted(raw_rules, key=lambda r: r.get("priority", 0), reverse=True) for r in rules: if r.get("match_mode") == "regex" and r.get("pattern"): diff --git a/src-python/models/transliteration/transliteration_kana_to_hepburn.py b/src-python/models/transliteration/transliteration_kana_to_hepburn.py index e7ba04c2..d8c2b016 100644 --- a/src-python/models/transliteration/transliteration_kana_to_hepburn.py +++ b/src-python/models/transliteration/transliteration_kana_to_hepburn.py @@ -1,5 +1,7 @@ # katakana_to_hepburn.py # カタカナ -> ヘボン式ローマ字(パッケージ不要) +from typing import List + def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: """ @@ -8,7 +10,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: use_macron=False のときは単純に連続母音を残す(例: ou, oo)。 """ # 基本音の対応(主要なカタカナ) - base = { + base: dict = { 'ア':'a','イ':'i','ウ':'u','エ':'e','オ':'o', 'カ':'ka','キ':'ki','ク':'ku','ケ':'ke','コ':'ko', 'サ':'sa','シ':'shi','ス':'su','セ':'se','ソ':'so', @@ -31,7 +33,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: } # 拡張:子音 + 小ャユョ の組合せ(主要なもの) - digraphs = { + digraphs: dict = { ('キ','ャ'):'kya', ('キ','ュ'):'kyu', ('キ','ョ'):'kyo', ('ギ','ャ'):'gya', ('ギ','ュ'):'gyu', ('ギ','ョ'):'gyo', ('シ','ャ'):'sha', ('シ','ュ'):'shu', ('シ','ョ'):'sho', @@ -49,8 +51,8 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: # F-sounds (ファ フィ フェ フォ) ('フ','ァ'):'fa', ('フ','ィ'):'fi', ('フ','ェ'):'fe', ('フ','ォ'):'fo', # シェ チェ ティ etc. - ('シ','ェ'):'she', ('チ','ェ'):'che', - ('テ','ィ'):'ti', ('ト','ゥ'):'tu', ('ド','ゥ'):'du', + ('シ','ェ'):'she', ('チ','ェ'):'che', + ('テ','ィ'):'ti', ('ウ','ァ'):'wa', ('ウ','ィ'):'wi', ('ウ','ェ'):'we', ('ウ','ォ'):'wo', # その他外来語によくある組合せ ('ス','ィ'):'si', ('ズ','ィ'):'zi', ('ツ','ァ'):'tsa', ('ツ','ィ'):'tsi', ('ツ','ェ'):'tse', ('ツ','ォ'):'tso', @@ -78,7 +80,7 @@ def katakana_to_hepburn(kata: str, use_macron: bool = True) -> str: return rom # 母音がないなら全部 # 変換メイン - res = [] + res: List[str] = [] i = 0 kata = kata.strip() length = len(kata) diff --git a/src-python/models/transliteration/transliteration_transliterator.py b/src-python/models/transliteration/transliteration_transliterator.py index e25b3be4..8aff912e 100644 --- a/src-python/models/transliteration/transliteration_transliterator.py +++ b/src-python/models/transliteration/transliteration_transliterator.py @@ -1,5 +1,6 @@ from sudachipy import tokenizer from sudachipy import dictionary +from typing import List, Dict, Any try: from .transliteration_kana_to_hepburn import katakana_to_hepburn except ImportError: @@ -10,7 +11,7 @@ except ImportError: from transliteration_context_rules import apply_context_rules class Transliterator: - def __init__(self): + def __init__(self) -> None: self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create() self.mode = tokenizer.Tokenizer.SplitMode.C @@ -26,7 +27,7 @@ class Transliterator: ) @staticmethod - def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True): + def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True) -> List[Dict[str, str]]: """Split a single surface word and its kana reading into parts. Inputs: @@ -45,7 +46,7 @@ class Transliterator: constructed list. """ - result = [] + result: List[Dict[str, str]] = [] # 表層を「漢字ブロック」と「非漢字ブロック」に分割 buf = "" @@ -113,7 +114,7 @@ class Transliterator: return result - def analyze(self, text: str, use_macron: bool = False): + def analyze(self, text: str, use_macron: bool = False) -> List[Dict[str, Any]]: """Tokenize ``text`` and produce per-subunit reading information. Returns a list of dicts for each token/sub-part with keys: @@ -133,7 +134,7 @@ class Transliterator: tokens = self.tokenizer_obj.tokenize(text, self.mode) - results = [] + results: List[Dict[str, Any]] = [] for t in tokens: surface = t.surface() reading = t.reading_form()