From 7b1e9136ee61f6386d224bf1d3c3879ed126cd3c Mon Sep 17 00:00:00 2001 From: misyaguziya <53165965+misyaguziya@users.noreply.github.com> Date: Sat, 4 Oct 2025 22:25:55 +0900 Subject: [PATCH] [Update] Transliterator: Enhance transliteration control and improve tokenizer initialization --- src-python/controller.py | 13 +++ src-python/model.py | 13 ++- .../transliteration_transliterator.py | 92 +++---------------- 3 files changed, 40 insertions(+), 78 deletions(-) diff --git a/src-python/controller.py b/src-python/controller.py index 16856ddc..5c360a91 100644 --- a/src-python/controller.py +++ b/src-python/controller.py @@ -915,12 +915,16 @@ class Controller: @staticmethod def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict: if config.CONVERT_MESSAGE_TO_ROMAJI is False: + if config.CONVERT_MESSAGE_TO_HIRAGANA is False: + model.startTransliteration() config.CONVERT_MESSAGE_TO_ROMAJI = True return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI} @staticmethod def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict: if config.CONVERT_MESSAGE_TO_ROMAJI is True: + if config.CONVERT_MESSAGE_TO_HIRAGANA is False: + model.stopTransliteration() config.CONVERT_MESSAGE_TO_ROMAJI = False return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI} @@ -931,12 +935,16 @@ class Controller: @staticmethod def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict: if config.CONVERT_MESSAGE_TO_HIRAGANA is False: + if config.CONVERT_MESSAGE_TO_ROMAJI is False: + model.startTransliteration() config.CONVERT_MESSAGE_TO_HIRAGANA = True return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA} @staticmethod def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict: if config.CONVERT_MESSAGE_TO_HIRAGANA is True: + if config.CONVERT_MESSAGE_TO_ROMAJI is False: + model.stopTransliteration() config.CONVERT_MESSAGE_TO_HIRAGANA = False return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA} @@ -2466,6 +2474,11 @@ class Controller: self.updateDownloadedWhisperModelWeight() self.updateTranscriptionEngine() + # set Transliteration status + printLog("Set Transliteration") + if config.CONVERT_MESSAGE_TO_ROMAJI is True or config.CONVERT_MESSAGE_TO_HIRAGANA is True: + model.startTransliteration() + self.initializationProgress(3) # set word filter diff --git a/src-python/model.py b/src-python/model.py index 9d29c2d0..6048c630 100644 --- a/src-python/model.py +++ b/src-python/model.py @@ -99,7 +99,7 @@ class Model: self.overlay_image = OverlayImage(config.PATH_LOCAL) self.mic_audio_queue = None self.mic_mute_status = None - self.transliterator = Transliterator() + self.transliterator = None self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL) self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT) self.websocket_server = None @@ -277,6 +277,14 @@ class Model: self.previous_receive_message = message return repeat_flag + def startTransliteration(self): + if self.transliterator is None: + self.transliterator = Transliterator() + + def stopTransliteration(self): + if self.transliterator is not None: + self.transliterator = None + def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str: if hiragana is False and romaji is False: return message @@ -287,6 +295,9 @@ class Model: if romaji: keys_to_keep.add("hepburn") + if self.transliterator is None: + self.startTransliteration() + data_list = self.transliterator.analyze(message, use_macron=False) filtered_list = [ {key: value for key, value in item.items() if key in keys_to_keep} diff --git a/src-python/models/transliteration/transliteration_transliterator.py b/src-python/models/transliteration/transliteration_transliterator.py index 7c85ebee..b8e64f7d 100644 --- a/src-python/models/transliteration/transliteration_transliterator.py +++ b/src-python/models/transliteration/transliteration_transliterator.py @@ -7,7 +7,7 @@ except ImportError: class Transliterator: def __init__(self): - self.tokenizer_obj = dictionary.Dictionary().create() + self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create() self.mode = tokenizer.Tokenizer.SplitMode.C @staticmethod @@ -22,7 +22,7 @@ class Transliterator: ) @staticmethod - def split_kanji_okurigana(surface: str, reading_kana: str): + def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True): """ 1語の表層形(surface)と読み(reading_kana)を [ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割 @@ -69,15 +69,13 @@ class Transliterator: # 空の読みを避ける if not kana_for_kan and kana_left: kana_for_kan = kana_left[:1] - - result.append( - { - "orig": part, - "kana": kana_for_kan, - "hira": Transliterator.kata_to_hira(kana_for_kan), - "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True) - } - ) + + result.append({ + "orig": part, + "kana": kana_for_kan, + "hira": Transliterator.kata_to_hira(kana_for_kan), + "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron) + }) kana_left = kana_left[len(kana_for_kan):] else: # 非漢字部分(送り仮名など) @@ -87,14 +85,14 @@ class Transliterator: "orig": part, "kana": kana_for_okuri, "hira": Transliterator.kata_to_hira(kana_for_okuri), - "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True) + "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron) } ) kana_left = kana_left[len(kana_for_okuri):] return result - def analyze(self, text: str, use_macron: bool = True): + def analyze(self, text: str, use_macron: bool = False): tokens = self.tokenizer_obj.tokenize(text, self.mode) results = [] @@ -103,7 +101,7 @@ class Transliterator: reading = t.reading_form() pos = t.part_of_speech() - if pos and pos[0] in ["記号", "補助記号"]: + if pos and pos[0] in ["記号", "補助記号", "空白"]: reading = surface if surface == reading: @@ -125,69 +123,9 @@ class Transliterator: "hepburn": katakana_to_hepburn(reading, use_macron=use_macron) }) else: - # 複数文字の場合は文字種別で分割 - i = 0 - reading_pos = 0 - - while i < len(surface): - char = surface[i] - - if self.is_kanji(char): - # 漢字の場合、連続する漢字をまとめて処理 - kanji_block = "" - while i < len(surface) and self.is_kanji(surface[i]): - kanji_block += surface[i] - i += 1 - - # 漢字ブロックの読みを推定 - if i < len(surface): - # 後に文字がある場合、送り仮名を考慮 - remaining_chars = len(surface) - i - kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:] - else: - # 最後の漢字ブロックの場合 - kanji_reading = reading[reading_pos:] - - # 空の読みを避ける - if not kanji_reading and reading_pos < len(reading): - kanji_reading = reading[reading_pos:] - if not kanji_reading and kanji_block: - # 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる - kanji_reading = reading[reading_pos:] - - # reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック - len_allocated_reading = len(kanji_reading) - if reading_pos + len_allocated_reading > len(reading): - len_allocated_reading = len(reading) - reading_pos - - results.append({ - "orig": kanji_block, - "kana": kanji_reading, - "hira": self.kata_to_hira(kanji_reading), - "hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron) - }) - reading_pos += len_allocated_reading - else: - # 非漢字の場合 - non_kanji_block = "" - while i < len(surface) and not self.is_kanji(surface[i]): - non_kanji_block += surface[i] - i += 1 - - # 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ) - len_block = len(non_kanji_block) - non_kanji_reading = reading[reading_pos:reading_pos + len_block] - - # 割り当てられた読みの長さ - len_allocated_reading = len(non_kanji_reading) - - results.append({ - "orig": non_kanji_block, - "kana": non_kanji_reading, - "hira": self.kata_to_hira(non_kanji_reading), - "hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron) - }) - reading_pos += len_allocated_reading + # 複数文字の場合は既存のユーティリティで分割 + parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron) + results.extend(parts) return results