[Update] Transliterator: Enhance transliteration control and improve tokenizer initialization

This commit is contained in:
misyaguziya
2025-10-04 22:25:55 +09:00
parent fe3fea34ff
commit 7b1e9136ee
3 changed files with 40 additions and 78 deletions

View File

@@ -915,12 +915,16 @@ class Controller:
@staticmethod
def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict:
    """Turn on romaji conversion of messages.

    When the romaji flag is currently off, the transliterator is started
    (unless hiragana conversion is on) and the flag is then set to True.
    Positional and keyword arguments are accepted but not used.

    Returns:
        A dict with "status" 200 and "result" holding the romaji flag as
        read back from config.
    """
    romaji_is_off = config.CONVERT_MESSAGE_TO_ROMAJI is False
    if romaji_is_off:
        # Only start the transliterator when hiragana conversion is off
        # too; otherwise it is presumably already running.
        hiragana_is_off = config.CONVERT_MESSAGE_TO_HIRAGANA is False
        if hiragana_is_off:
            model.startTransliteration()
        config.CONVERT_MESSAGE_TO_ROMAJI = True
    response = {"status": 200, "result": config.CONVERT_MESSAGE_TO_ROMAJI}
    return response
@staticmethod
def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict:
    """Turn off romaji conversion of messages.

    When the romaji flag is currently on, the transliterator is stopped
    (unless hiragana conversion is on) and the flag is then set to False.
    Positional and keyword arguments are accepted but not used.

    Returns:
        A dict with "status" 200 and "result" holding the romaji flag as
        read back from config.
    """
    romaji_is_on = config.CONVERT_MESSAGE_TO_ROMAJI is True
    if romaji_is_on:
        # Keep the transliterator alive while hiragana conversion is on;
        # stop it only when no conversion mode remains enabled.
        hiragana_is_off = config.CONVERT_MESSAGE_TO_HIRAGANA is False
        if hiragana_is_off:
            model.stopTransliteration()
        config.CONVERT_MESSAGE_TO_ROMAJI = False
    response = {"status": 200, "result": config.CONVERT_MESSAGE_TO_ROMAJI}
    return response
@@ -931,12 +935,16 @@ class Controller:
@staticmethod
def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict:
    """Turn on hiragana conversion of messages.

    When the hiragana flag is currently off, the transliterator is started
    (unless romaji conversion is on) and the flag is then set to True.
    Positional and keyword arguments are accepted but not used.

    Returns:
        A dict with "status" 200 and "result" holding the hiragana flag as
        read back from config.
    """
    hiragana_is_off = config.CONVERT_MESSAGE_TO_HIRAGANA is False
    if hiragana_is_off:
        # Only start the transliterator when romaji conversion is off
        # too; otherwise it is presumably already running.
        romaji_is_off = config.CONVERT_MESSAGE_TO_ROMAJI is False
        if romaji_is_off:
            model.startTransliteration()
        config.CONVERT_MESSAGE_TO_HIRAGANA = True
    response = {"status": 200, "result": config.CONVERT_MESSAGE_TO_HIRAGANA}
    return response
@staticmethod
def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict:
    """Turn off hiragana conversion of messages.

    When the hiragana flag is currently on, the transliterator is stopped
    (unless romaji conversion is on) and the flag is then set to False.
    Positional and keyword arguments are accepted but not used.

    Returns:
        A dict with "status" 200 and "result" holding the hiragana flag as
        read back from config.
    """
    hiragana_is_on = config.CONVERT_MESSAGE_TO_HIRAGANA is True
    if hiragana_is_on:
        # Keep the transliterator alive while romaji conversion is on;
        # stop it only when no conversion mode remains enabled.
        romaji_is_off = config.CONVERT_MESSAGE_TO_ROMAJI is False
        if romaji_is_off:
            model.stopTransliteration()
        config.CONVERT_MESSAGE_TO_HIRAGANA = False
    response = {"status": 200, "result": config.CONVERT_MESSAGE_TO_HIRAGANA}
    return response
@@ -2466,6 +2474,11 @@ class Controller:
self.updateDownloadedWhisperModelWeight() self.updateDownloadedWhisperModelWeight()
self.updateTranscriptionEngine() self.updateTranscriptionEngine()
# set Transliteration status
printLog("Set Transliteration")
if config.CONVERT_MESSAGE_TO_ROMAJI is True or config.CONVERT_MESSAGE_TO_HIRAGANA is True:
model.startTransliteration()
self.initializationProgress(3) self.initializationProgress(3)
# set word filter # set word filter

View File

@@ -99,7 +99,7 @@ class Model:
self.overlay_image = OverlayImage(config.PATH_LOCAL) self.overlay_image = OverlayImage(config.PATH_LOCAL)
self.mic_audio_queue = None self.mic_audio_queue = None
self.mic_mute_status = None self.mic_mute_status = None
self.transliterator = Transliterator() self.transliterator = None
self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL) self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL)
self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT) self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT)
self.websocket_server = None self.websocket_server = None
@@ -277,6 +277,14 @@ class Model:
self.previous_receive_message = message self.previous_receive_message = message
return repeat_flag return repeat_flag
def startTransliteration(self):
    """Create the Transliterator if none is loaded; otherwise do nothing."""
    if self.transliterator is not None:
        # An instance already exists, so it is reused as-is.
        return
    self.transliterator = Transliterator()
def stopTransliteration(self):
    """Discard the current Transliterator by resetting the attribute to None.

    A no-op when no Transliterator is loaded.
    """
    if self.transliterator is None:
        return
    self.transliterator = None
def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str: def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str:
if hiragana is False and romaji is False: if hiragana is False and romaji is False:
return message return message
@@ -287,6 +295,9 @@ class Model:
if romaji: if romaji:
keys_to_keep.add("hepburn") keys_to_keep.add("hepburn")
if self.transliterator is None:
self.startTransliteration()
data_list = self.transliterator.analyze(message, use_macron=False) data_list = self.transliterator.analyze(message, use_macron=False)
filtered_list = [ filtered_list = [
{key: value for key, value in item.items() if key in keys_to_keep} {key: value for key, value in item.items() if key in keys_to_keep}

View File

@@ -7,7 +7,7 @@ except ImportError:
class Transliterator: class Transliterator:
def __init__(self):
    """Build the tokenizer and select the split mode used for analysis."""
    # Explicitly request the "full" dictionary type instead of relying on
    # the library default.
    sudachi_dictionary = dictionary.Dictionary(dict_type="full")
    self.tokenizer_obj = sudachi_dictionary.create()
    self.mode = tokenizer.Tokenizer.SplitMode.C
@staticmethod @staticmethod
@@ -22,7 +22,7 @@ class Transliterator:
) )
@staticmethod @staticmethod
def split_kanji_okurigana(surface: str, reading_kana: str): def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
""" """
1語の表層形(surface)と読み(reading_kana)を 1語の表層形(surface)と読み(reading_kana)を
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割 [ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
@@ -69,15 +69,13 @@ class Transliterator:
# 空の読みを避ける # 空の読みを避ける
if not kana_for_kan and kana_left: if not kana_for_kan and kana_left:
kana_for_kan = kana_left[:1] kana_for_kan = kana_left[:1]
result.append( result.append({
{ "orig": part,
"orig": part, "kana": kana_for_kan,
"kana": kana_for_kan, "hira": Transliterator.kata_to_hira(kana_for_kan),
"hira": Transliterator.kata_to_hira(kana_for_kan), "hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True) })
}
)
kana_left = kana_left[len(kana_for_kan):] kana_left = kana_left[len(kana_for_kan):]
else: else:
# 非漢字部分(送り仮名など) # 非漢字部分(送り仮名など)
@@ -87,14 +85,14 @@ class Transliterator:
"orig": part, "orig": part,
"kana": kana_for_okuri, "kana": kana_for_okuri,
"hira": Transliterator.kata_to_hira(kana_for_okuri), "hira": Transliterator.kata_to_hira(kana_for_okuri),
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True) "hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
} }
) )
kana_left = kana_left[len(kana_for_okuri):] kana_left = kana_left[len(kana_for_okuri):]
return result return result
def analyze(self, text: str, use_macron: bool = True): def analyze(self, text: str, use_macron: bool = False):
tokens = self.tokenizer_obj.tokenize(text, self.mode) tokens = self.tokenizer_obj.tokenize(text, self.mode)
results = [] results = []
@@ -103,7 +101,7 @@ class Transliterator:
reading = t.reading_form() reading = t.reading_form()
pos = t.part_of_speech() pos = t.part_of_speech()
if pos and pos[0] in ["記号", "補助記号"]: if pos and pos[0] in ["記号", "補助記号", "空白"]:
reading = surface reading = surface
if surface == reading: if surface == reading:
@@ -125,69 +123,9 @@ class Transliterator:
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron) "hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
}) })
else: else:
# 複数文字の場合は文字種別で分割 # 複数文字の場合は既存のユーティリティで分割
i = 0 parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
reading_pos = 0 results.extend(parts)
while i < len(surface):
char = surface[i]
if self.is_kanji(char):
# 漢字の場合、連続する漢字をまとめて処理
kanji_block = ""
while i < len(surface) and self.is_kanji(surface[i]):
kanji_block += surface[i]
i += 1
# 漢字ブロックの読みを推定
if i < len(surface):
# 後に文字がある場合、送り仮名を考慮
remaining_chars = len(surface) - i
kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
else:
# 最後の漢字ブロックの場合
kanji_reading = reading[reading_pos:]
# 空の読みを避ける
if not kanji_reading and reading_pos < len(reading):
kanji_reading = reading[reading_pos:]
if not kanji_reading and kanji_block:
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
kanji_reading = reading[reading_pos:]
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
len_allocated_reading = len(kanji_reading)
if reading_pos + len_allocated_reading > len(reading):
len_allocated_reading = len(reading) - reading_pos
results.append({
"orig": kanji_block,
"kana": kanji_reading,
"hira": self.kata_to_hira(kanji_reading),
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
})
reading_pos += len_allocated_reading
else:
# 非漢字の場合
non_kanji_block = ""
while i < len(surface) and not self.is_kanji(surface[i]):
non_kanji_block += surface[i]
i += 1
# 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
len_block = len(non_kanji_block)
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
# 割り当てられた読みの長さ
len_allocated_reading = len(non_kanji_reading)
results.append({
"orig": non_kanji_block,
"kana": non_kanji_reading,
"hira": self.kata_to_hira(non_kanji_reading),
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
})
reading_pos += len_allocated_reading
return results return results