[Update] Transliterator: Enhance transliteration control and improve tokenizer initialization
This commit is contained in:
@@ -915,12 +915,16 @@ class Controller:
|
||||
@staticmethod
def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict:
    """Switch romaji conversion on and report the resulting flag.

    Nothing is changed unless the romaji flag is currently ``False``.
    In that case ``model.startTransliteration()`` is called first when
    the hiragana flag is ``False`` as well, and the romaji flag is then
    set to ``True``.

    Returns:
        A dict with ``"status"`` 200 and the romaji flag as ``"result"``.
    """
    romaji_is_off = config.CONVERT_MESSAGE_TO_ROMAJI is False
    if romaji_is_off:
        # Start the transliterator only when the hiragana flag is off too.
        hiragana_is_off = config.CONVERT_MESSAGE_TO_HIRAGANA is False
        if hiragana_is_off:
            model.startTransliteration()
        config.CONVERT_MESSAGE_TO_ROMAJI = True

    response = {
        "status": 200,
        "result": config.CONVERT_MESSAGE_TO_ROMAJI,
    }
    return response
|
||||
|
||||
@staticmethod
def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict:
    """Switch romaji conversion off and report the resulting flag.

    Nothing is changed unless the romaji flag is currently ``True``.
    In that case ``model.stopTransliteration()`` is called first when
    the hiragana flag is ``False``, and the romaji flag is then set to
    ``False``.

    Returns:
        A dict with ``"status"`` 200 and the romaji flag as ``"result"``.
    """
    if config.CONVERT_MESSAGE_TO_ROMAJI is not True:
        # Flag is not on: report the current state untouched.
        return {"status": 200, "result": config.CONVERT_MESSAGE_TO_ROMAJI}

    # Stop the transliterator only when the hiragana flag is off.
    if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
        model.stopTransliteration()
    config.CONVERT_MESSAGE_TO_ROMAJI = False
    return {"status": 200, "result": config.CONVERT_MESSAGE_TO_ROMAJI}
|
||||
|
||||
@@ -931,12 +935,16 @@ class Controller:
|
||||
@staticmethod
def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict:
    """Switch hiragana conversion on and report the resulting flag.

    Nothing is changed unless the hiragana flag is currently ``False``.
    In that case ``model.startTransliteration()`` is called first when
    the romaji flag is ``False`` as well, and the hiragana flag is then
    set to ``True``.

    Returns:
        A dict with ``"status"`` 200 and the hiragana flag as ``"result"``.
    """
    if config.CONVERT_MESSAGE_TO_HIRAGANA is not False:
        # Flag is not off: report the current state untouched.
        return {"status": 200, "result": config.CONVERT_MESSAGE_TO_HIRAGANA}

    # Start the transliterator only when the romaji flag is off too.
    if config.CONVERT_MESSAGE_TO_ROMAJI is False:
        model.startTransliteration()
    config.CONVERT_MESSAGE_TO_HIRAGANA = True
    return {"status": 200, "result": config.CONVERT_MESSAGE_TO_HIRAGANA}
|
||||
|
||||
@staticmethod
def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict:
    """Switch hiragana conversion off and report the resulting flag.

    Nothing is changed unless the hiragana flag is currently ``True``.
    In that case ``model.stopTransliteration()`` is called first when
    the romaji flag is ``False``, and the hiragana flag is then set to
    ``False``.

    Returns:
        A dict with ``"status"`` 200 and the hiragana flag as ``"result"``.
    """
    hiragana_is_on = config.CONVERT_MESSAGE_TO_HIRAGANA is True
    if hiragana_is_on:
        # Stop the transliterator only when the romaji flag is off.
        romaji_is_off = config.CONVERT_MESSAGE_TO_ROMAJI is False
        if romaji_is_off:
            model.stopTransliteration()
        config.CONVERT_MESSAGE_TO_HIRAGANA = False

    response = {
        "status": 200,
        "result": config.CONVERT_MESSAGE_TO_HIRAGANA,
    }
    return response
|
||||
|
||||
@@ -2466,6 +2474,11 @@ class Controller:
|
||||
self.updateDownloadedWhisperModelWeight()
|
||||
self.updateTranscriptionEngine()
|
||||
|
||||
# set Transliteration status
|
||||
printLog("Set Transliteration")
|
||||
if config.CONVERT_MESSAGE_TO_ROMAJI is True or config.CONVERT_MESSAGE_TO_HIRAGANA is True:
|
||||
model.startTransliteration()
|
||||
|
||||
self.initializationProgress(3)
|
||||
|
||||
# set word filter
|
||||
|
||||
@@ -99,7 +99,7 @@ class Model:
|
||||
self.overlay_image = OverlayImage(config.PATH_LOCAL)
|
||||
self.mic_audio_queue = None
|
||||
self.mic_mute_status = None
|
||||
self.transliterator = Transliterator()
|
||||
self.transliterator = None
|
||||
self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL)
|
||||
self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT)
|
||||
self.websocket_server = None
|
||||
@@ -277,6 +277,14 @@ class Model:
|
||||
self.previous_receive_message = message
|
||||
return repeat_flag
|
||||
|
||||
def startTransliteration(self):
    """Create the Transliterator if none is currently held.

    Does nothing when ``self.transliterator`` is already set, so repeated
    calls do not construct a second instance.
    """
    if self.transliterator is not None:
        return
    self.transliterator = Transliterator()
|
||||
|
||||
def stopTransliteration(self):
    """Drop the held Transliterator reference, if any.

    Does nothing when ``self.transliterator`` is already ``None``.
    """
    if self.transliterator is None:
        return
    self.transliterator = None
|
||||
|
||||
def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str:
|
||||
if hiragana is False and romaji is False:
|
||||
return message
|
||||
@@ -287,6 +295,9 @@ class Model:
|
||||
if romaji:
|
||||
keys_to_keep.add("hepburn")
|
||||
|
||||
if self.transliterator is None:
|
||||
self.startTransliteration()
|
||||
|
||||
data_list = self.transliterator.analyze(message, use_macron=False)
|
||||
filtered_list = [
|
||||
{key: value for key, value in item.items() if key in keys_to_keep}
|
||||
|
||||
@@ -7,7 +7,7 @@ except ImportError:
|
||||
|
||||
class Transliterator:
|
||||
def __init__(self):
    """Build the tokenizer and choose the split mode.

    The tokenizer is created once, from the dictionary selected with
    ``dict_type="full"``. A preceding default-dictionary construction
    whose result was overwritten immediately has been removed: it did
    redundant work and its result was never used.
    """
    self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
    # Split mode C is the one passed to tokenize() by analyze().
    self.mode = tokenizer.Tokenizer.SplitMode.C
|
||||
|
||||
@staticmethod
|
||||
@@ -22,7 +22,7 @@ class Transliterator:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def split_kanji_okurigana(surface: str, reading_kana: str):
|
||||
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
|
||||
"""
|
||||
1語の表層形(surface)と読み(reading_kana)を
|
||||
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
||||
@@ -70,14 +70,12 @@ class Transliterator:
|
||||
if not kana_for_kan and kana_left:
|
||||
kana_for_kan = kana_left[:1]
|
||||
|
||||
result.append(
|
||||
{
|
||||
result.append({
|
||||
"orig": part,
|
||||
"kana": kana_for_kan,
|
||||
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
||||
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True)
|
||||
}
|
||||
)
|
||||
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
|
||||
})
|
||||
kana_left = kana_left[len(kana_for_kan):]
|
||||
else:
|
||||
# 非漢字部分(送り仮名など)
|
||||
@@ -87,14 +85,14 @@ class Transliterator:
|
||||
"orig": part,
|
||||
"kana": kana_for_okuri,
|
||||
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
||||
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True)
|
||||
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
|
||||
}
|
||||
)
|
||||
kana_left = kana_left[len(kana_for_okuri):]
|
||||
|
||||
return result
|
||||
|
||||
def analyze(self, text: str, use_macron: bool = True):
|
||||
def analyze(self, text: str, use_macron: bool = False):
|
||||
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
||||
|
||||
results = []
|
||||
@@ -103,7 +101,7 @@ class Transliterator:
|
||||
reading = t.reading_form()
|
||||
pos = t.part_of_speech()
|
||||
|
||||
if pos and pos[0] in ["記号", "補助記号"]:
|
||||
if pos and pos[0] in ["記号", "補助記号", "空白"]:
|
||||
reading = surface
|
||||
|
||||
if surface == reading:
|
||||
@@ -125,69 +123,9 @@ class Transliterator:
|
||||
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
|
||||
})
|
||||
else:
|
||||
# 複数文字の場合は文字種別で分割
|
||||
i = 0
|
||||
reading_pos = 0
|
||||
|
||||
while i < len(surface):
|
||||
char = surface[i]
|
||||
|
||||
if self.is_kanji(char):
|
||||
# 漢字の場合、連続する漢字をまとめて処理
|
||||
kanji_block = ""
|
||||
while i < len(surface) and self.is_kanji(surface[i]):
|
||||
kanji_block += surface[i]
|
||||
i += 1
|
||||
|
||||
# 漢字ブロックの読みを推定
|
||||
if i < len(surface):
|
||||
# 後に文字がある場合、送り仮名を考慮
|
||||
remaining_chars = len(surface) - i
|
||||
kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
|
||||
else:
|
||||
# 最後の漢字ブロックの場合
|
||||
kanji_reading = reading[reading_pos:]
|
||||
|
||||
# 空の読みを避ける
|
||||
if not kanji_reading and reading_pos < len(reading):
|
||||
kanji_reading = reading[reading_pos:]
|
||||
if not kanji_reading and kanji_block:
|
||||
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
|
||||
kanji_reading = reading[reading_pos:]
|
||||
|
||||
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
|
||||
len_allocated_reading = len(kanji_reading)
|
||||
if reading_pos + len_allocated_reading > len(reading):
|
||||
len_allocated_reading = len(reading) - reading_pos
|
||||
|
||||
results.append({
|
||||
"orig": kanji_block,
|
||||
"kana": kanji_reading,
|
||||
"hira": self.kata_to_hira(kanji_reading),
|
||||
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
||||
})
|
||||
reading_pos += len_allocated_reading
|
||||
else:
|
||||
# 非漢字の場合
|
||||
non_kanji_block = ""
|
||||
while i < len(surface) and not self.is_kanji(surface[i]):
|
||||
non_kanji_block += surface[i]
|
||||
i += 1
|
||||
|
||||
# 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
|
||||
len_block = len(non_kanji_block)
|
||||
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
|
||||
|
||||
# 割り当てられた読みの長さ
|
||||
len_allocated_reading = len(non_kanji_reading)
|
||||
|
||||
results.append({
|
||||
"orig": non_kanji_block,
|
||||
"kana": non_kanji_reading,
|
||||
"hira": self.kata_to_hira(non_kanji_reading),
|
||||
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
||||
})
|
||||
reading_pos += len_allocated_reading
|
||||
# 複数文字の場合は既存のユーティリティで分割
|
||||
parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
|
||||
results.extend(parts)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
Reference in New Issue
Block a user