[Update] Transliterator: Enhance transliteration control and improve tokenizer initialization
This commit is contained in:
@@ -915,12 +915,16 @@ class Controller:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict:
|
def setEnableConvertMessageToRomaji(*args, **kwargs) -> dict:
|
||||||
if config.CONVERT_MESSAGE_TO_ROMAJI is False:
|
if config.CONVERT_MESSAGE_TO_ROMAJI is False:
|
||||||
|
if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
|
||||||
|
model.startTransliteration()
|
||||||
config.CONVERT_MESSAGE_TO_ROMAJI = True
|
config.CONVERT_MESSAGE_TO_ROMAJI = True
|
||||||
return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
|
return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict:
|
def setDisableConvertMessageToRomaji(*args, **kwargs) -> dict:
|
||||||
if config.CONVERT_MESSAGE_TO_ROMAJI is True:
|
if config.CONVERT_MESSAGE_TO_ROMAJI is True:
|
||||||
|
if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
|
||||||
|
model.stopTransliteration()
|
||||||
config.CONVERT_MESSAGE_TO_ROMAJI = False
|
config.CONVERT_MESSAGE_TO_ROMAJI = False
|
||||||
return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
|
return {"status":200, "result":config.CONVERT_MESSAGE_TO_ROMAJI}
|
||||||
|
|
||||||
@@ -931,12 +935,16 @@ class Controller:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict:
|
def setEnableConvertMessageToHiragana(*args, **kwargs) -> dict:
|
||||||
if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
|
if config.CONVERT_MESSAGE_TO_HIRAGANA is False:
|
||||||
|
if config.CONVERT_MESSAGE_TO_ROMAJI is False:
|
||||||
|
model.startTransliteration()
|
||||||
config.CONVERT_MESSAGE_TO_HIRAGANA = True
|
config.CONVERT_MESSAGE_TO_HIRAGANA = True
|
||||||
return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
|
return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict:
|
def setDisableConvertMessageToHiragana(*args, **kwargs) -> dict:
|
||||||
if config.CONVERT_MESSAGE_TO_HIRAGANA is True:
|
if config.CONVERT_MESSAGE_TO_HIRAGANA is True:
|
||||||
|
if config.CONVERT_MESSAGE_TO_ROMAJI is False:
|
||||||
|
model.stopTransliteration()
|
||||||
config.CONVERT_MESSAGE_TO_HIRAGANA = False
|
config.CONVERT_MESSAGE_TO_HIRAGANA = False
|
||||||
return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
|
return {"status":200, "result":config.CONVERT_MESSAGE_TO_HIRAGANA}
|
||||||
|
|
||||||
@@ -2466,6 +2474,11 @@ class Controller:
|
|||||||
self.updateDownloadedWhisperModelWeight()
|
self.updateDownloadedWhisperModelWeight()
|
||||||
self.updateTranscriptionEngine()
|
self.updateTranscriptionEngine()
|
||||||
|
|
||||||
|
# set Transliteration status
|
||||||
|
printLog("Set Transliteration")
|
||||||
|
if config.CONVERT_MESSAGE_TO_ROMAJI is True or config.CONVERT_MESSAGE_TO_HIRAGANA is True:
|
||||||
|
model.startTransliteration()
|
||||||
|
|
||||||
self.initializationProgress(3)
|
self.initializationProgress(3)
|
||||||
|
|
||||||
# set word filter
|
# set word filter
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ class Model:
|
|||||||
self.overlay_image = OverlayImage(config.PATH_LOCAL)
|
self.overlay_image = OverlayImage(config.PATH_LOCAL)
|
||||||
self.mic_audio_queue = None
|
self.mic_audio_queue = None
|
||||||
self.mic_mute_status = None
|
self.mic_mute_status = None
|
||||||
self.transliterator = Transliterator()
|
self.transliterator = None
|
||||||
self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL)
|
self.watchdog = Watchdog(config.WATCHDOG_TIMEOUT, config.WATCHDOG_INTERVAL)
|
||||||
self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT)
|
self.osc_handler = OSCHandler(config.OSC_IP_ADDRESS, config.OSC_PORT)
|
||||||
self.websocket_server = None
|
self.websocket_server = None
|
||||||
@@ -277,6 +277,14 @@ class Model:
|
|||||||
self.previous_receive_message = message
|
self.previous_receive_message = message
|
||||||
return repeat_flag
|
return repeat_flag
|
||||||
|
|
||||||
|
def startTransliteration(self):
|
||||||
|
if self.transliterator is None:
|
||||||
|
self.transliterator = Transliterator()
|
||||||
|
|
||||||
|
def stopTransliteration(self):
|
||||||
|
if self.transliterator is not None:
|
||||||
|
self.transliterator = None
|
||||||
|
|
||||||
def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str:
|
def convertMessageToTransliteration(self, message: str, hiragana: bool=True, romaji: bool=True) -> str:
|
||||||
if hiragana is False and romaji is False:
|
if hiragana is False and romaji is False:
|
||||||
return message
|
return message
|
||||||
@@ -287,6 +295,9 @@ class Model:
|
|||||||
if romaji:
|
if romaji:
|
||||||
keys_to_keep.add("hepburn")
|
keys_to_keep.add("hepburn")
|
||||||
|
|
||||||
|
if self.transliterator is None:
|
||||||
|
self.startTransliteration()
|
||||||
|
|
||||||
data_list = self.transliterator.analyze(message, use_macron=False)
|
data_list = self.transliterator.analyze(message, use_macron=False)
|
||||||
filtered_list = [
|
filtered_list = [
|
||||||
{key: value for key, value in item.items() if key in keys_to_keep}
|
{key: value for key, value in item.items() if key in keys_to_keep}
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ except ImportError:
|
|||||||
|
|
||||||
class Transliterator:
|
class Transliterator:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.tokenizer_obj = dictionary.Dictionary().create()
|
self.tokenizer_obj = dictionary.Dictionary(dict_type="full").create()
|
||||||
self.mode = tokenizer.Tokenizer.SplitMode.C
|
self.mode = tokenizer.Tokenizer.SplitMode.C
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -22,7 +22,7 @@ class Transliterator:
|
|||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def split_kanji_okurigana(surface: str, reading_kana: str):
|
def split_kanji_okurigana(surface: str, reading_kana: str, use_macron: bool = True):
|
||||||
"""
|
"""
|
||||||
1語の表層形(surface)と読み(reading_kana)を
|
1語の表層形(surface)と読み(reading_kana)を
|
||||||
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
[ {"orig":..., "kana":..., "hira":..., "hepburn":...}, ... ] に分割
|
||||||
@@ -69,15 +69,13 @@ class Transliterator:
|
|||||||
# 空の読みを避ける
|
# 空の読みを避ける
|
||||||
if not kana_for_kan and kana_left:
|
if not kana_for_kan and kana_left:
|
||||||
kana_for_kan = kana_left[:1]
|
kana_for_kan = kana_left[:1]
|
||||||
|
|
||||||
result.append(
|
result.append({
|
||||||
{
|
"orig": part,
|
||||||
"orig": part,
|
"kana": kana_for_kan,
|
||||||
"kana": kana_for_kan,
|
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
||||||
"hira": Transliterator.kata_to_hira(kana_for_kan),
|
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=use_macron)
|
||||||
"hepburn": katakana_to_hepburn(kana_for_kan, use_macron=True)
|
})
|
||||||
}
|
|
||||||
)
|
|
||||||
kana_left = kana_left[len(kana_for_kan):]
|
kana_left = kana_left[len(kana_for_kan):]
|
||||||
else:
|
else:
|
||||||
# 非漢字部分(送り仮名など)
|
# 非漢字部分(送り仮名など)
|
||||||
@@ -87,14 +85,14 @@ class Transliterator:
|
|||||||
"orig": part,
|
"orig": part,
|
||||||
"kana": kana_for_okuri,
|
"kana": kana_for_okuri,
|
||||||
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
"hira": Transliterator.kata_to_hira(kana_for_okuri),
|
||||||
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=True)
|
"hepburn": katakana_to_hepburn(kana_for_okuri, use_macron=use_macron)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
kana_left = kana_left[len(kana_for_okuri):]
|
kana_left = kana_left[len(kana_for_okuri):]
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def analyze(self, text: str, use_macron: bool = True):
|
def analyze(self, text: str, use_macron: bool = False):
|
||||||
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
tokens = self.tokenizer_obj.tokenize(text, self.mode)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
@@ -103,7 +101,7 @@ class Transliterator:
|
|||||||
reading = t.reading_form()
|
reading = t.reading_form()
|
||||||
pos = t.part_of_speech()
|
pos = t.part_of_speech()
|
||||||
|
|
||||||
if pos and pos[0] in ["記号", "補助記号"]:
|
if pos and pos[0] in ["記号", "補助記号", "空白"]:
|
||||||
reading = surface
|
reading = surface
|
||||||
|
|
||||||
if surface == reading:
|
if surface == reading:
|
||||||
@@ -125,69 +123,9 @@ class Transliterator:
|
|||||||
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
|
"hepburn": katakana_to_hepburn(reading, use_macron=use_macron)
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
# 複数文字の場合は文字種別で分割
|
# 複数文字の場合は既存のユーティリティで分割
|
||||||
i = 0
|
parts = self.split_kanji_okurigana(surface, reading, use_macron=use_macron)
|
||||||
reading_pos = 0
|
results.extend(parts)
|
||||||
|
|
||||||
while i < len(surface):
|
|
||||||
char = surface[i]
|
|
||||||
|
|
||||||
if self.is_kanji(char):
|
|
||||||
# 漢字の場合、連続する漢字をまとめて処理
|
|
||||||
kanji_block = ""
|
|
||||||
while i < len(surface) and self.is_kanji(surface[i]):
|
|
||||||
kanji_block += surface[i]
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# 漢字ブロックの読みを推定
|
|
||||||
if i < len(surface):
|
|
||||||
# 後に文字がある場合、送り仮名を考慮
|
|
||||||
remaining_chars = len(surface) - i
|
|
||||||
kanji_reading = reading[reading_pos:-remaining_chars] if remaining_chars > 0 else reading[reading_pos:]
|
|
||||||
else:
|
|
||||||
# 最後の漢字ブロックの場合
|
|
||||||
kanji_reading = reading[reading_pos:]
|
|
||||||
|
|
||||||
# 空の読みを避ける
|
|
||||||
if not kanji_reading and reading_pos < len(reading):
|
|
||||||
kanji_reading = reading[reading_pos:]
|
|
||||||
if not kanji_reading and kanji_block:
|
|
||||||
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
|
|
||||||
kanji_reading = reading[reading_pos:]
|
|
||||||
|
|
||||||
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
|
|
||||||
len_allocated_reading = len(kanji_reading)
|
|
||||||
if reading_pos + len_allocated_reading > len(reading):
|
|
||||||
len_allocated_reading = len(reading) - reading_pos
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
"orig": kanji_block,
|
|
||||||
"kana": kanji_reading,
|
|
||||||
"hira": self.kata_to_hira(kanji_reading),
|
|
||||||
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
|
||||||
})
|
|
||||||
reading_pos += len_allocated_reading
|
|
||||||
else:
|
|
||||||
# 非漢字の場合
|
|
||||||
non_kanji_block = ""
|
|
||||||
while i < len(surface) and not self.is_kanji(surface[i]):
|
|
||||||
non_kanji_block += surface[i]
|
|
||||||
i += 1
|
|
||||||
|
|
||||||
# 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
|
|
||||||
len_block = len(non_kanji_block)
|
|
||||||
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
|
|
||||||
|
|
||||||
# 割り当てられた読みの長さ
|
|
||||||
len_allocated_reading = len(non_kanji_reading)
|
|
||||||
|
|
||||||
results.append({
|
|
||||||
"orig": non_kanji_block,
|
|
||||||
"kana": non_kanji_reading,
|
|
||||||
"hira": self.kata_to_hira(non_kanji_reading),
|
|
||||||
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
|
||||||
})
|
|
||||||
reading_pos += len_allocated_reading
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user