Merge branch 'transliteration' into develop

This commit is contained in:
misyaguziya
2025-10-02 23:19:19 +09:00
2 changed files with 58 additions and 18 deletions

View File

@@ -8,7 +8,7 @@ except ImportError:
class Transliterator: class Transliterator:
def __init__(self): def __init__(self):
self.tokenizer_obj = dictionary.Dictionary().create() self.tokenizer_obj = dictionary.Dictionary().create()
self.mode = tokenizer.Tokenizer.SplitMode.A self.mode = tokenizer.Tokenizer.SplitMode.C
@staticmethod @staticmethod
def is_kanji(ch: str) -> bool: def is_kanji(ch: str) -> bool:
@@ -101,7 +101,20 @@ class Transliterator:
for t in tokens: for t in tokens:
surface = t.surface() surface = t.surface()
reading = t.reading_form() reading = t.reading_form()
pos = t.part_of_speech()
if pos and pos[0] in ["記号", "補助記号"]:
reading = surface
if surface == reading:
results.append({
"orig": surface,
"kana": reading,
"hira": surface,
"hepburn": surface,
})
continue
# 単純に1文字ずつ処理 # 単純に1文字ずつ処理
if len(surface) == 1: if len(surface) == 1:
# 1文字の場合はそのまま # 1文字の場合はそのまま
@@ -134,32 +147,48 @@ class Transliterator:
else: else:
# 最後の漢字ブロックの場合 # 最後の漢字ブロックの場合
kanji_reading = reading[reading_pos:] kanji_reading = reading[reading_pos:]
# 空の読みを避ける
if not kanji_reading and reading_pos < len(reading):
kanji_reading = reading[reading_pos:]
if not kanji_reading and kanji_block:
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
kanji_reading = reading[reading_pos:]
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
len_allocated_reading = len(kanji_reading)
if reading_pos + len_allocated_reading > len(reading):
len_allocated_reading = len(reading) - reading_pos
results.append({ results.append({
"orig": kanji_block, "orig": kanji_block,
"kana": kanji_reading, "kana": kanji_reading,
"hira": self.kata_to_hira(kanji_reading), "hira": self.kata_to_hira(kanji_reading),
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron) "hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
}) })
reading_pos += len(kanji_reading) reading_pos += len_allocated_reading
else: else:
# 非漢字の場合 # 非漢字の場合
non_kanji_block = "" non_kanji_block = ""
while i < len(surface) and not self.is_kanji(surface[i]): while i < len(surface) and not self.is_kanji(surface[i]):
non_kanji_block += surface[i] non_kanji_block += surface[i]
i += 1 i += 1
# 非漢字部分の読み(通常は文字数分) # 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
non_kanji_reading = reading[reading_pos:reading_pos + len(non_kanji_block)] len_block = len(non_kanji_block)
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
# 割り当てられた読みの長さ
len_allocated_reading = len(non_kanji_reading)
results.append({ results.append({
"orig": non_kanji_block, "orig": non_kanji_block,
"kana": non_kanji_reading, "kana": non_kanji_reading,
"hira": self.kata_to_hira(non_kanji_reading), "hira": self.kata_to_hira(non_kanji_reading),
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron) "hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
}) })
reading_pos += len(non_kanji_reading) reading_pos += len_allocated_reading
return results return results
# --- テスト --- # --- テスト ---
@@ -180,6 +209,10 @@ if __name__ == "__main__":
"取り敢えず検索してみる", "取り敢えず検索してみる",
"見知らぬ土地で冒険する", "見知らぬ土地で冒険する",
"彼は優れたエンジニアです", "彼は優れたエンジニアです",
" ".join(list("[]<>!@#$%^&*()_+-={}|\;:'\",.<>/?`~")),
" ".join(list("「」<>!@#$%^&*()_+-={}|\;:'",./?`~")),
" ".join(list("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏")),
" ".join(list("😀😃😄😁😆😅😂🤣😊😇🙂"))
] ]
transliterator = Transliterator() transliterator = Transliterator()

View File

@@ -94,6 +94,7 @@ const MessageWithTransliteration = ({ item }) => {
const hira = token.hira ?? ""; const hira = token.hira ?? "";
const hepburn = token.hepburn ?? ""; const hepburn = token.hepburn ?? "";
// Show the romaji on hover only. (No ruby, because 'orig' and 'hira' are the same.)
if (hira && hira === orig && hepburn) { if (hira && hira === orig && hepburn) {
return ( return (
<span key={key} title={hepburn} className={styles.with_hepburn}> <span key={key} title={hepburn} className={styles.with_hepburn}>
@@ -102,7 +103,8 @@ const MessageWithTransliteration = ({ item }) => {
); );
} }
if (hira && hira !== orig && hepburn) { // Ruby hiragana and hovered romaji.
if (hira && hepburn) {
return ( return (
<ruby key={key} title={hepburn} className={styles.with_hepburn}> <ruby key={key} title={hepburn} className={styles.with_hepburn}>
{orig} {orig}
@@ -111,15 +113,20 @@ const MessageWithTransliteration = ({ item }) => {
); );
} }
if (hepburn && hepburn !== orig) { // Ruby romaji or hiragana.
return ( if (hepburn || hira) {
<ruby key={key} className={styles.ruby}> const ruby = hepburn ? hepburn : hira;
{orig} if (ruby !== orig) {
<rt>{hepburn}</rt> return (
</ruby> <ruby key={key} className={styles.ruby}>
); {orig}
<rt>{ruby}</rt>
</ruby>
);
};
} }
// Nothing. Original only.
return ( return (
<span key={key} className={styles.original_only}> <span key={key} className={styles.original_only}>
{orig} {orig}