Merge branch 'transliteration' into develop
This commit is contained in:
@@ -8,7 +8,7 @@ except ImportError:
|
|||||||
class Transliterator:
|
class Transliterator:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.tokenizer_obj = dictionary.Dictionary().create()
|
self.tokenizer_obj = dictionary.Dictionary().create()
|
||||||
self.mode = tokenizer.Tokenizer.SplitMode.A
|
self.mode = tokenizer.Tokenizer.SplitMode.C
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def is_kanji(ch: str) -> bool:
|
def is_kanji(ch: str) -> bool:
|
||||||
@@ -101,7 +101,20 @@ class Transliterator:
|
|||||||
for t in tokens:
|
for t in tokens:
|
||||||
surface = t.surface()
|
surface = t.surface()
|
||||||
reading = t.reading_form()
|
reading = t.reading_form()
|
||||||
|
pos = t.part_of_speech()
|
||||||
|
|
||||||
|
if pos and pos[0] in ["記号", "補助記号"]:
|
||||||
|
reading = surface
|
||||||
|
|
||||||
|
if surface == reading:
|
||||||
|
results.append({
|
||||||
|
"orig": surface,
|
||||||
|
"kana": reading,
|
||||||
|
"hira": surface,
|
||||||
|
"hepburn": surface,
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
# 単純に1文字ずつ処理
|
# 単純に1文字ずつ処理
|
||||||
if len(surface) == 1:
|
if len(surface) == 1:
|
||||||
# 1文字の場合はそのまま
|
# 1文字の場合はそのまま
|
||||||
@@ -134,32 +147,48 @@ class Transliterator:
|
|||||||
else:
|
else:
|
||||||
# 最後の漢字ブロックの場合
|
# 最後の漢字ブロックの場合
|
||||||
kanji_reading = reading[reading_pos:]
|
kanji_reading = reading[reading_pos:]
|
||||||
|
|
||||||
|
# 空の読みを避ける
|
||||||
|
if not kanji_reading and reading_pos < len(reading):
|
||||||
|
kanji_reading = reading[reading_pos:]
|
||||||
|
if not kanji_reading and kanji_block:
|
||||||
|
# 読みが空だが漢字ブロックがある場合、残りの読みを全て割り当てる
|
||||||
|
kanji_reading = reading[reading_pos:]
|
||||||
|
|
||||||
|
# reading_posの更新を正確に行うために、割り当てられた読みの長さをチェック
|
||||||
|
len_allocated_reading = len(kanji_reading)
|
||||||
|
if reading_pos + len_allocated_reading > len(reading):
|
||||||
|
len_allocated_reading = len(reading) - reading_pos
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
"orig": kanji_block,
|
"orig": kanji_block,
|
||||||
"kana": kanji_reading,
|
"kana": kanji_reading,
|
||||||
"hira": self.kata_to_hira(kanji_reading),
|
"hira": self.kata_to_hira(kanji_reading),
|
||||||
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
"hepburn": katakana_to_hepburn(kanji_reading, use_macron=use_macron)
|
||||||
})
|
})
|
||||||
reading_pos += len(kanji_reading)
|
reading_pos += len_allocated_reading
|
||||||
else:
|
else:
|
||||||
# 非漢字の場合
|
# 非漢字の場合
|
||||||
non_kanji_block = ""
|
non_kanji_block = ""
|
||||||
while i < len(surface) and not self.is_kanji(surface[i]):
|
while i < len(surface) and not self.is_kanji(surface[i]):
|
||||||
non_kanji_block += surface[i]
|
non_kanji_block += surface[i]
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
# 非漢字部分の読み(通常は文字数分)
|
# 非漢字部分の読み(通常は文字数分、または残りの読みの分だけ)
|
||||||
non_kanji_reading = reading[reading_pos:reading_pos + len(non_kanji_block)]
|
len_block = len(non_kanji_block)
|
||||||
|
non_kanji_reading = reading[reading_pos:reading_pos + len_block]
|
||||||
|
|
||||||
|
# 割り当てられた読みの長さ
|
||||||
|
len_allocated_reading = len(non_kanji_reading)
|
||||||
|
|
||||||
results.append({
|
results.append({
|
||||||
"orig": non_kanji_block,
|
"orig": non_kanji_block,
|
||||||
"kana": non_kanji_reading,
|
"kana": non_kanji_reading,
|
||||||
"hira": self.kata_to_hira(non_kanji_reading),
|
"hira": self.kata_to_hira(non_kanji_reading),
|
||||||
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
"hepburn": katakana_to_hepburn(non_kanji_reading, use_macron=use_macron)
|
||||||
})
|
})
|
||||||
reading_pos += len(non_kanji_reading)
|
reading_pos += len_allocated_reading
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
# --- テスト ---
|
# --- テスト ---
|
||||||
@@ -180,6 +209,10 @@ if __name__ == "__main__":
|
|||||||
"取り敢えず検索してみる",
|
"取り敢えず検索してみる",
|
||||||
"見知らぬ土地で冒険する",
|
"見知らぬ土地で冒険する",
|
||||||
"彼は優れたエンジニアです",
|
"彼は優れたエンジニアです",
|
||||||
|
" ".join(list("[]<>!@#$%^&*()_+-={}|\;:'\",.<>/?`~")),
|
||||||
|
" ".join(list("「」<>!@#$%^&*()_+-={}|\;:'",./?`~")),
|
||||||
|
" ".join(list("♪♫♬♭♮♯°℃℉№Å®©™✓✔✕✖★☆○●◎◇◆□■△▲▽▼※→←↑↓↔︎↕︎⇄⇅∞∴∵∷≪≫≦≧±×÷≠≈≡⊂⊃⊆⊇⊄⊅∪∩∈∋∅∀∃∠⊥⌒∂∇√∫∬∮∑∏∧∨¬⇒⇔∀∃∠⊥⌒∂∇√∫∬∮∑∏")),
|
||||||
|
" ".join(list("😀😃😄😁😆😅😂🤣😊😇🙂"))
|
||||||
]
|
]
|
||||||
|
|
||||||
transliterator = Transliterator()
|
transliterator = Transliterator()
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ const MessageWithTransliteration = ({ item }) => {
|
|||||||
const hira = token.hira ?? "";
|
const hira = token.hira ?? "";
|
||||||
const hepburn = token.hepburn ?? "";
|
const hepburn = token.hepburn ?? "";
|
||||||
|
|
||||||
|
// Only hovered romaji if it exists. (No ruby cuz 'orig' and 'hira' are same.)
|
||||||
if (hira && hira === orig && hepburn) {
|
if (hira && hira === orig && hepburn) {
|
||||||
return (
|
return (
|
||||||
<span key={key} title={hepburn} className={styles.with_hepburn}>
|
<span key={key} title={hepburn} className={styles.with_hepburn}>
|
||||||
@@ -102,7 +103,8 @@ const MessageWithTransliteration = ({ item }) => {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hira && hira !== orig && hepburn) {
|
// Ruby hiragana and hovered romaji.
|
||||||
|
if (hira && hepburn) {
|
||||||
return (
|
return (
|
||||||
<ruby key={key} title={hepburn} className={styles.with_hepburn}>
|
<ruby key={key} title={hepburn} className={styles.with_hepburn}>
|
||||||
{orig}
|
{orig}
|
||||||
@@ -111,15 +113,20 @@ const MessageWithTransliteration = ({ item }) => {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hepburn && hepburn !== orig) {
|
// Ruby romaji or hiragana.
|
||||||
return (
|
if (hepburn || hira) {
|
||||||
<ruby key={key} className={styles.ruby}>
|
const ruby = hepburn ? hepburn : hira;
|
||||||
{orig}
|
if (ruby !== orig) {
|
||||||
<rt>{hepburn}</rt>
|
return (
|
||||||
</ruby>
|
<ruby key={key} className={styles.ruby}>
|
||||||
);
|
{orig}
|
||||||
|
<rt>{ruby}</rt>
|
||||||
|
</ruby>
|
||||||
|
);
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Nothing. Original only.
|
||||||
return (
|
return (
|
||||||
<span key={key} className={styles.original_only}>
|
<span key={key} className={styles.original_only}>
|
||||||
{orig}
|
{orig}
|
||||||
|
|||||||
Reference in New Issue
Block a user