fix(api): correct count_words for CJK characters
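Chinese characters (U+4E00-U+9FFF, the CJK Unified Ideographs block) also return true for is_alphabetic(), so they fell into the alphabetic branch: a contiguous run was counted as a single word (and merged into any adjacent English word) instead of one word per character, and the dedicated CJK branch was never reached. Fix: check the CJK range before is_alphabetic().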
This commit is contained in:
parent
e6c3cacf12
commit
19e5a0be41
@@ -418,15 +418,14 @@ fn count_words(md: &str) -> u32 {
|
||||
let mut in_word = false;
|
||||
|
||||
for c in plain.chars() {
|
||||
if c.is_alphabetic() {
|
||||
if c as u32 >= 0x4E00 && c as u32 <= 0x9FFF {
|
||||
count += 1;
|
||||
in_word = false;
|
||||
} else if c.is_alphabetic() {
|
||||
if !in_word {
|
||||
count += 1;
|
||||
in_word = true;
|
||||
}
|
||||
} else if c as u32 >= 0x4E00 && c as u32 <= 0x9FFF {
|
||||
// Chinese character
|
||||
count += 1;
|
||||
in_word = false;
|
||||
} else {
|
||||
in_word = false;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user