|
|
|
|
|
|
|
|
|
|
|
|
|
|
mod normalizer; |
|
|
mod phoneme; |
|
|
mod tokenizer; |
|
|
|
|
|
pub use normalizer::{Language, TextNormalizer}; |
|
|
pub use phoneme::{g2p_english, pinyin_to_phones}; |
|
|
pub use tokenizer::{TextTokenizer, TokenizerConfig}; |
|
|
|
|
|
use crate::Result; |
|
|
|
|
|
|
|
|
pub fn process_text(text: &str, tokenizer: &TextTokenizer) -> Result<Vec<i64>> { |
|
|
|
|
|
let normalizer = TextNormalizer::new(); |
|
|
let normalized = normalizer.normalize(text)?; |
|
|
|
|
|
|
|
|
let tokens = tokenizer.encode(&normalized)?; |
|
|
|
|
|
Ok(tokens) |
|
|
} |
|
|
|
|
|
|
|
|
pub fn detect_language(text: &str) -> Language { |
|
|
let mut chinese_count = 0; |
|
|
let mut english_count = 0; |
|
|
|
|
|
for ch in text.chars() { |
|
|
if is_chinese_char(ch) { |
|
|
chinese_count += 1; |
|
|
} else if ch.is_ascii_alphabetic() { |
|
|
english_count += 1; |
|
|
} |
|
|
} |
|
|
|
|
|
if chinese_count > 0 && english_count == 0 { |
|
|
Language::Chinese |
|
|
} else if english_count > 0 && chinese_count == 0 { |
|
|
Language::English |
|
|
} else if chinese_count > 0 && english_count > 0 { |
|
|
Language::Mixed |
|
|
} else { |
|
|
|
|
|
Language::English |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
/// Returns `true` if `ch` is a CJK (Han) ideograph.
///
/// The check is purely by Unicode code point and covers the CJK Unified
/// Ideographs block, its Extensions A through I, and the two CJK
/// Compatibility Ideographs blocks. Han ideographs are shared between
/// Chinese, Japanese and Korean text, so this cannot tell those languages
/// apart; kana, hangul, CJK punctuation and full-width forms are not matched.
pub fn is_chinese_char(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x4E00..=0x9FFF         // CJK Unified Ideographs
        | 0x3400..=0x4DBF       // Extension A
        | 0x20000..=0x2A6DF     // Extension B
        | 0x2A700..=0x2B73F     // Extension C
        | 0x2B740..=0x2B81F     // Extension D
        | 0x2B820..=0x2CEAF     // Extension E
        | 0x2CEB0..=0x2EBEF     // Extension F
        | 0x2EBF0..=0x2EE5F     // Extension I
        | 0x30000..=0x3134F     // Extension G
        | 0x31350..=0x323AF     // Extension H
        | 0xF900..=0xFAFF       // CJK Compatibility Ideographs
        | 0x2F800..=0x2FA1F     // CJK Compatibility Ideographs Supplement
    )
}
|
|
|
|
|
|
|
|
pub fn contains_chinese(text: &str) -> bool { |
|
|
text.chars().any(is_chinese_char) |
|
|
} |
|
|
|
|
|
|
|
|
/// Returns `true` if every byte of `text` is in the ASCII range.
///
/// The empty string is considered ASCII-only.
pub fn is_ascii_only(text: &str) -> bool {
    // In UTF-8 every non-ASCII character is encoded using only bytes >= 0x80,
    // so a byte-wise check is equivalent to a character-wise one.
    text.bytes().all(|byte| byte.is_ascii())
}
|
|
|
|
|
|
|
|
pub fn split_by_language(text: &str) -> Vec<(String, Language)> { |
|
|
let mut segments = Vec::new(); |
|
|
let mut current_segment = String::new(); |
|
|
let mut current_lang = None; |
|
|
|
|
|
for ch in text.chars() { |
|
|
let char_lang = if is_chinese_char(ch) { |
|
|
Some(Language::Chinese) |
|
|
} else if ch.is_ascii_alphabetic() { |
|
|
Some(Language::English) |
|
|
} else { |
|
|
None |
|
|
}; |
|
|
|
|
|
match (current_lang, char_lang) { |
|
|
(None, Some(lang)) => { |
|
|
current_lang = Some(lang); |
|
|
current_segment.push(ch); |
|
|
} |
|
|
(Some(curr), Some(lang)) if curr == lang => { |
|
|
current_segment.push(ch); |
|
|
} |
|
|
(Some(curr), Some(lang)) if curr != lang => { |
|
|
if !current_segment.trim().is_empty() { |
|
|
segments.push((current_segment.clone(), curr)); |
|
|
} |
|
|
current_segment = ch.to_string(); |
|
|
current_lang = Some(lang); |
|
|
} |
|
|
(Some(_), None) => { |
|
|
|
|
|
current_segment.push(ch); |
|
|
} |
|
|
(None, None) => { |
|
|
|
|
|
if !current_segment.is_empty() { |
|
|
current_segment.push(ch); |
|
|
} |
|
|
} |
|
|
_ => {} |
|
|
} |
|
|
} |
|
|
|
|
|
if !current_segment.trim().is_empty() { |
|
|
if let Some(lang) = current_lang { |
|
|
segments.push((current_segment, lang)); |
|
|
} |
|
|
} |
|
|
|
|
|
segments |
|
|
} |
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Han ideographs are accepted; ASCII letters and digits are rejected.
    #[test]
    fn test_is_chinese_char() {
        for ch in "中文".chars() {
            assert!(is_chinese_char(ch), "expected {:?} to be Chinese", ch);
        }
        for ch in "a1".chars() {
            assert!(!is_chinese_char(ch), "expected {:?} not to be Chinese", ch);
        }
    }

    /// Pure English, pure Chinese and mixed inputs map to the matching variant.
    #[test]
    fn test_detect_language() {
        let cases = [
            ("Hello world", Language::English),
            ("你好世界", Language::Chinese),
            ("Hello 世界", Language::Mixed),
        ];
        for (input, expected) in &cases {
            assert_eq!(detect_language(input), *expected, "input: {:?}", input);
        }
    }

    /// Any Han ideograph in the input is enough; pure ASCII text has none.
    #[test]
    fn test_contains_chinese() {
        let cases = [("Hello 世界", true), ("你好", true), ("Hello world", false)];
        for (input, expected) in &cases {
            assert_eq!(contains_chinese(input), *expected, "input: {:?}", input);
        }
    }
}
|
|
|