use crate::Result;

use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;

/// Coarse classification of the language of a piece of input text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    Chinese,
    English,
    Mixed,
}

/// Normalizes raw text before further processing.
#[derive(Debug)]
pub struct TextNormalizer {
    /// Full-width (CJK) punctuation mapped to its ASCII equivalent.
    punct_map: HashMap<char, char>,
    /// English words for a small set of integers.
    number_words: HashMap<u64, &'static str>,
}

lazy_static! {
    static ref NUMBER_REGEX: Regex = Regex::new(r"\d+").unwrap();
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}

impl TextNormalizer {
    pub fn new() -> Self {
        // Full-width (CJK) punctuation -> ASCII equivalents.
        let mut punct_map = HashMap::new();
        punct_map.insert('\u{FF0C}', ','); // ，
        punct_map.insert('\u{3002}', '.'); // 。
        punct_map.insert('\u{FF01}', '!'); // ！
        punct_map.insert('\u{FF1F}', '?'); // ？
        punct_map.insert('\u{FF1B}', ';'); // ；
        punct_map.insert('\u{FF1A}', ':'); // ：
        punct_map.insert('\u{201C}', '\u{0022}'); // left double quotation mark
        punct_map.insert('\u{201D}', '\u{0022}'); // right double quotation mark
        punct_map.insert('\u{2018}', '\''); // left single quotation mark
        punct_map.insert('\u{2019}', '\''); // right single quotation mark

        // Spelled-out forms for a small set of integers.
        let mut number_words = HashMap::new();
        number_words.insert(0, "zero");
        number_words.insert(1, "one");
        number_words.insert(2, "two");
        number_words.insert(3, "three");
        number_words.insert(4, "four");
        number_words.insert(5, "five");
        number_words.insert(6, "six");
        number_words.insert(7, "seven");
        number_words.insert(8, "eight");
        number_words.insert(9, "nine");
        number_words.insert(10, "ten");
        number_words.insert(20, "twenty");
        number_words.insert(30, "thirty");

        Self { punct_map, number_words }
    }

    /// Runs the full normalization pipeline: punctuation, then whitespace.
    pub fn normalize(&self, text: &str) -> Result<String> {
        let mut result = self.normalize_punctuation(text);
        result = self.normalize_whitespace(&result);
        Ok(result)
    }

    /// Replaces full-width punctuation with its ASCII equivalent; all other
    /// characters pass through unchanged.
    pub fn normalize_punctuation(&self, text: &str) -> String {
        text.chars()
            .map(|c| *self.punct_map.get(&c).unwrap_or(&c))
            .collect()
    }

    /// Collapses runs of whitespace into single spaces and trims both ends.
    pub fn normalize_whitespace(&self, text: &str) -> String {
        WHITESPACE_REGEX.replace_all(text, " ").trim().to_string()
    }
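
    // `NUMBER_REGEX` and `number_words` are declared above but not yet used by
    // `normalize`. The method below is a minimal sketch of how they could be
    // wired together; the name `normalize_numbers` and this behaviour are
    // assumptions, not part of the original API. Digit runs with an exact
    // entry in `number_words` are spelled out; anything else is left as-is.
    pub fn normalize_numbers(&self, text: &str) -> String {
        NUMBER_REGEX
            .replace_all(text, |caps: &regex::Captures| {
                let digits = &caps[0];
                digits
                    .parse::<u64>()
                    .ok()
                    .and_then(|n| self.number_words.get(&n))
                    .map(|word| (*word).to_string())
                    .unwrap_or_else(|| digits.to_string())
            })
            .to_string()
    }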

    /// Splits normalized text into sentences on ASCII sentence-final
    /// punctuation ('.', '!', '?').
    pub fn split_sentences(&self, text: &str) -> Vec<String> {
        let mut sentences = Vec::new();
        let mut current = String::new();

        for ch in text.chars() {
            current.push(ch);
            if ch == '.' || ch == '!' || ch == '?' {
                let trimmed = current.trim().to_string();
                if !trimmed.is_empty() {
                    sentences.push(trimmed);
                }
                current.clear();
            }
        }

        // Keep any trailing text that does not end with sentence punctuation.
        let trimmed = current.trim().to_string();
        if !trimmed.is_empty() {
            sentences.push(trimmed);
        }

        sentences
    }
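
    // The `Language` enum above is not referenced elsewhere in this file. The
    // helper below is a minimal sketch of one way it could be used; the name
    // `detect_language` and the CJK-range heuristic are assumptions, not part
    // of the original API. Text containing CJK ideographs and ASCII letters is
    // classified as Mixed; text with neither defaults to English.
    pub fn detect_language(&self, text: &str) -> Language {
        let has_chinese = text
            .chars()
            .any(|c| ('\u{4E00}'..='\u{9FFF}').contains(&c));
        let has_english = text.chars().any(|c| c.is_ascii_alphabetic());
        match (has_chinese, has_english) {
            (true, true) => Language::Mixed,
            (true, false) => Language::Chinese,
            _ => Language::English,
        }
    }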
}

impl Default for TextNormalizer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalizer() {
        let n = TextNormalizer::new();
        let r = n.normalize_whitespace("  a  b  ");
        assert_eq!(r, "a b");
    }
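
    // Additional example-style test (not in the original file) illustrating
    // the punctuation mapping and sentence splitting above.
    #[test]
    fn test_punctuation_and_sentences() {
        let n = TextNormalizer::new();
        // Full-width comma and full stop are mapped to ASCII.
        assert_eq!(n.normalize_punctuation("你好，世界。"), "你好,世界.");
        // Sentences are split on ASCII '.', '!' and '?'.
        let s = n.split_sentences("Hello world. How are you? Fine!");
        assert_eq!(s, vec!["Hello world.", "How are you?", "Fine!"]);
    }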
}