File size: 3,135 Bytes
2bbfbb7
 
0393dfa
2bbfbb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
//! Text normalization for TTS

use crate::Result;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;

/// Language classification of an input text.
///
/// NOTE(review): no code in this chunk matches on these variants —
/// presumably language-specific normalization dispatches on this
/// elsewhere in the crate; confirm before extending.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    /// Predominantly Chinese text.
    Chinese,
    /// Predominantly English/Latin text.
    English,
    /// Text mixing Chinese and English.
    Mixed,
}

/// Normalizes raw text (punctuation, whitespace, sentence splitting)
/// before it is handed to the TTS pipeline.
#[derive(Debug)]
pub struct TextNormalizer {
    // Full-width (CJK) punctuation character -> ASCII equivalent,
    // applied character-by-character in `normalize_punctuation`.
    punct_map: HashMap<char, char>,
    // Cardinal value -> English word (0..=10 plus the tens 20 and 30).
    // NOTE(review): populated in `new` but not read by any method in this
    // file — presumably intended for digit expansion; confirm with callers
    // before removing.
    number_words: HashMap<u64, &'static str>,
}

lazy_static! {
    // Matches a run of ASCII digits.
    // NOTE(review): not referenced by any method in this file — looks
    // intended for number-to-word normalization; verify before deleting.
    static ref NUMBER_REGEX: Regex = Regex::new(r"\d+").unwrap();
    // Matches a run of any whitespace; used by `normalize_whitespace`
    // to collapse runs to a single space.
    static ref WHITESPACE_REGEX: Regex = Regex::new(r"\s+").unwrap();
}

impl TextNormalizer {
    /// Builds a normalizer with the default punctuation and number tables.
    pub fn new() -> Self {
        // Full-width (CJK) punctuation and curly quotes -> ASCII.
        const PUNCT_PAIRS: [(char, char); 10] = [
            ('\u{FF0C}', ','),
            ('\u{3002}', '.'),
            ('\u{FF01}', '!'),
            ('\u{FF1F}', '?'),
            ('\u{FF1B}', ';'),
            ('\u{FF1A}', ':'),
            ('\u{201C}', '\u{0022}'),
            ('\u{201D}', '\u{0022}'),
            ('\u{2018}', '\''),
            ('\u{2019}', '\''),
        ];
        let punct_map: HashMap<char, char> = PUNCT_PAIRS.iter().copied().collect();

        // English words for 0..=10 and the tens 20/30.
        const NUMBER_PAIRS: [(u64, &str); 13] = [
            (0, "zero"),
            (1, "one"),
            (2, "two"),
            (3, "three"),
            (4, "four"),
            (5, "five"),
            (6, "six"),
            (7, "seven"),
            (8, "eight"),
            (9, "nine"),
            (10, "ten"),
            (20, "twenty"),
            (30, "thirty"),
        ];
        let number_words: HashMap<u64, &'static str> = NUMBER_PAIRS.iter().copied().collect();

        Self { punct_map, number_words }
    }

    /// Runs the full pipeline: punctuation mapping, then whitespace cleanup.
    pub fn normalize(&self, text: &str) -> Result<String> {
        let ascii_punct = self.normalize_punctuation(text);
        Ok(self.normalize_whitespace(&ascii_punct))
    }

    /// Replaces each full-width punctuation character with its ASCII
    /// equivalent; every other character passes through unchanged.
    pub fn normalize_punctuation(&self, text: &str) -> String {
        let mut out = String::with_capacity(text.len());
        for ch in text.chars() {
            out.push(self.punct_map.get(&ch).copied().unwrap_or(ch));
        }
        out
    }

    /// Collapses every whitespace run to a single space and trims both ends.
    pub fn normalize_whitespace(&self, text: &str) -> String {
        WHITESPACE_REGEX.replace_all(text, " ").trim().to_owned()
    }

    /// Splits `text` on sentence-final punctuation ('.', '!', '?'),
    /// keeping each terminator with its sentence. A trailing fragment
    /// without a terminator becomes the final sentence. Empty/whitespace
    /// segments are dropped.
    pub fn split_sentences(&self, text: &str) -> Vec<String> {
        let mut sentences = Vec::new();
        let mut buf = String::new();

        for ch in text.chars() {
            buf.push(ch);
            if matches!(ch, '.' | '!' | '?') {
                let sentence = buf.trim();
                if !sentence.is_empty() {
                    sentences.push(sentence.to_owned());
                }
                buf.clear();
            }
        }

        let tail = buf.trim();
        if !tail.is_empty() {
            sentences.push(tail.to_owned());
        }

        sentences
    }
}

// `Default` simply delegates to `new`, so the normalizer works with
// `TextNormalizer::default()` and struct-update syntax.
impl Default for TextNormalizer {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Pin the exact result, not just the length: the original
    // `assert_eq!(r.len(), 3)` would also pass for wrong 3-byte
    // outputs such as "a  " or " ab".
    #[test]
    fn collapses_and_trims_whitespace() {
        let n = TextNormalizer::new();
        assert_eq!(n.normalize_whitespace("  a  b  "), "a b");
        // Empty and all-whitespace input normalize to the empty string.
        assert_eq!(n.normalize_whitespace(""), "");
        assert_eq!(n.normalize_whitespace(" \t\n "), "");
    }

    #[test]
    fn maps_fullwidth_punctuation_to_ascii() {
        let n = TextNormalizer::new();
        assert_eq!(n.normalize_punctuation("\u{FF0C}\u{3002}\u{FF01}\u{FF1F}"), ",.!?");
        // Characters outside the map pass through unchanged.
        assert_eq!(n.normalize_punctuation("abc"), "abc");
    }

    #[test]
    fn splits_sentences_on_terminators() {
        let n = TextNormalizer::new();
        assert_eq!(
            n.split_sentences("Hi. How are you? Fine"),
            vec!["Hi.", "How are you?", "Fine"]
        );
        // No terminators: whole input is one sentence.
        assert_eq!(n.split_sentences("no end"), vec!["no end"]);
        assert!(n.split_sentences("").is_empty());
    }
}