Spaces:
Running
Running
| import unittest | |
| from src.data.preprocessing import TextPreprocessor | |
| from src.data.tokenization import Tokenizer, TokenizerConfig | |
class _StubTokenizer(Tokenizer):
    """Minimal ``Tokenizer`` stand-in that only carries a ``config``.

    The constructor stores a ``TokenizerConfig`` built from ``max_length``
    and nothing else; encoding is not implemented.
    """

    def __init__(self, max_length: int) -> None:
        # super().__init__ is intentionally not called, which avoids the
        # expensive huggingface initialisation.
        stub_config = TokenizerConfig(max_length=max_length)
        self.config = stub_config

    def batch_encode(self, texts, *, max_length=None):
        """Always raise ``NotImplementedError``; the stub cannot encode."""
        raise NotImplementedError()
class TextPreprocessorTests(unittest.TestCase):
    """Construct ``TextPreprocessor`` with a ``max_length`` that either
    matches or conflicts with the tokenizer's configured ``max_length``."""

    def test_matching_max_length_leaves_tokenizer_unchanged(self) -> None:
        # Tokenizer and preprocessor both use 128: construction must not
        # raise, and the tokenizer config must still read 128 afterwards.
        stub = _StubTokenizer(max_length=128)
        TextPreprocessor(tokenizer=stub, max_length=128)
        configured_length = stub.config.max_length
        self.assertEqual(configured_length, 128)

    def test_conflicting_max_length_raises_value_error(self) -> None:
        # Tokenizer is configured for 256 while the preprocessor is asked
        # for 128: construction must raise ValueError.
        stub = _StubTokenizer(max_length=256)
        self.assertRaises(
            ValueError, TextPreprocessor, tokenizer=stub, max_length=128
        )
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()