# LexiMind — tests/test_data/test_preprocessing.py
# Commit d18b34d (OliverPerrin): Consolidate dependencies, improve testing, and add CI/CD.
import unittest
from src.data.preprocessing import TextPreprocessor
from src.data.tokenization import Tokenizer, TokenizerConfig
class _StubTokenizer(Tokenizer):
    """Lightweight Tokenizer stand-in used to test TextPreprocessor wiring.

    Only ``config`` is populated; encoding is intentionally unsupported.
    """

    def __init__(self, max_length: int) -> None:
        # Deliberately skip super().__init__ so no huggingface tokenizer
        # is loaded — the tests only inspect ``config.max_length``.
        stub_config = TokenizerConfig(max_length=max_length)
        self.config = stub_config

    def batch_encode(self, texts, *, max_length=None):
        # Encoding is out of scope for these tests.
        raise NotImplementedError
class TextPreprocessorTests(unittest.TestCase):
    """Check how TextPreprocessor reconciles its max_length with the tokenizer's."""

    def test_matching_max_length_leaves_tokenizer_unchanged(self) -> None:
        # When both limits agree, construction must not mutate the tokenizer.
        stub = _StubTokenizer(max_length=128)
        TextPreprocessor(tokenizer=stub, max_length=128)
        self.assertEqual(stub.config.max_length, 128)

    def test_conflicting_max_length_raises_value_error(self) -> None:
        # A mismatch between the two limits must be rejected at construction.
        stub = _StubTokenizer(max_length=256)
        with self.assertRaises(ValueError):
            TextPreprocessor(tokenizer=stub, max_length=128)
# Allow running this test module directly (e.g. `python test_preprocessing.py`).
if __name__ == "__main__":
    unittest.main()