# LexiMind — tests/test_data/test_preprocessing.py
# Commit d18b34d (OliverPerrin): Consolidate dependencies, improve testing, and add CI/CD.
import unittest
from src.data.preprocessing import TextPreprocessor
from src.data.tokenization import Tokenizer, TokenizerConfig
class _StubTokenizer(Tokenizer):
    """Lightweight Tokenizer stand-in used to test TextPreprocessor wiring.

    Only ``config`` is populated; encoding is intentionally unsupported.
    """

    def __init__(self, max_length: int) -> None:
        # Deliberately skip super().__init__ so no huggingface tokenizer
        # is loaded — the tests only inspect ``config.max_length``.
        stub_config = TokenizerConfig(max_length=max_length)
        self.config = stub_config

    def batch_encode(self, texts, *, max_length=None):
        # Encoding is out of scope for these tests.
        raise NotImplementedError
class TextPreprocessorTests(unittest.TestCase):
    """Check how TextPreprocessor reconciles its max_length with the tokenizer's."""

    def test_matching_max_length_leaves_tokenizer_unchanged(self) -> None:
        # When both limits agree, construction must not mutate the tokenizer.
        stub = _StubTokenizer(max_length=128)
        TextPreprocessor(tokenizer=stub, max_length=128)
        self.assertEqual(stub.config.max_length, 128)

    def test_conflicting_max_length_raises_value_error(self) -> None:
        # A mismatch between the two limits must be rejected at construction.
        stub = _StubTokenizer(max_length=256)
        with self.assertRaises(ValueError):
            TextPreprocessor(tokenizer=stub, max_length=128)
# Allow running this test module directly (e.g. `python test_preprocessing.py`).
if __name__ == "__main__":
    unittest.main()