# scripts/export_tokenizer.py
"""
Tokenizer export script for LexiMind.
Saves the FLAN-T5 tokenizer to the artifacts directory for reproducible
inference without requiring network access.
Author: Oliver Perrin
Date: December 2025
"""
from __future__ import annotations

import argparse
from pathlib import Path

from transformers import AutoTokenizer


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Export tokenizer to artifacts directory")
    parser.add_argument(
        "--model-name",
        default="google/flan-t5-base",
        help="HuggingFace model name for the tokenizer.",
    )
    parser.add_argument(
        "--output-dir",
        default="artifacts/hf_tokenizer",
        help="Output directory for tokenizer files.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading tokenizer from {args.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    print(f"Saving tokenizer to {output_dir}...")
    tokenizer.save_pretrained(str(output_dir))
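
    # For google/flan-t5-base this typically writes tokenizer_config.json,
    # special_tokens_map.json, spiece.model, and tokenizer.json, though the
    # exact set varies with the transformers version; the loop at the end
    # prints whatever was actually created.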

    # Print tokenizer info. T5-family tokenizers define no BOS token, so
    # bos_token and bos_token_id are None for FLAN-T5.
    print("\nTokenizer saved successfully!")
    print(f"  Vocab size: {tokenizer.vocab_size}")
    print(f"  Pad token: {tokenizer.pad_token} (id={tokenizer.pad_token_id})")
    print(f"  EOS token: {tokenizer.eos_token} (id={tokenizer.eos_token_id})")
    print(f"  BOS token: {tokenizer.bos_token} (id={tokenizer.bos_token_id})")

    print("\nFiles created:")
    for file in sorted(output_dir.iterdir()):
        print(f"  - {file.name}")


if __name__ == "__main__":
    main()
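

# A minimal sketch of the intended offline reuse (assumes the export above has
# already populated artifacts/hf_tokenizer; local_files_only=True tells
# transformers to load from disk without touching the network):
#
#   from transformers import AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("artifacts/hf_tokenizer", local_files_only=True)
#   print(tok("Hello, LexiMind!").input_ids)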