| """ | |
| Tokenizer export script for LexiMind. | |
| Saves the FLAN-T5 tokenizer to the artifacts directory for reproducible | |
| inference without requiring network access. | |
| Author: Oliver Perrin | |
| Date: December 2025 | |
| """ | |
from __future__ import annotations

import argparse
from pathlib import Path

from transformers import AutoTokenizer

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Export tokenizer to artifacts directory")
    parser.add_argument(
        "--model-name",
        default="google/flan-t5-base",
        help="HuggingFace model name for the tokenizer.",
    )
    parser.add_argument(
        "--output-dir",
        default="artifacts/hf_tokenizer",
        help="Output directory for tokenizer files.",
    )
    return parser.parse_args()

def main() -> None:
    args = parse_args()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading tokenizer from {args.model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    print(f"Saving tokenizer to {output_dir}...")
    tokenizer.save_pretrained(str(output_dir))
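
    # Optional round-trip sanity check (a sketch, not in the original script):
    # reload from disk with local_files_only=True to confirm the export is
    # usable without network access.
    #
    #   reloaded = AutoTokenizer.from_pretrained(str(output_dir), local_files_only=True)
    #   assert reloaded.get_vocab() == tokenizer.get_vocab()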

    # Print tokenizer info
    print("\nTokenizer saved successfully!")
    print(f"  Vocab size: {tokenizer.vocab_size}")
    print(f"  Pad token: {tokenizer.pad_token} (id={tokenizer.pad_token_id})")
    print(f"  EOS token: {tokenizer.eos_token} (id={tokenizer.eos_token_id})")
    # Note: T5-family tokenizers define no BOS token, so both fields print as None.
    print(f"  BOS token: {tokenizer.bos_token} (id={tokenizer.bos_token_id})")

    print("\nFiles created:")
    for file in sorted(output_dir.iterdir()):
        print(f"  - {file.name}")


if __name__ == "__main__":
    main()