""" Process book collection with LexiMind model. Analyzes each book to generate: - Overall topic classification - Dominant emotions - Concise summary Results are saved to data/processed/books/library.json for future use. Author: Oliver Perrin Date: December 2025 """ from __future__ import annotations import json import sys from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) from src.inference.factory import create_inference_pipeline from src.utils.logging import configure_logging, get_logger configure_logging() logger = get_logger(__name__) # --------------- Configuration --------------- BOOKS_DIR = PROJECT_ROOT / "data" / "raw" / "books" OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "books" / "library.json" # Chunk books into manageable sections for analysis MAX_CHUNK_LENGTH = 1000 # characters per chunk MAX_CHUNKS = 5 # analyze first N chunks to get representative sample # --------------- Book Processing --------------- def clean_text(text: str) -> str: """Clean and normalize book text.""" # Remove Project Gutenberg headers/footers (common patterns) lines = text.split("\n") start_idx = 0 end_idx = len(lines) for i, line in enumerate(lines): if "START OF" in line.upper() and "PROJECT GUTENBERG" in line.upper(): start_idx = i + 1 break for i in range(len(lines) - 1, -1, -1): if "END OF" in lines[i].upper() and "PROJECT GUTENBERG" in lines[i].upper(): end_idx = i break text = "\n".join(lines[start_idx:end_idx]) # Basic cleanup text = text.strip() text = " ".join(text.split()) # normalize whitespace return text def chunk_text(text: str, chunk_size: int = MAX_CHUNK_LENGTH) -> list[str]: """Split text into chunks for analysis.""" words = text.split() chunks = [] current_chunk = [] current_length = 0 for word in words: current_chunk.append(word) current_length += len(word) + 1 # +1 for space if current_length >= chunk_size: chunks.append(" ".join(current_chunk)) current_chunk = [] current_length = 0 if current_chunk: chunks.append(" ".join(current_chunk)) return chunks def process_book(book_path: Path, pipeline) -> dict: """Analyze a single book and return metadata.""" logger.info(f"Processing {book_path.name}...") # Read and clean try: text = book_path.read_text(encoding="utf-8", errors="ignore") except Exception as exc: logger.error(f"Failed to read {book_path.name}: {exc}") return {} text = clean_text(text) if not text or len(text) < 100: logger.warning(f"Skipping {book_path.name} - insufficient content") return {} # Chunk and sample chunks = chunk_text(text) sample_chunks = chunks[: min(MAX_CHUNKS, len(chunks))] logger.info(f" Analyzing {len(sample_chunks)} chunks (of {len(chunks)} total)...") # Run inference on chunks try: topics = pipeline.predict_topics(sample_chunks) emotions = pipeline.predict_emotions(sample_chunks, threshold=0.3) summaries = pipeline.summarize(sample_chunks, max_length=64) # Aggregate results # Topic: most common prediction topic_counts: dict[str, int] = {} for t in topics: topic_counts[t.label] = topic_counts.get(t.label, 0) + 1 dominant_topic = max(topic_counts.items(), key=lambda x: x[1])[0] # Emotion: aggregate top emotions all_emotions: dict[str, list[float]] = {} for emotion in emotions: for label, score in zip(emotion.labels, emotion.scores, strict=False): if label not in all_emotions: all_emotions[label] = [] all_emotions[label].append(score) # Average scores and take top 3 emotion_scores = { label: sum(scores) / len(scores) for label, scores in all_emotions.items() } top_emotions = sorted(emotion_scores.items(), key=lambda x: x[1], reverse=True)[:3] # Summary: combine first few chunk summaries combined_summary = " ".join(summaries[:3]) result: dict[str, object] = { "title": book_path.stem.replace("_", " ").title(), "filename": book_path.name, "topic": dominant_topic, "emotions": [{"label": label, "score": float(score)} for label, score in top_emotions], "summary": combined_summary, "word_count": len(text.split()), "chunks_analyzed": len(sample_chunks), } logger.info( f" āœ“ {result['title']}: {result['topic']} | " f"{', '.join(str(e['label']) for e in result['emotions'][:2] if isinstance(e, dict))}" # type: ignore[index] ) return result except Exception as exc: logger.error(f"Analysis failed for {book_path.name}: {exc}", exc_info=True) return {} # --------------- Main --------------- def main(): """Process all books and save library.""" logger.info("Loading inference pipeline...") pipeline, label_metadata = create_inference_pipeline( tokenizer_dir="artifacts/hf_tokenizer/", checkpoint_path="checkpoints/best.pt", labels_path="artifacts/labels.json", ) logger.info("Finding books...") book_files = sorted(BOOKS_DIR.glob("*.txt")) if not book_files: logger.error(f"No books found in {BOOKS_DIR}") return logger.info(f"Found {len(book_files)} books") # Process each book library = [] for book_path in book_files: result = process_book(book_path, pipeline) if result: library.append(result) # Save results OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) with open(OUTPUT_PATH, "w") as f: json.dump( { "books": library, "metadata": { "total_books": len(library), "chunk_size": MAX_CHUNK_LENGTH, "chunks_per_book": MAX_CHUNKS, }, }, f, indent=2, ) logger.info(f"\nāœ“ Library saved to {OUTPUT_PATH}") logger.info(f" Processed {len(library)} books") # Print summary print("\n" + "=" * 60) print("BOOK LIBRARY SUMMARY") print("=" * 60) for book in library: print(f"\nšŸ“š {book['title']}") print(f" Topic: {book['topic']}") emotions_str = ", ".join(f"{e['label']} ({e['score']:.0%})" for e in book["emotions"]) print(f" Emotions: {emotions_str}") print(f" Summary: {book['summary'][:100]}...") print("\n" + "=" * 60) if __name__ == "__main__": main()