"""
Process book collection with LexiMind model.

Analyzes each book to generate:
- Overall topic classification
- Dominant emotions
- Concise summary

Results are saved to data/processed/books/library.json for future use.

Author: Oliver Perrin
Date: December 2025
"""

from __future__ import annotations

import json
import sys
from pathlib import Path

# Directory one level above the folder that contains this file; treated as the
# project root for every path built in this module.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Make the root importable so the `src.*` imports that follow can be resolved
# when this file is executed directly as a script.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.inference.factory import create_inference_pipeline
from src.utils.logging import configure_logging, get_logger

# Initialise logging through the project helper at import time, then create
# the logger used by every function in this module.
configure_logging()
logger = get_logger(__name__)

# --------------- Configuration ---------------

# Input directory: main() picks up every "*.txt" file found directly in it.
BOOKS_DIR = PROJECT_ROOT / "data" / "raw" / "books"
# Output file: main() writes the aggregated library here as JSON.
OUTPUT_PATH = PROJECT_ROOT / "data" / "processed" / "books" / "library.json"

# Books are split into chunks and only the leading chunks are analyzed.
MAX_CHUNK_LENGTH = 1000  # target characters per chunk (chunk_text closes a chunk once this is reached, so it may run slightly over)
MAX_CHUNKS = 5  # only the first N chunks of each book are sent to the model


# --------------- Book Processing ---------------


def _is_gutenberg_marker(line: str, keyword: str) -> bool:
    """Return True if *line* mentions both *keyword* and "PROJECT GUTENBERG" (case-insensitive)."""
    upper = line.upper()
    return keyword in upper and "PROJECT GUTENBERG" in upper


def clean_text(text: str) -> str:
    """Strip Project Gutenberg boilerplate and normalize whitespace.

    The book body is everything after the first line that contains both
    "START OF" and "PROJECT GUTENBERG", and before the last line *after that
    header* that contains both "END OF" and "PROJECT GUTENBERG". Matching is
    case-insensitive. If a marker is missing, that side of the text is kept
    in full.

    Args:
        text: Raw book text.

    Returns:
        The body with every run of whitespace (newlines included) collapsed
        to a single space and no leading or trailing whitespace.
    """
    lines = text.split("\n")

    start_idx = 0
    for i, line in enumerate(lines):
        if _is_gutenberg_marker(line, "START OF"):
            start_idx = i + 1
            break

    end_idx = len(lines)
    # Search for the footer only after the header. The header line itself can
    # contain "END OF" (e.g. "... EBOOK THE LEGEND OF ..."), and scanning the
    # whole file would then place the end before the start and drop the book.
    for i in range(len(lines) - 1, start_idx - 1, -1):
        if _is_gutenberg_marker(lines[i], "END OF"):
            end_idx = i
            break

    body = "\n".join(lines[start_idx:end_idx])

    # split()/join collapses all whitespace runs and trims both ends.
    return " ".join(body.split())


def chunk_text(text: str, chunk_size: int = MAX_CHUNK_LENGTH) -> list[str]:
    """Group the words of *text* into space-joined chunks.

    Words are accumulated until their combined length (each word counted
    with one separating space) reaches ``chunk_size``; the chunk is then
    closed, so a chunk always ends on a word boundary and may be slightly
    longer than ``chunk_size``. Any leftover words form a final, shorter
    chunk. Text without words yields an empty list.
    """
    pieces: list[str] = []
    pending: list[str] = []
    pending_chars = 0

    for token in text.split():
        pending.append(token)
        pending_chars += 1 + len(token)  # the extra 1 accounts for the joining space
        if pending_chars < chunk_size:
            continue
        # Size reached: emit the chunk and start a fresh one.
        pieces.append(" ".join(pending))
        pending, pending_chars = [], 0

    if pending:
        pieces.append(" ".join(pending))

    return pieces


def process_book(book_path: Path, pipeline) -> dict:
    """Analyze the opening chunks of one book and return its library entry.

    The file is read as UTF-8 (undecodable bytes are dropped), cleaned with
    ``clean_text`` and split with ``chunk_text``; the first ``MAX_CHUNKS``
    chunks are passed to the pipeline.

    Args:
        book_path: Path of the text file to analyze.
        pipeline: Object providing ``predict_topics(chunks)``,
            ``predict_emotions(chunks, threshold=...)`` and
            ``summarize(chunks, max_length=...)``. Topic predictions are read
            through ``.label``; emotion predictions through ``.labels`` and
            ``.scores``.

    Returns:
        A dict with the keys ``title``, ``filename``, ``topic``, ``emotions``,
        ``summary``, ``word_count`` and ``chunks_analyzed``. An empty dict is
        returned when the file cannot be read, when the cleaned text is
        shorter than 100 characters, or when inference/aggregation raises
        (the error is logged, not propagated).
    """
    filename = book_path.name
    logger.info(f"Processing {filename}...")

    # --- Load -----------------------------------------------------------
    try:
        raw_text = book_path.read_text(encoding="utf-8", errors="ignore")
    except Exception as exc:
        logger.error(f"Failed to read {filename}: {exc}")
        return {}

    text = clean_text(raw_text)
    if len(text) < 100:
        logger.warning(f"Skipping {filename} - insufficient content")
        return {}

    # --- Sample ---------------------------------------------------------
    chunks = chunk_text(text)
    sample_chunks = chunks[:MAX_CHUNKS]
    logger.info(f"  Analyzing {len(sample_chunks)} chunks (of {len(chunks)} total)...")

    # --- Infer and aggregate ---------------------------------------------
    # Everything below stays inside the try so that any failure in the model
    # calls or in the aggregation skips this book instead of aborting the run.
    try:
        topics = pipeline.predict_topics(sample_chunks)
        emotions = pipeline.predict_emotions(sample_chunks, threshold=0.3)
        summaries = pipeline.summarize(sample_chunks, max_length=64)

        # Topic: majority vote over the per-chunk predictions (first seen wins ties).
        votes: dict[str, int] = {}
        for prediction in topics:
            name = prediction.label
            votes[name] = votes.get(name, 0) + 1
        dominant_topic = max(votes, key=votes.__getitem__)

        # Emotions: collect every returned score per label ...
        scores_by_label: dict[str, list[float]] = {}
        for prediction in emotions:
            for label, score in zip(prediction.labels, prediction.scores, strict=False):
                scores_by_label.setdefault(label, []).append(score)

        # ... then rank labels by their mean score (mean over the chunks in
        # which the label was returned) and keep the best three.
        ranked = [(label, sum(values) / len(values)) for label, values in scores_by_label.items()]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        top_emotions = ranked[:3]

        # Summary: the first three chunk summaries joined with spaces.
        combined_summary = " ".join(summaries[:3])

        title = book_path.stem.replace("_", " ").title()
        emotion_entries = [
            {"label": label, "score": float(score)} for label, score in top_emotions
        ]
        result: dict[str, object] = {
            "title": title,
            "filename": filename,
            "topic": dominant_topic,
            "emotions": emotion_entries,
            "summary": combined_summary,
            "word_count": len(text.split()),
            "chunks_analyzed": len(sample_chunks),
        }

        headline = ", ".join(str(entry["label"]) for entry in emotion_entries[:2])
        logger.info(f"  ✓ {title}: {dominant_topic} | {headline}")

        return result

    except Exception as exc:
        logger.error(f"Analysis failed for {filename}: {exc}", exc_info=True)
        return {}


# --------------- Main ---------------


def main():
    """Analyze every book in ``BOOKS_DIR`` and write the library JSON.

    Builds the inference pipeline, runs ``process_book`` on each ``*.txt``
    file (in sorted order), writes the non-empty results together with the
    chunking settings to ``OUTPUT_PATH``, and prints a short per-book summary
    to stdout. Returns early, writing nothing, when no book files are found.
    """
    logger.info("Loading inference pipeline...")

    # The second return value (label metadata) is not needed here.
    pipeline, _label_metadata = create_inference_pipeline(
        tokenizer_dir="artifacts/hf_tokenizer/",
        checkpoint_path="checkpoints/best.pt",
        labels_path="artifacts/labels.json",
    )

    logger.info("Finding books...")
    book_files = sorted(BOOKS_DIR.glob("*.txt"))

    if not book_files:
        logger.error(f"No books found in {BOOKS_DIR}")
        return

    logger.info(f"Found {len(book_files)} books")

    # Books that fail or are skipped come back as empty dicts and are dropped.
    analyzed = (process_book(path, pipeline) for path in book_files)
    library = [entry for entry in analyzed if entry]

    # Persist the results alongside the settings that produced them.
    payload = {
        "books": library,
        "metadata": {
            "total_books": len(library),
            "chunk_size": MAX_CHUNK_LENGTH,
            "chunks_per_book": MAX_CHUNKS,
        },
    }
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w") as handle:
        json.dump(payload, handle, indent=2)

    logger.info(f"\n✓ Library saved to {OUTPUT_PATH}")
    logger.info(f"  Processed {len(library)} books")

    # Human-readable recap on stdout.
    rule = "=" * 60
    print("\n" + rule)
    print("BOOK LIBRARY SUMMARY")
    print(rule)

    for book in library:
        emotions_str = ", ".join(f"{e['label']} ({e['score']:.0%})" for e in book["emotions"])
        print(f"\n📚 {book['title']}")
        print(f"   Topic: {book['topic']}")
        print(f"   Emotions: {emotions_str}")
        print(f"   Summary: {book['summary'][:100]}...")

    print("\n" + rule)

# Run the batch job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()