OliverPerrin committed
Commit 00d412c · 1 Parent(s): a504116

Fixed Gradio Summarization Issue

requirements-dev.txt CHANGED
@@ -7,4 +7,5 @@ flake8>=6.0.0
  mypy>=1.4.0
  jupyter>=1.0.0
  ipywidgets>=8.0.0
- pre-commit>=3.4.0
+ pre-commit>=3.4.0
+ rouge-score>=0.1.2
requirements.txt CHANGED
@@ -11,4 +11,5 @@ datasets>=4.4.0
  gradio>=4.0.0
  seaborn
  pytest
- matplotlib
+ matplotlib
+ rouge-score>=0.1.2
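The `rouge-score>=0.1.2` pin added to both requirements files backs the new evaluation script and the demo's Metrics tab. A minimal sketch of the library API as `scripts/eval_rouge.py` uses it (the reference and prediction strings are placeholders):

```python
# Minimal rouge_score usage matching scripts/eval_rouge.py; the strings are placeholders.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(
    "the cat sat on the mat",        # reference summary
    "a cat was sitting on the mat",  # model prediction
)
for name, score in scores.items():
    print(name, round(score.precision, 3), round(score.recall, 3), round(score.fmeasure, 3))
```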
scripts/demo_gradio.py CHANGED
@@ -24,6 +24,8 @@ PROJECT_ROOT = Path(__file__).resolve().parent.parent
  if str(PROJECT_ROOT) not in sys.path:
      sys.path.insert(0, str(PROJECT_ROOT))
 
+ ROUGE_REPORT_PATH = PROJECT_ROOT / "outputs" / "rouge_validation.json"
+
  from src.inference.factory import create_inference_pipeline
  from src.inference.pipeline import EmotionPrediction, InferencePipeline, TopicPrediction
  from src.utils.logging import configure_logging, get_logger
@@ -358,6 +360,39 @@ def generate_fallback_summary(text: str, max_chars: int = 320) -> str:
      return " ".join(fragments)
 
 
+ def load_rouge_metrics():
+     columns = ["metric", "precision", "recall", "fmeasure"]
+     empty = pd.DataFrame(columns=columns)
+     if not ROUGE_REPORT_PATH.exists():
+         return empty, {"error": f"ROUGE report not found at {ROUGE_REPORT_PATH}"}
+
+     try:
+         with ROUGE_REPORT_PATH.open("r", encoding="utf-8") as handle:
+             report = json.load(handle)
+     except Exception as exc: # pragma: no cover - surfaced in UI
+         logger.error("Failed to read ROUGE report: %s", exc, exc_info=True)
+         return empty, {"error": f"Unable to parse report: {exc}", "report_path": str(ROUGE_REPORT_PATH)}
+
+     rows: list[dict[str, object]] = []
+     for metric_name, components in report.get("metrics", {}).items():
+         rows.append(
+             {
+                 "metric": metric_name,
+                 "precision": round(float(components.get("precision", 0.0)), 4),
+                 "recall": round(float(components.get("recall", 0.0)), 4),
+                 "fmeasure": round(float(components.get("fmeasure", 0.0)), 4),
+             }
+         )
+
+     table = pd.DataFrame(rows, columns=columns) if rows else empty
+     metadata = {
+         "num_examples": report.get("num_examples"),
+         "config": report.get("config"),
+         "report_path": str(ROUGE_REPORT_PATH),
+     }
+     return table, metadata
+
+
  SAMPLE_TEXT = (
      "Artificial intelligence is rapidly transforming the technology landscape. "
      "Machine learning algorithms are now capable of processing vast amounts of data, "
@@ -380,6 +415,8 @@ def create_interface() -> gr.Blocks:
      """
  )
 
+ initial_metrics, initial_metrics_meta = load_rouge_metrics()
+
  with gr.Row():
      with gr.Column(scale=1):
          input_text = gr.Textbox(
@@ -417,11 +454,25 @@
      columns=2,
      height=400,
      interactive=False,
+     type="filepath"
  )
  gr.Markdown(
      "These PNGs come from the visualization-focused tests in `tests/test_models` and are consumed as-is."
  )
  refresh_visuals = gr.Button("Refresh Visuals")
+ with gr.TabItem("Metrics"):
+     rouge_table = gr.Dataframe(
+         value=initial_metrics,
+         headers=["metric", "precision", "recall", "fmeasure"],
+         datatype=["str", "number", "number", "number"],
+         interactive=False,
+         label="ROUGE Scores",
+     )
+     rouge_meta = gr.JSON(
+         value=initial_metrics_meta,
+         label="ROUGE Run Metadata",
+     )
+     refresh_metrics = gr.Button("Refresh Metrics")
  gr.Markdown("### Download Results")
  download_btn = gr.DownloadButton("Download JSON", visible=False)
 
@@ -432,6 +483,7 @@
      outputs=[summary_output, emotion_output, topic_output, attention_output, download_btn],
  )
  refresh_visuals.click(fn=load_visualization_gallery, inputs=None, outputs=visuals)
+ refresh_metrics.click(fn=load_rouge_metrics, inputs=None, outputs=[rouge_table, rouge_meta])
  return demo
 
 
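For reference, a minimal sketch of the report shape `load_rouge_metrics()` expects at `outputs/rouge_validation.json`; the structure mirrors the report written by `scripts/eval_rouge.py` below, and the numbers are placeholders:

```python
# Placeholder report illustrating the keys load_rouge_metrics() reads:
# "metrics" -> per-metric precision/recall/fmeasure, plus "num_examples" and "config".
import json
from pathlib import Path

sample_report = {
    "num_examples": 200,
    "metrics": {
        "rouge1": {"precision": 0.41, "recall": 0.38, "fmeasure": 0.39},
        "rougeL": {"precision": 0.33, "recall": 0.30, "fmeasure": 0.31},
    },
    "config": {"checkpoint": "checkpoints/best.pt", "max_length": 128},
}

out_path = Path("outputs") / "rouge_validation.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(sample_report, indent=2), encoding="utf-8")
```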
scripts/eval_rouge.py ADDED
@@ -0,0 +1,183 @@
+ """Utility script to evaluate LexiMind summaries with ROUGE."""
+ from __future__ import annotations
+
+ import argparse
+ import json
+ from collections import defaultdict
+ from pathlib import Path
+ from statistics import fmean
+ from typing import Dict, Iterable, List, Sequence, Tuple
+
+ import sys
+
+ from rouge_score import rouge_scorer
+ from tqdm import tqdm
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ if str(PROJECT_ROOT) not in sys.path:
+     sys.path.insert(0, str(PROJECT_ROOT))
+
+ from src.inference.factory import create_inference_pipeline
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="Evaluate LexiMind summaries with ROUGE metrics.")
+     parser.add_argument("data", type=Path, help="Path to JSONL file with source text and gold summaries.")
+     parser.add_argument("checkpoint", type=Path, help="Path to the trained checkpoint (e.g., checkpoints/best.pt).")
+     parser.add_argument("labels", type=Path, help="Path to label metadata (e.g., artifacts/labels.json).")
+     parser.add_argument(
+         "--tokenizer-dir",
+         type=Path,
+         default=Path("artifacts/hf_tokenizer"),
+         help="Directory containing the saved tokenizer artifacts.",
+     )
+     parser.add_argument(
+         "--model-config",
+         type=Path,
+         default=None,
+         help="Optional YAML config describing the model architecture.",
+     )
+     parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on (cpu or cuda).")
+     parser.add_argument("--batch-size", type=int, default=8, help="Number of samples per inference batch.")
+     parser.add_argument(
+         "--max-samples",
+         type=int,
+         default=None,
+         help="If provided, limit evaluation to the first N samples for quick smoke tests.",
+     )
+     parser.add_argument(
+         "--max-length",
+         type=int,
+         default=128,
+         help="Maximum length to pass into the summarization head during generation.",
+     )
+     parser.add_argument(
+         "--metrics",
+         type=str,
+         nargs="+",
+         default=("rouge1", "rouge2", "rougeL"),
+         help="ROUGE metrics to compute.",
+     )
+     parser.add_argument(
+         "--source-field",
+         type=str,
+         default="source",
+         help="Field name containing the input document in the JSONL examples.",
+     )
+     parser.add_argument(
+         "--target-field",
+         type=str,
+         default="summary",
+         help="Field name containing the reference summary in the JSONL examples.",
+     )
+     parser.add_argument(
+         "--no-stemmer",
+         action="store_true",
+         help="Disable Porter stemming inside the ROUGE scorer (defaults to enabled).",
+     )
+     parser.add_argument(
+         "--output",
+         type=Path,
+         default=None,
+         help="Optional path to save a JSON report with aggregate metrics and sample counts.",
+     )
+     return parser.parse_args()
+
+
+ def load_examples(
+     path: Path,
+     source_field: str,
+     target_field: str,
+     max_samples: int | None,
+ ) -> List[Tuple[str, str]]:
+     examples: List[Tuple[str, str]] = []
+     with path.open("r", encoding="utf-8") as handle:
+         for line in handle:
+             line = line.strip()
+             if not line:
+                 continue
+             record = json.loads(line)
+             try:
+                 source = str(record[source_field])
+                 target = str(record[target_field])
+             except KeyError as exc: # pragma: no cover - invalid data surface at runtime
+                 raise KeyError(f"Missing field in record: {exc} (available keys: {list(record)})") from exc
+             examples.append((source, target))
+             if max_samples is not None and len(examples) >= max_samples:
+                 break
+     if not examples:
+         raise ValueError(f"No examples loaded from {path}")
+     return examples
+
+
+ def batched(items: Sequence[Tuple[str, str]], batch_size: int) -> Iterable[Sequence[Tuple[str, str]]]:
+     for start in range(0, len(items), batch_size):
+         yield items[start : start + batch_size]
+
+
+ def aggregate_scores(raw_scores: Dict[str, Dict[str, List[float]]]) -> Dict[str, Dict[str, float]]:
+     aggregated: Dict[str, Dict[str, float]] = {}
+     for metric, components in raw_scores.items():
+         aggregated[metric] = {
+             component: (fmean(values) if values else 0.0) for component, values in components.items()
+         }
+     return aggregated
+
+
+ def main() -> None:
+     args = parse_args()
+
+     pipeline, _ = create_inference_pipeline(
+         checkpoint_path=args.checkpoint,
+         labels_path=args.labels,
+         tokenizer_dir=args.tokenizer_dir,
+         model_config_path=args.model_config,
+         device=args.device,
+         summary_max_length=args.max_length,
+     )
+
+     examples = load_examples(args.data, args.source_field, args.target_field, args.max_samples)
+     scorer = rouge_scorer.RougeScorer(list(args.metrics), use_stemmer=not args.no_stemmer)
+
+     score_store: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
+
+     for batch in tqdm(
+         list(batched(examples, args.batch_size)),
+         desc="Evaluating",
+         total=(len(examples) + args.batch_size - 1) // args.batch_size,
+     ):
+         documents = [item[0] for item in batch]
+         references = [item[1] for item in batch]
+         predictions = pipeline.summarize(documents, max_length=args.max_length)
+
+         for reference, prediction in zip(references, predictions):
+             scores = scorer.score(reference, prediction)
+             for metric_name, score in scores.items():
+                 score_store[metric_name]["precision"].append(score.precision)
+                 score_store[metric_name]["recall"].append(score.recall)
+                 score_store[metric_name]["fmeasure"].append(score.fmeasure)
+
+     aggregated = aggregate_scores(score_store)
+     report = {
+         "num_examples": len(examples),
+         "metrics": aggregated,
+         "config": {
+             "data": str(args.data),
+             "checkpoint": str(args.checkpoint),
+             "tokenizer_dir": str(args.tokenizer_dir),
+             "metrics": list(args.metrics),
+             "max_length": args.max_length,
+             "batch_size": args.batch_size,
+             "device": args.device,
+         },
+     }
+
+     print(json.dumps(report, indent=2))
+     if args.output:
+         args.output.parent.mkdir(parents=True, exist_ok=True)
+         with args.output.open("w", encoding="utf-8") as handle:
+             json.dump(report, handle, ensure_ascii=False, indent=2)
+
+
+ if __name__ == "__main__":
+     main()
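To tie the pieces together, a hedged sketch of one way to exercise the new script end to end: the JSONL field names follow the script's `--source-field`/`--target-field` defaults, while the data path and document strings below are illustrative only.

```python
# Illustrative smoke test: write a tiny JSONL set in the format load_examples() expects,
# then point scripts/eval_rouge.py at it and at the report path the Gradio demo reads.
import json
from pathlib import Path

rows = [
    {"source": "Long input document text ...", "summary": "Reference summary ..."},
    {"source": "Another document ...", "summary": "Another reference ..."},
]
eval_path = Path("data/rouge_smoke.jsonl")  # hypothetical location
eval_path.parent.mkdir(parents=True, exist_ok=True)
with eval_path.open("w", encoding="utf-8") as handle:
    for row in rows:
        handle.write(json.dumps(row) + "\n")

# Then run, for example:
#   python scripts/eval_rouge.py data/rouge_smoke.jsonl checkpoints/best.pt artifacts/labels.json \
#       --max-samples 2 --output outputs/rouge_validation.json
```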