Spaces:

OpenEvals
/

open_benchmark_index

Running

App Files Community

Linker1907 committed on Nov 12

Commit

728f913

1 Parent(s): aa1085f

better

Browse files

Files changed (2) hide show

app.py +244 -168
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,52 +1,26 @@
 """
 Gradio dashboard to explore Lighteval tasks.
-Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
-for module-level docstrings with this format:
-name: <task display name>
-dataset: <dataset id(s)>
-abstract: <free text>
-languages: <comma/newline separated language codes or names>
-tags: <comma/newline separated tags>
-paper: <url>
-This file stays outside the lighteval src tree, per request.
 """
-import ast
-import os
 import re
 from collections import Counter
-from git import Repo  # pip install gitpython
-from dataclasses import dataclass
 import gradio as gr
-git_url = "https://github.com/huggingface/lighteval.git"
-repo_dir = "./lighteval"
-if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
-    print(f"Pulling latest changes from {git_url}...")
-    repo = Repo(repo_dir)
-    repo.remotes.origin.pull()
-else:
-    print(f"Cloning {git_url} to {repo_dir}...")
-    Repo.clone_from(git_url, repo_dir)
-REPO_ROOT = "."
-TASK_DIRS = [
-    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
-    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
-]
 star_benchmarks = [
     "aime",
     "mmlu_pro",
-    "gpqa:diamond",
     "hle",
     "arc_agi_2",
     "ifbench",
@@ -62,7 +36,6 @@ star_benchmarks = [
 @dataclass
 class TaskDoc:
-    file_path: str
     module: str
     abstract: str
     languages: list[str]
@@ -70,108 +43,59 @@ class TaskDoc:
     paper: str | None
     dataset: str | None
     name: str | None = None
-def read_file_text(path: str) -> str | None:
-    try:
-        with open(path, "r", encoding="utf-8") as f:
-            return f.read()
-    except Exception:
-        return None
-def parse_module_docstring(text: str) -> str | None:
-    try:
-        mod = ast.parse(text)
-        return ast.get_docstring(mod)
-    except Exception:
-        # Fallback: naive regex for triple-quoted string at top
-        m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
-        return m.group(2).strip() if m else None
-def parse_sections(doc: str) -> dict[str, str]:
-    # Very simple section parser keyed by lines ending with ':' on their own
-    # Expected keys: name, dataset, abstract, languages, tags, paper
-    out: dict[str, str] = {"name": "", "dataset": "", "abstract": "", "languages": "", "tags": "", "paper": ""}
-    current_key: str | None = None
-    for raw_line in doc.splitlines():
-        line = raw_line.rstrip()
-        if line.endswith(":") and line.strip().lower() in {"name:", "dataset:", "abstract:", "languages:", "tags:", "paper:"}:
-            current_key = line[:-1].strip().lower()
-            continue
-        if current_key is not None:
-            # Preserve paragraphs; we will normalize later
-            out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
-    return out
-def split_list_field(value: str) -> list[str]:
-    if not value:
-        return []
-    # Support comma and newline separated values
-    parts = re.split(r"[\n,]", value)
-    cleaned: list[str] = []
-    for p in parts:
-        token = p.strip()
-        if not token:
-            continue
-        cleaned.append(token)
-    return cleaned
-def discover_task_files() -> list[str]:
-    files: list[str] = []
-    print(f"Discovering task files in: {TASK_DIRS}")
-    for base in TASK_DIRS:
-        print(f"Discovering task files in: {base}")
-        if not os.path.isdir(base):
-            continue
-        # Top-level python files in the directory
-        for name in os.listdir(base):
-            if name.endswith(".py"):
-                files.append(os.path.join(base, name))
-        # Also include subdirectory main.py files
-        for dirpath, dirnames, filenames in os.walk(base):
-            if dirpath == base:
-                continue
-            if "main.py" in filenames:
-                files.append(os.path.join(dirpath, "main.py"))
-    # Deduplicate while preserving order
-    seen: set = set()
-    unique_files: list[str] = []
-    for p in files:
-        if p in seen:
-            continue
-        seen.add(p)
-        unique_files.append(p)
-    return sorted(unique_files)
 def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
     docs: list[TaskDoc] = []
     language_counts: Counter = Counter()
     tag_set: set = set()
-    for path in discover_task_files():
-        text = read_file_text(path)
-        if not text:
-            continue
-        doc = parse_module_docstring(text)
-        if not doc:
-            continue
-        sections = parse_sections(doc)
-        abstract = sections.get("abstract", "").strip()
-        langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
-        tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
-        paper = sections.get("paper", "").strip() or None
-        dataset = sections.get("dataset", "").strip() or None
-        name = sections.get("name", "").strip() or None
         for lang in langs:
             language_counts[lang] += 1
         for t in tgs:
             tag_set.add(t)
-        module = os.path.relpath(path, REPO_ROOT)
-        docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
     languages_sorted = [
         lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
     ]
@@ -179,39 +103,30 @@ def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
     return docs, languages_sorted, tags_sorted
-def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
-    return index_tasks()
-ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
-TOP_LANGS = ALL_LANGS[:8]  # show more by default
 def normalize_name_for_matching(name: str) -> str:
-    # Normalize for comparison: lowercase, remove underscores/spaces/colons
     return re.sub(r"[_\s:]+", "", name.lower())
 def is_starred_benchmark(td: TaskDoc) -> bool:
-    # Check multiple possible identifiers
-    parts = td.module.replace("\\", "/").split("/")
-    base_no_ext = parts[-1].rsplit(".", 1)[0]
-    fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
-    # Normalize all possible identifiers
     task_name_raw = (td.name or "").lower().strip()
     task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
     normalized_task_display = normalize_name_for_matching(task_name_display)
     normalized_module = normalize_name_for_matching(base_no_ext)
     normalized_name = normalize_name_for_matching(task_name_raw)
-    # Also check dataset if available
     normalized_dataset = normalize_name_for_matching(td.dataset or "")
-    # Check against star_benchmarks list - try multiple matching strategies
     for star_name in star_benchmarks:
         normalized_star = normalize_name_for_matching(star_name)
-        # Try exact match or substring match on various fields
         if (normalized_star == normalized_task_display or
             normalized_star == normalized_module or
             normalized_star == normalized_name or
@@ -225,27 +140,29 @@ def is_starred_benchmark(td: TaskDoc) -> bool:
 def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
     selected_langs = [lang.lower() for lang in (languages or [])]
     selected_tags = [t.lower() for t in (tags or [])]
     search_lc = (search or "").strip().lower()
     out: list[TaskDoc] = []
     for td in ALL_TASKS:
         if selected_langs and not any(lang in td.languages for lang in selected_langs):
             continue
         if selected_tags and not any(t in td.tags for t in selected_tags):
             continue
         if search_lc:
-            # Search module path, abstract, tags, and dataset names
             hay = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
             if search_lc not in hay:
                 continue
         out.append(td)
-    # Sort: starred benchmarks first, then by name
     out.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
     return out
 def truncate_text(text: str, max_length: int = 250) -> str:
     if len(text) <= max_length:
         return text
     truncated = text[:max_length]
@@ -255,16 +172,55 @@ def truncate_text(text: str, max_length: int = 250) -> str:
     return truncated + "..."
 def render_cards(tasks: list[TaskDoc]) -> str:
-    # Responsive grid of pretty cards; show all details without clicks
     items: list[str] = []
     for t in tasks:
-        parts = t.module.replace("\\", "/").split("/")
-        base_no_ext = parts[-1].rsplit(".", 1)[0]
-        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
         task_name = (t.name or fallback_name).replace("_", " ").title()
-        mod_path = t.module.replace("\\", "/")
-        mod_path = mod_path.split("/", 1)[1]
         source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
         paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
         tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
@@ -285,6 +241,31 @@ def render_cards(tasks: list[TaskDoc]) -> str:
                 dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
         dataset_html = " ".join(dataset_links) if dataset_links else ""
         star_icon = "⭐ " if is_starred_benchmark(t) else ""
         items.append(
             f"""
             <article class="card" tabindex="0" aria-label="Task {task_name}">
@@ -292,6 +273,7 @@ def render_cards(tasks: list[TaskDoc]) -> str:
               {chips_tags_html}
               {chips_langs_html}
               <div class="abstract">{abstract_html}</div>
               <div class="links">{links_html}</div>
             </article>
             """
@@ -318,13 +300,12 @@ def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags:
 def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
-    # Only toggle visibility; preserve current tag selections and keep them active in filtering
     tags_value: list[str] = selected_tags or []
     tasks = filter_tasks(languages, tags_value, search)
     count = len(tasks)
     total = len(ALL_TASKS)
     counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
-    # keep selections when showing; when hiding we keep value but component hidden (so filter still uses them)
     return gr.update(visible=show, value=tags_value), counter_text, render_cards(tasks)
@@ -440,6 +421,82 @@ custom_css = """
   min-height: 48px;
 }
 .links {
   margin-top: 12px;
   font-size: 12px;
@@ -551,6 +608,39 @@ custom_css = """
   .links {
     border-top-color: rgba(148, 163, 184, 0.3);
   }
 }
 /* apply */
@@ -584,7 +674,6 @@ body {
 """
 with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
-    # Header / hero
     with gr.Row():
         with gr.Column():
             gr.Markdown(
@@ -595,46 +684,33 @@ with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
             )
             task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
-    # Controls and results in two columns (left: controls, right: cards)
     with gr.Row(equal_height=False):
         with gr.Column(scale=2):
             gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
-            # Search with interactive debounce
-            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
-            # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
-            # Filters
             with gr.Group():
                 gr.Markdown("**Languages**")
                 show_all_langs = gr.Checkbox(label="Show all languages", value=False)
-                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # default none selected
             with gr.Group():
                 gr.Markdown("**Benchmark type**")
                 show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                 tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
-            # small hint
             gr.Markdown("Tip: use the filters and search together. Results update live.")
         with gr.Column(scale=5):
             cards = gr.HTML()
-            # put an initially visible loading placeholder
             cards.value = "<div style='padding:18px'>Loading tasks…</div>"
-    # Wire interactions
-    # Toggle expand/collapse language choices
     show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
-    # Toggle tag filter visibility (keeps values)
     show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
-    # Live filtering: wire change events on controls to update cards.
-    # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
     search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
     lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
     tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
-    # Initial load: display all tasks (starred benchmarks first)
     initial_tasks = filter_tasks([], [], "")
     cards.value = render_cards(initial_tasks)
-# Run with `python benchmark_finder/app.py`
-demo.launch()

 """
 Gradio dashboard to explore Lighteval tasks.
+Loads tasks from the lighteval Registry and displays them in a searchable,
+filterable interface.
 """
 import re
 from collections import Counter
+from dataclasses import dataclass, field
 import gradio as gr
+from lighteval.tasks.registry import Registry
+registry = Registry(custom_tasks=None, load_multilingual=True)
+modules_data = registry.get_tasks_dump()
 star_benchmarks = [
     "aime",
     "mmlu_pro",
+    "gpqa",
     "hle",
     "arc_agi_2",
     "ifbench",
 @dataclass
 class TaskDoc:
     module: str
     abstract: str
     languages: list[str]
     paper: str | None
     dataset: str | None
     name: str | None = None
+    task_names: list[str] = field(default_factory=list)
+def _module_to_github_path(module: str) -> str:
+    """Convert module path to GitHub source URL path."""
+    if module.startswith("lighteval."):
+        mod_path_parts = module[len("lighteval."):].split(".")
+        return "src/lighteval/" + "/".join(mod_path_parts) + ".py"
+    return "src/lighteval/" + module.replace(".", "/") + ".py"
 def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
+    """Load tasks from registry and build index."""
     docs: list[TaskDoc] = []
     language_counts: Counter = Counter()
     tag_set: set = set()
+    for entry in modules_data:
+        docstring = entry.get("docstring", {})
+        module = entry.get("module", "")
+        # Extract fields from docstring
+        abstract = docstring.get("abstract", "").strip()
+        langs = [lang.lower() for lang in docstring.get("languages", [])]
+        tgs = [t.lower() for t in docstring.get("tags", [])]
+        paper = docstring.get("paper", "").strip() or None
+        name = docstring.get("name", "").strip() or None
+        # Convert dataset array to comma-separated string
+        dataset_list = docstring.get("dataset", [])
+        dataset = ", ".join(dataset_list) if dataset_list else None
+        # Extract task names from tasks array
+        tasks_list = entry.get("tasks", [])
+        task_names = [task.get("name", "") for task in tasks_list if task.get("name")]
+        # Update counters
         for lang in langs:
             language_counts[lang] += 1
         for t in tgs:
             tag_set.add(t)
+        docs.append(TaskDoc(
+            module=module,
+            abstract=abstract,
+            languages=langs,
+            tags=tgs,
+            paper=paper,
+            dataset=dataset,
+            name=name,
+            task_names=task_names
+        ))
     languages_sorted = [
         lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
     ]
     return docs, languages_sorted, tags_sorted
+ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
+TOP_LANGS = ALL_LANGS[:8]
 # Compiled once at import time: the helper below is called for every
 # (task, starred-name) pair during star matching, which also runs inside the sort key.
 _NAME_SEPARATOR_RE = re.compile(r"[_\s:]+")
 def normalize_name_for_matching(name: str) -> str:
     """Return *name* lowercased with all underscores, whitespace, and colons removed.

     Other punctuation (for example hyphens) is kept, so ``"arc-agi"`` and
     ``"arc_agi"`` normalize to different strings.
     """
     return _NAME_SEPARATOR_RE.sub("", name.lower())
 def is_starred_benchmark(td: TaskDoc) -> bool:
+    """Check if task is a starred benchmark."""
+    module_parts = td.module.split(".")
+    base_no_ext = module_parts[-1] if module_parts else ""
+    fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
     task_name_raw = (td.name or "").lower().strip()
     task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
     normalized_task_display = normalize_name_for_matching(task_name_display)
     normalized_module = normalize_name_for_matching(base_no_ext)
     normalized_name = normalize_name_for_matching(task_name_raw)
     normalized_dataset = normalize_name_for_matching(td.dataset or "")
     for star_name in star_benchmarks:
         normalized_star = normalize_name_for_matching(star_name)
         if (normalized_star == normalized_task_display or
             normalized_star == normalized_module or
             normalized_star == normalized_name or
 def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
     """Return the tasks from ALL_TASKS that match the given filters, starred ones first.

     A task is kept when it has at least one of the selected languages (if any
     are selected), at least one of the selected tags (if any are selected),
     and the lowercased, stripped search text (if non-empty) occurs in its
     module, abstract, tags, or dataset text. The result is ordered with
     starred benchmarks first, then by lowercased name (module when unnamed).
     """
     wanted_langs = [lang.lower() for lang in (languages or [])]
     wanted_tags = [tag.lower() for tag in (tags or [])]
     needle = (search or "").strip().lower()
     def _keep(td: TaskDoc) -> bool:
         # An empty selection means "no constraint" for that filter.
         if wanted_langs and not any(lang in td.languages for lang in wanted_langs):
             return False
         if wanted_tags and not any(tag in td.tags for tag in wanted_tags):
             return False
         if not needle:
             return True
         # NOTE(review): td.name and td.task_names are not part of the searched
         # text, although they are shown on the cards — confirm this is intended.
         haystack = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
         return needle in haystack
     kept = [td for td in ALL_TASKS if _keep(td)]
     # False sorts before True, so starred benchmarks come first.
     return sorted(kept, key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
 def truncate_text(text: str, max_length: int = 250) -> str:
+    """Truncate text to max_length, breaking at word boundary if possible."""
     if len(text) <= max_length:
         return text
     truncated = text[:max_length]
     return truncated + "..."
def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Collapse task names that share a prefix (the text before the first colon).

    - A prefix used by two or more names is shown once, as the bare prefix.
    - A prefix used by a single name keeps that full name.
    - Names without a colon are passed through unchanged, duplicates included.

    Output order follows the first occurrence of each standalone name or
    prefix group in ``task_names``.
    """
    # One entry per output slot, appended in first-seen order, so no
    # position bookkeeping or re-sorting is needed afterwards.
    slots: list[list[str]] = []
    groups: dict[str, list[str]] = {}
    for task_name in task_names:
        prefix, colon, _ = task_name.partition(":")
        if not colon:
            # Standalone name: always its own slot.
            slots.append([task_name])
            continue
        members = groups.get(prefix)
        if members is None:
            # First name with this prefix claims the slot for the whole group.
            members = groups[prefix] = []
            slots.append(members)
        members.append(task_name)
    # Only prefix groups can hold more than one name; those collapse to the prefix.
    return [
        members[0] if len(members) == 1 else members[0].partition(":")[0]
        for members in slots
    ]
 def render_cards(tasks: list[TaskDoc]) -> str:
+    """Render task cards as HTML."""
     items: list[str] = []
     for t in tasks:
+        # Get display name
+        module_parts = t.module.split(".")
+        base_no_ext = module_parts[-1] if module_parts else ""
+        fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
         task_name = (t.name or fallback_name).replace("_", " ").title()
+        # Build source link
+        mod_path = _module_to_github_path(t.module)
         source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
         paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
         tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
                 dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
         dataset_html = " ".join(dataset_links) if dataset_links else ""
         star_icon = "⭐ " if is_starred_benchmark(t) else ""
+        # Display evaluation task names (max 3 visible, with dropdown for more)
+        # Group task names by prefix to collapse shared prefixes
+        task_names_html = ""
+        if t.task_names:
+            grouped_names = group_task_names_by_prefix(t.task_names)
+            visible_names = grouped_names[:3]
+            remaining_names = grouped_names[3:]
+            visible_html = " ".join([f'<span class="task-name">{name}</span>' for name in visible_names])
+            if remaining_names:
+                remaining_html = " ".join([f'<span class="task-name">{name}</span>' for name in remaining_names])
+                task_names_html = f'''
+                <div class="task-names">
+                  <div class="task-names-label">Run using lighteval:</div>
+                  <div class="task-names-list">{visible_html}</div>
+                  <details class="task-names-details">
+                    <summary class="task-names-summary">Show {len(remaining_names)} more</summary>
+                    <div class="task-names-list task-names-remaining">{remaining_html}</div>
+                  </details>
+                </div>
+                '''
+            else:
+                task_names_html = f'<div class="task-names"><div class="task-names-label">Run using lighteval:</div><div class="task-names-list">{visible_html}</div></div>'
         items.append(
             f"""
             <article class="card" tabindex="0" aria-label="Task {task_name}">
               {chips_tags_html}
               {chips_langs_html}
               <div class="abstract">{abstract_html}</div>
+              {task_names_html}
               <div class="links">{links_html}</div>
             </article>
             """
 def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
     """Show or hide the tag checkbox group without dropping the selected tags.

     The current tag selection stays in effect for filtering whether or not
     the component is visible. Returns a 3-tuple: the component update
     (visibility plus unchanged value), the counter markdown text, and the
     rendered cards HTML for the filtered tasks.
     """
     tags_value: list[str] = selected_tags or []
     matching = filter_tasks(languages, tags_value, search)
     count, total = len(matching), len(ALL_TASKS)
     if count == total:
         counter_text = f"**{total} tasks**"
     else:
         counter_text = f"**Showing {count} of {total} tasks**"
     tag_group_update = gr.update(visible=show, value=tags_value)
     return tag_group_update, counter_text, render_cards(matching)
   min-height: 48px;
 }
+.task-names {
+  margin-top: 10px;
+  padding-top: 8px;
+  border-top: 1px solid rgba(148, 163, 184, 0.15);
+}
+.task-names-label {
+  font-size: 11px;
+  font-weight: 600;
+  color: #64748b;
+  margin-bottom: 6px;
+  text-transform: uppercase;
+  letter-spacing: 0.5px;
+}
+.task-names-list {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+}
+.task-names-remaining {
+  margin-top: 8px;
+  padding-top: 8px;
+  border-top: 1px solid rgba(148, 163, 184, 0.15);
+}
+.task-names-details {
+  margin-top: 8px;
+}
+.task-names-summary {
+  font-size: 11px;
+  font-weight: 600;
+  color: #64748b;
+  cursor: pointer;
+  user-select: none;
+  padding: 4px 8px;
+  border-radius: 4px;
+  display: inline-block;
+  transition: all 0.2s ease;
+  background: rgba(148, 163, 184, 0.1);
+}
+.task-names-summary:hover {
+  background: rgba(148, 163, 184, 0.2);
+  color: #475569;
+}
+.task-names-summary::-webkit-details-marker {
+  display: none;
+}
+.task-names-details[open] .task-names-summary {
+  margin-bottom: 8px;
+}
+.task-name {
+  display: inline-block;
+  padding: 3px 8px;
+  border-radius: 6px;
+  font-size: 11px;
+  font-weight: 500;
+  background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
+  color: #92400e;
+  border: 1px solid rgba(146, 64, 14, 0.2);
+  font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+  transition: all 0.2s ease;
+}
+.task-name:hover {
+  transform: translateY(-1px);
+  box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
+  background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
+}
 .links {
   margin-top: 12px;
   font-size: 12px;
   .links {
     border-top-color: rgba(148, 163, 184, 0.3);
   }
+  .task-names {
+    border-top-color: rgba(148, 163, 184, 0.25);
+  }
+  .task-names-label {
+    color: #94a3b8;
+  }
+  .task-name {
+    background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
+    color: #fbbf24;
+    border-color: rgba(146, 64, 14, 0.3);
+  }
+  .task-name:hover {
+    background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
+    box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
+  }
+  .task-names-summary {
+    background: rgba(148, 163, 184, 0.15);
+    color: #94a3b8;
+  }
+  .task-names-summary:hover {
+    background: rgba(148, 163, 184, 0.25);
+    color: #cbd5e1;
+  }
+  .task-names-remaining {
+    border-top-color: rgba(148, 163, 184, 0.25);
+  }
 }
 /* apply */
 """
 with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown(
             )
             task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
     with gr.Row(equal_height=False):
         with gr.Column(scale=2):
             gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
+            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="")
             with gr.Group():
                 gr.Markdown("**Languages**")
                 show_all_langs = gr.Checkbox(label="Show all languages", value=False)
+                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
             with gr.Group():
                 gr.Markdown("**Benchmark type**")
                 show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                 tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
             gr.Markdown("Tip: use the filters and search together. Results update live.")
         with gr.Column(scale=5):
             cards = gr.HTML()
             cards.value = "<div style='padding:18px'>Loading tasks…</div>"
     show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
     show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
     search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
     lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
     tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
     initial_tasks = filter_tasks([], [], "")
     cards.value = render_cards(initial_tasks)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- git_python


1	+ git_python
2	+ git+https://github.com/huggingface/lighteval.git#egg=lighteval[dev]