Spaces:
Running
Running
Commit
·
728f913
1
Parent(s):
aa1085f
better
Browse files- app.py +244 -168
- requirements.txt +2 -1
app.py
CHANGED
|
@@ -1,52 +1,26 @@
|
|
| 1 |
"""
|
| 2 |
Gradio dashboard to explore Lighteval tasks.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
name: <task display name>
|
| 8 |
-
dataset: <dataset id(s)>
|
| 9 |
-
abstract: <free text>
|
| 10 |
-
languages: <comma/newline separated language codes or names>
|
| 11 |
-
tags: <comma/newline separated tags>
|
| 12 |
-
paper: <url>
|
| 13 |
-
|
| 14 |
-
This file stays outside the lighteval src tree, per request.
|
| 15 |
"""
|
| 16 |
|
| 17 |
-
import ast
|
| 18 |
-
import os
|
| 19 |
import re
|
| 20 |
from collections import Counter
|
| 21 |
-
from
|
| 22 |
-
from dataclasses import dataclass
|
| 23 |
|
| 24 |
import gradio as gr
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
|
| 31 |
-
print(f"Pulling latest changes from {git_url}...")
|
| 32 |
-
repo = Repo(repo_dir)
|
| 33 |
-
repo.remotes.origin.pull()
|
| 34 |
-
else:
|
| 35 |
-
print(f"Cloning {git_url} to {repo_dir}...")
|
| 36 |
-
Repo.clone_from(git_url, repo_dir)
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
REPO_ROOT = "."
|
| 40 |
-
TASK_DIRS = [
|
| 41 |
-
os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
|
| 42 |
-
os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
|
| 43 |
-
]
|
| 44 |
|
| 45 |
|
| 46 |
star_benchmarks = [
|
| 47 |
"aime",
|
| 48 |
"mmlu_pro",
|
| 49 |
-
"gpqa
|
| 50 |
"hle",
|
| 51 |
"arc_agi_2",
|
| 52 |
"ifbench",
|
|
@@ -62,7 +36,6 @@ star_benchmarks = [
|
|
| 62 |
|
| 63 |
@dataclass
|
| 64 |
class TaskDoc:
|
| 65 |
-
file_path: str
|
| 66 |
module: str
|
| 67 |
abstract: str
|
| 68 |
languages: list[str]
|
|
@@ -70,108 +43,59 @@ class TaskDoc:
|
|
| 70 |
paper: str | None
|
| 71 |
dataset: str | None
|
| 72 |
name: str | None = None
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
-
def
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def parse_module_docstring(text: str) -> str | None:
|
| 84 |
-
try:
|
| 85 |
-
mod = ast.parse(text)
|
| 86 |
-
return ast.get_docstring(mod)
|
| 87 |
-
except Exception:
|
| 88 |
-
# Fallback: naive regex for triple-quoted string at top
|
| 89 |
-
m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
|
| 90 |
-
return m.group(2).strip() if m else None
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
def parse_sections(doc: str) -> dict[str, str]:
|
| 94 |
-
# Very simple section parser keyed by lines ending with ':' on their own
|
| 95 |
-
# Expected keys: name, dataset, abstract, languages, tags, paper
|
| 96 |
-
out: dict[str, str] = {"name": "", "dataset": "", "abstract": "", "languages": "", "tags": "", "paper": ""}
|
| 97 |
-
current_key: str | None = None
|
| 98 |
-
for raw_line in doc.splitlines():
|
| 99 |
-
line = raw_line.rstrip()
|
| 100 |
-
if line.endswith(":") and line.strip().lower() in {"name:", "dataset:", "abstract:", "languages:", "tags:", "paper:"}:
|
| 101 |
-
current_key = line[:-1].strip().lower()
|
| 102 |
-
continue
|
| 103 |
-
if current_key is not None:
|
| 104 |
-
# Preserve paragraphs; we will normalize later
|
| 105 |
-
out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
|
| 106 |
-
return out
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def split_list_field(value: str) -> list[str]:
|
| 110 |
-
if not value:
|
| 111 |
-
return []
|
| 112 |
-
# Support comma and newline separated values
|
| 113 |
-
parts = re.split(r"[\n,]", value)
|
| 114 |
-
cleaned: list[str] = []
|
| 115 |
-
for p in parts:
|
| 116 |
-
token = p.strip()
|
| 117 |
-
if not token:
|
| 118 |
-
continue
|
| 119 |
-
cleaned.append(token)
|
| 120 |
-
return cleaned
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def discover_task_files() -> list[str]:
|
| 124 |
-
files: list[str] = []
|
| 125 |
-
print(f"Discovering task files in: {TASK_DIRS}")
|
| 126 |
-
for base in TASK_DIRS:
|
| 127 |
-
print(f"Discovering task files in: {base}")
|
| 128 |
-
if not os.path.isdir(base):
|
| 129 |
-
continue
|
| 130 |
-
# Top-level python files in the directory
|
| 131 |
-
for name in os.listdir(base):
|
| 132 |
-
if name.endswith(".py"):
|
| 133 |
-
files.append(os.path.join(base, name))
|
| 134 |
-
# Also include subdirectory main.py files
|
| 135 |
-
for dirpath, dirnames, filenames in os.walk(base):
|
| 136 |
-
if dirpath == base:
|
| 137 |
-
continue
|
| 138 |
-
if "main.py" in filenames:
|
| 139 |
-
files.append(os.path.join(dirpath, "main.py"))
|
| 140 |
-
# Deduplicate while preserving order
|
| 141 |
-
seen: set = set()
|
| 142 |
-
unique_files: list[str] = []
|
| 143 |
-
for p in files:
|
| 144 |
-
if p in seen:
|
| 145 |
-
continue
|
| 146 |
-
seen.add(p)
|
| 147 |
-
unique_files.append(p)
|
| 148 |
-
return sorted(unique_files)
|
| 149 |
|
| 150 |
|
| 151 |
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
|
|
|
|
| 152 |
docs: list[TaskDoc] = []
|
| 153 |
language_counts: Counter = Counter()
|
| 154 |
tag_set: set = set()
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
for lang in langs:
|
| 170 |
language_counts[lang] += 1
|
| 171 |
for t in tgs:
|
| 172 |
tag_set.add(t)
|
| 173 |
-
|
| 174 |
-
docs.append(TaskDoc(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
languages_sorted = [
|
| 176 |
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
|
| 177 |
]
|
|
@@ -179,39 +103,30 @@ def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
|
|
| 179 |
return docs, languages_sorted, tags_sorted
|
| 180 |
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
|
| 187 |
-
TOP_LANGS = ALL_LANGS[:8] # show more by default
|
| 188 |
|
| 189 |
|
| 190 |
def normalize_name_for_matching(name: str) -> str:
|
| 191 |
-
|
| 192 |
return re.sub(r"[_\s:]+", "", name.lower())
|
| 193 |
|
| 194 |
|
| 195 |
def is_starred_benchmark(td: TaskDoc) -> bool:
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
base_no_ext =
|
| 199 |
-
fallback_name =
|
| 200 |
|
| 201 |
-
# Normalize all possible identifiers
|
| 202 |
task_name_raw = (td.name or "").lower().strip()
|
| 203 |
task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
|
| 204 |
normalized_task_display = normalize_name_for_matching(task_name_display)
|
| 205 |
normalized_module = normalize_name_for_matching(base_no_ext)
|
| 206 |
normalized_name = normalize_name_for_matching(task_name_raw)
|
| 207 |
-
|
| 208 |
-
# Also check dataset if available
|
| 209 |
normalized_dataset = normalize_name_for_matching(td.dataset or "")
|
| 210 |
|
| 211 |
-
# Check against star_benchmarks list - try multiple matching strategies
|
| 212 |
for star_name in star_benchmarks:
|
| 213 |
normalized_star = normalize_name_for_matching(star_name)
|
| 214 |
-
# Try exact match or substring match on various fields
|
| 215 |
if (normalized_star == normalized_task_display or
|
| 216 |
normalized_star == normalized_module or
|
| 217 |
normalized_star == normalized_name or
|
|
@@ -225,27 +140,29 @@ def is_starred_benchmark(td: TaskDoc) -> bool:
|
|
| 225 |
|
| 226 |
|
| 227 |
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
|
|
|
|
| 228 |
selected_langs = [lang.lower() for lang in (languages or [])]
|
| 229 |
selected_tags = [t.lower() for t in (tags or [])]
|
| 230 |
search_lc = (search or "").strip().lower()
|
| 231 |
out: list[TaskDoc] = []
|
|
|
|
| 232 |
for td in ALL_TASKS:
|
| 233 |
if selected_langs and not any(lang in td.languages for lang in selected_langs):
|
| 234 |
continue
|
| 235 |
if selected_tags and not any(t in td.tags for t in selected_tags):
|
| 236 |
continue
|
| 237 |
if search_lc:
|
| 238 |
-
# Search module path, abstract, tags, and dataset names
|
| 239 |
hay = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
|
| 240 |
if search_lc not in hay:
|
| 241 |
continue
|
| 242 |
out.append(td)
|
| 243 |
-
|
| 244 |
out.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
|
| 245 |
return out
|
| 246 |
|
| 247 |
|
| 248 |
def truncate_text(text: str, max_length: int = 250) -> str:
|
|
|
|
| 249 |
if len(text) <= max_length:
|
| 250 |
return text
|
| 251 |
truncated = text[:max_length]
|
|
@@ -255,16 +172,55 @@ def truncate_text(text: str, max_length: int = 250) -> str:
|
|
| 255 |
return truncated + "..."
|
| 256 |
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
def render_cards(tasks: list[TaskDoc]) -> str:
|
| 259 |
-
|
| 260 |
items: list[str] = []
|
| 261 |
for t in tasks:
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
| 265 |
task_name = (t.name or fallback_name).replace("_", " ").title()
|
| 266 |
-
|
| 267 |
-
|
|
|
|
| 268 |
source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
|
| 269 |
paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
|
| 270 |
tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
|
|
@@ -285,6 +241,31 @@ def render_cards(tasks: list[TaskDoc]) -> str:
|
|
| 285 |
dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
|
| 286 |
dataset_html = " ".join(dataset_links) if dataset_links else ""
|
| 287 |
star_icon = "⭐ " if is_starred_benchmark(t) else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
items.append(
|
| 289 |
f"""
|
| 290 |
<article class="card" tabindex="0" aria-label="Task {task_name}">
|
|
@@ -292,6 +273,7 @@ def render_cards(tasks: list[TaskDoc]) -> str:
|
|
| 292 |
{chips_tags_html}
|
| 293 |
{chips_langs_html}
|
| 294 |
<div class="abstract">{abstract_html}</div>
|
|
|
|
| 295 |
<div class="links">{links_html}</div>
|
| 296 |
</article>
|
| 297 |
"""
|
|
@@ -318,13 +300,12 @@ def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags:
|
|
| 318 |
|
| 319 |
|
| 320 |
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
|
| 321 |
-
|
| 322 |
tags_value: list[str] = selected_tags or []
|
| 323 |
tasks = filter_tasks(languages, tags_value, search)
|
| 324 |
count = len(tasks)
|
| 325 |
total = len(ALL_TASKS)
|
| 326 |
counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
|
| 327 |
-
# keep selections when showing; when hiding we keep value but component hidden (so filter still uses them)
|
| 328 |
return gr.update(visible=show, value=tags_value), counter_text, render_cards(tasks)
|
| 329 |
|
| 330 |
|
|
@@ -440,6 +421,82 @@ custom_css = """
|
|
| 440 |
min-height: 48px;
|
| 441 |
}
|
| 442 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
.links {
|
| 444 |
margin-top: 12px;
|
| 445 |
font-size: 12px;
|
|
@@ -551,6 +608,39 @@ custom_css = """
|
|
| 551 |
.links {
|
| 552 |
border-top-color: rgba(148, 163, 184, 0.3);
|
| 553 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
}
|
| 555 |
|
| 556 |
/* apply */
|
|
@@ -584,7 +674,6 @@ body {
|
|
| 584 |
"""
|
| 585 |
|
| 586 |
with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
|
| 587 |
-
# Header / hero
|
| 588 |
with gr.Row():
|
| 589 |
with gr.Column():
|
| 590 |
gr.Markdown(
|
|
@@ -595,46 +684,33 @@ with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
|
|
| 595 |
)
|
| 596 |
task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
|
| 597 |
|
| 598 |
-
# Controls and results in two columns (left: controls, right: cards)
|
| 599 |
with gr.Row(equal_height=False):
|
| 600 |
with gr.Column(scale=2):
|
| 601 |
gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
|
| 602 |
-
|
| 603 |
-
search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
|
| 604 |
-
# We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
|
| 605 |
-
# Filters
|
| 606 |
with gr.Group():
|
| 607 |
gr.Markdown("**Languages**")
|
| 608 |
show_all_langs = gr.Checkbox(label="Show all languages", value=False)
|
| 609 |
-
lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
|
| 610 |
with gr.Group():
|
| 611 |
gr.Markdown("**Benchmark type**")
|
| 612 |
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
|
| 613 |
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
|
| 614 |
-
# small hint
|
| 615 |
gr.Markdown("Tip: use the filters and search together. Results update live.")
|
| 616 |
|
| 617 |
with gr.Column(scale=5):
|
| 618 |
cards = gr.HTML()
|
| 619 |
-
# put an initially visible loading placeholder
|
| 620 |
cards.value = "<div style='padding:18px'>Loading tasks…</div>"
|
| 621 |
|
| 622 |
-
# Wire interactions
|
| 623 |
-
# Toggle expand/collapse language choices
|
| 624 |
show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
|
| 625 |
-
# Toggle tag filter visibility (keeps values)
|
| 626 |
show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
|
| 627 |
-
|
| 628 |
-
# Live filtering: wire change events on controls to update cards.
|
| 629 |
-
# Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
|
| 630 |
search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 631 |
lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 632 |
tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 633 |
|
| 634 |
-
# Initial load: display all tasks (starred benchmarks first)
|
| 635 |
initial_tasks = filter_tasks([], [], "")
|
| 636 |
cards.value = render_cards(initial_tasks)
|
| 637 |
|
| 638 |
|
| 639 |
-
|
| 640 |
-
demo.launch()
|
|
|
|
| 1 |
"""
|
| 2 |
Gradio dashboard to explore Lighteval tasks.
|
| 3 |
|
| 4 |
+
Loads tasks from the lighteval Registry and displays them in a searchable,
|
| 5 |
+
filterable interface.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
|
|
|
|
|
|
| 8 |
import re
|
| 9 |
from collections import Counter
|
| 10 |
+
from dataclasses import dataclass, field
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
+
from lighteval.tasks.registry import Registry
|
| 14 |
|
| 15 |
|
| 16 |
+
registry = Registry(custom_tasks=None, load_multilingual=True)
|
| 17 |
+
modules_data = registry.get_tasks_dump()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
star_benchmarks = [
|
| 21 |
"aime",
|
| 22 |
"mmlu_pro",
|
| 23 |
+
"gpqa",
|
| 24 |
"hle",
|
| 25 |
"arc_agi_2",
|
| 26 |
"ifbench",
|
|
|
|
| 36 |
|
| 37 |
@dataclass
|
| 38 |
class TaskDoc:
|
|
|
|
| 39 |
module: str
|
| 40 |
abstract: str
|
| 41 |
languages: list[str]
|
|
|
|
| 43 |
paper: str | None
|
| 44 |
dataset: str | None
|
| 45 |
name: str | None = None
|
| 46 |
+
task_names: list[str] = field(default_factory=list)
|
| 47 |
|
| 48 |
|
| 49 |
+
def _module_to_github_path(module: str) -> str:
|
| 50 |
+
"""Convert module path to GitHub source URL path."""
|
| 51 |
+
if module.startswith("lighteval."):
|
| 52 |
+
mod_path_parts = module[len("lighteval."):].split(".")
|
| 53 |
+
return "src/lighteval/" + "/".join(mod_path_parts) + ".py"
|
| 54 |
+
return "src/lighteval/" + module.replace(".", "/") + ".py"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
|
| 58 |
+
"""Load tasks from registry and build index."""
|
| 59 |
docs: list[TaskDoc] = []
|
| 60 |
language_counts: Counter = Counter()
|
| 61 |
tag_set: set = set()
|
| 62 |
+
|
| 63 |
+
for entry in modules_data:
|
| 64 |
+
docstring = entry.get("docstring", {})
|
| 65 |
+
module = entry.get("module", "")
|
| 66 |
+
|
| 67 |
+
# Extract fields from docstring
|
| 68 |
+
abstract = docstring.get("abstract", "").strip()
|
| 69 |
+
langs = [lang.lower() for lang in docstring.get("languages", [])]
|
| 70 |
+
tgs = [t.lower() for t in docstring.get("tags", [])]
|
| 71 |
+
paper = docstring.get("paper", "").strip() or None
|
| 72 |
+
name = docstring.get("name", "").strip() or None
|
| 73 |
+
|
| 74 |
+
# Convert dataset array to comma-separated string
|
| 75 |
+
dataset_list = docstring.get("dataset", [])
|
| 76 |
+
dataset = ", ".join(dataset_list) if dataset_list else None
|
| 77 |
+
|
| 78 |
+
# Extract task names from tasks array
|
| 79 |
+
tasks_list = entry.get("tasks", [])
|
| 80 |
+
task_names = [task.get("name", "") for task in tasks_list if task.get("name")]
|
| 81 |
+
|
| 82 |
+
# Update counters
|
| 83 |
for lang in langs:
|
| 84 |
language_counts[lang] += 1
|
| 85 |
for t in tgs:
|
| 86 |
tag_set.add(t)
|
| 87 |
+
|
| 88 |
+
docs.append(TaskDoc(
|
| 89 |
+
module=module,
|
| 90 |
+
abstract=abstract,
|
| 91 |
+
languages=langs,
|
| 92 |
+
tags=tgs,
|
| 93 |
+
paper=paper,
|
| 94 |
+
dataset=dataset,
|
| 95 |
+
name=name,
|
| 96 |
+
task_names=task_names
|
| 97 |
+
))
|
| 98 |
+
|
| 99 |
languages_sorted = [
|
| 100 |
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
|
| 101 |
]
|
|
|
|
| 103 |
return docs, languages_sorted, tags_sorted
|
| 104 |
|
| 105 |
|
| 106 |
+
ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
|
| 107 |
+
TOP_LANGS = ALL_LANGS[:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
def normalize_name_for_matching(name: str) -> str:
    """Return a comparison key for *name*.

    The name is lowercased and every run of underscores, whitespace or
    colons is removed, so differently punctuated spellings compare equal.
    """
    lowered = name.lower()
    separators = r"[_\s:]+"
    return re.sub(separators, "", lowered)
|
| 113 |
|
| 114 |
|
| 115 |
def is_starred_benchmark(td: TaskDoc) -> bool:
|
| 116 |
+
"""Check if task is a starred benchmark."""
|
| 117 |
+
module_parts = td.module.split(".")
|
| 118 |
+
base_no_ext = module_parts[-1] if module_parts else ""
|
| 119 |
+
fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
|
| 120 |
|
|
|
|
| 121 |
task_name_raw = (td.name or "").lower().strip()
|
| 122 |
task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
|
| 123 |
normalized_task_display = normalize_name_for_matching(task_name_display)
|
| 124 |
normalized_module = normalize_name_for_matching(base_no_ext)
|
| 125 |
normalized_name = normalize_name_for_matching(task_name_raw)
|
|
|
|
|
|
|
| 126 |
normalized_dataset = normalize_name_for_matching(td.dataset or "")
|
| 127 |
|
|
|
|
| 128 |
for star_name in star_benchmarks:
|
| 129 |
normalized_star = normalize_name_for_matching(star_name)
|
|
|
|
| 130 |
if (normalized_star == normalized_task_display or
|
| 131 |
normalized_star == normalized_module or
|
| 132 |
normalized_star == normalized_name or
|
|
|
|
| 140 |
|
| 141 |
|
| 142 |
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Select the tasks that satisfy the language, tag and text filters.

    A task passes the language (resp. tag) filter when it has at least one of
    the selected languages (resp. tags); an empty or ``None`` selection
    disables that filter. The search text is matched case-insensitively as a
    substring of the task's module, abstract, tags and dataset.

    Returns:
        Matching tasks, starred benchmarks first, then by name (falling back
        to the module path), case-insensitively.
    """
    wanted_langs = [code.lower() for code in (languages or [])]
    wanted_tags = [label.lower() for label in (tags or [])]
    needle = (search or "").strip().lower()

    def _matches(doc) -> bool:
        # Each active filter rejects a task that shares nothing with it.
        if wanted_langs and all(code not in doc.languages for code in wanted_langs):
            return False
        if wanted_tags and all(label not in doc.tags for label in wanted_tags):
            return False
        if not needle:
            return True
        haystack = " ".join([doc.module, doc.abstract, ", ".join(doc.tags), (doc.dataset or "")]).lower()
        return needle in haystack

    matched: list[TaskDoc] = [doc for doc in ALL_TASKS if _matches(doc)]
    matched.sort(key=lambda doc: (not is_starred_benchmark(doc), (doc.name or doc.module).lower()))
    return matched
|
| 162 |
|
| 163 |
|
| 164 |
def truncate_text(text: str, max_length: int = 250) -> str:
|
| 165 |
+
"""Truncate text to max_length, breaking at word boundary if possible."""
|
| 166 |
if len(text) <= max_length:
|
| 167 |
return text
|
| 168 |
truncated = text[:max_length]
|
|
|
|
| 172 |
return truncated + "..."
|
| 173 |
|
| 174 |
|
| 175 |
+
def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Collapse task names that share a prefix (the part before the first colon).

    When two or more names share a prefix, that prefix is shown once in place
    of all of them. A name whose prefix is unique, or which has no colon, is
    shown in full. Each label appears at the position of its first occurrence
    in ``task_names``.

    Labels are de-duplicated: a standalone name equal to a collapsed prefix
    (``"mmlu"`` next to ``"mmlu:a"`` and ``"mmlu:b"``) or a repeated name
    yields a single entry rather than two identical ones.

    Args:
        task_names: Task names, optionally of the form ``prefix:subset``.

    Returns:
        The display labels, in first-occurrence order, without duplicates.
    """
    # How many names carry each prefix decides whether that prefix collapses.
    prefix_counts = Counter(
        name.partition(":")[0] for name in task_names if ":" in name
    )

    labels: list[str] = []
    seen: set[str] = set()
    for name in task_names:
        label = name
        if ":" in name:
            prefix = name.partition(":")[0]
            if prefix_counts[prefix] > 1:
                label = prefix
        # Emitting a label only the first time it is seen both collapses a
        # group to one entry and keeps the original ordering.
        if label not in seen:
            seen.add(label)
            labels.append(label)
    return labels
|
| 210 |
+
|
| 211 |
+
|
| 212 |
def render_cards(tasks: list[TaskDoc]) -> str:
|
| 213 |
+
"""Render task cards as HTML."""
|
| 214 |
items: list[str] = []
|
| 215 |
for t in tasks:
|
| 216 |
+
# Get display name
|
| 217 |
+
module_parts = t.module.split(".")
|
| 218 |
+
base_no_ext = module_parts[-1] if module_parts else ""
|
| 219 |
+
fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
|
| 220 |
task_name = (t.name or fallback_name).replace("_", " ").title()
|
| 221 |
+
|
| 222 |
+
# Build source link
|
| 223 |
+
mod_path = _module_to_github_path(t.module)
|
| 224 |
source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
|
| 225 |
paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
|
| 226 |
tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
|
|
|
|
| 241 |
dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
|
| 242 |
dataset_html = " ".join(dataset_links) if dataset_links else ""
|
| 243 |
star_icon = "⭐ " if is_starred_benchmark(t) else ""
|
| 244 |
+
|
| 245 |
+
# Display evaluation task names (max 3 visible, with dropdown for more)
|
| 246 |
+
# Group task names by prefix to collapse shared prefixes
|
| 247 |
+
task_names_html = ""
|
| 248 |
+
if t.task_names:
|
| 249 |
+
grouped_names = group_task_names_by_prefix(t.task_names)
|
| 250 |
+
visible_names = grouped_names[:3]
|
| 251 |
+
remaining_names = grouped_names[3:]
|
| 252 |
+
visible_html = " ".join([f'<span class="task-name">{name}</span>' for name in visible_names])
|
| 253 |
+
|
| 254 |
+
if remaining_names:
|
| 255 |
+
remaining_html = " ".join([f'<span class="task-name">{name}</span>' for name in remaining_names])
|
| 256 |
+
task_names_html = f'''
|
| 257 |
+
<div class="task-names">
|
| 258 |
+
<div class="task-names-label">Run using lighteval:</div>
|
| 259 |
+
<div class="task-names-list">{visible_html}</div>
|
| 260 |
+
<details class="task-names-details">
|
| 261 |
+
<summary class="task-names-summary">Show {len(remaining_names)} more</summary>
|
| 262 |
+
<div class="task-names-list task-names-remaining">{remaining_html}</div>
|
| 263 |
+
</details>
|
| 264 |
+
</div>
|
| 265 |
+
'''
|
| 266 |
+
else:
|
| 267 |
+
task_names_html = f'<div class="task-names"><div class="task-names-label">Run using lighteval:</div><div class="task-names-list">{visible_html}</div></div>'
|
| 268 |
+
|
| 269 |
items.append(
|
| 270 |
f"""
|
| 271 |
<article class="card" tabindex="0" aria-label="Task {task_name}">
|
|
|
|
| 273 |
{chips_tags_html}
|
| 274 |
{chips_langs_html}
|
| 275 |
<div class="abstract">{abstract_html}</div>
|
| 276 |
+
{task_names_html}
|
| 277 |
<div class="links">{links_html}</div>
|
| 278 |
</article>
|
| 279 |
"""
|
|
|
|
| 300 |
|
| 301 |
|
| 302 |
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkbox group without dropping its selection.

    The current filters are re-applied so the counter and cards stay in sync
    with the (possibly hidden) tag selection.

    Returns:
        A tuple of the component update for the tag group, the Markdown
        counter text, and the rendered cards HTML.
    """
    current_tags: list[str] = selected_tags or []
    matching = filter_tasks(languages, current_tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    tag_group_update = gr.update(visible=show, value=current_tags)
    return tag_group_update, counter_text, render_cards(matching)
|
| 310 |
|
| 311 |
|
|
|
|
| 421 |
min-height: 48px;
|
| 422 |
}
|
| 423 |
|
| 424 |
+
.task-names {
|
| 425 |
+
margin-top: 10px;
|
| 426 |
+
padding-top: 8px;
|
| 427 |
+
border-top: 1px solid rgba(148, 163, 184, 0.15);
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
.task-names-label {
|
| 431 |
+
font-size: 11px;
|
| 432 |
+
font-weight: 600;
|
| 433 |
+
color: #64748b;
|
| 434 |
+
margin-bottom: 6px;
|
| 435 |
+
text-transform: uppercase;
|
| 436 |
+
letter-spacing: 0.5px;
|
| 437 |
+
}
|
| 438 |
+
|
| 439 |
+
.task-names-list {
|
| 440 |
+
display: flex;
|
| 441 |
+
flex-wrap: wrap;
|
| 442 |
+
gap: 6px;
|
| 443 |
+
}
|
| 444 |
+
|
| 445 |
+
.task-names-remaining {
|
| 446 |
+
margin-top: 8px;
|
| 447 |
+
padding-top: 8px;
|
| 448 |
+
border-top: 1px solid rgba(148, 163, 184, 0.15);
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
.task-names-details {
|
| 452 |
+
margin-top: 8px;
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
.task-names-summary {
|
| 456 |
+
font-size: 11px;
|
| 457 |
+
font-weight: 600;
|
| 458 |
+
color: #64748b;
|
| 459 |
+
cursor: pointer;
|
| 460 |
+
user-select: none;
|
| 461 |
+
padding: 4px 8px;
|
| 462 |
+
border-radius: 4px;
|
| 463 |
+
display: inline-block;
|
| 464 |
+
transition: all 0.2s ease;
|
| 465 |
+
background: rgba(148, 163, 184, 0.1);
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
.task-names-summary:hover {
|
| 469 |
+
background: rgba(148, 163, 184, 0.2);
|
| 470 |
+
color: #475569;
|
| 471 |
+
}
|
| 472 |
+
|
| 473 |
+
.task-names-summary::-webkit-details-marker {
|
| 474 |
+
display: none;
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
.task-names-details[open] .task-names-summary {
|
| 478 |
+
margin-bottom: 8px;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
.task-name {
|
| 482 |
+
display: inline-block;
|
| 483 |
+
padding: 3px 8px;
|
| 484 |
+
border-radius: 6px;
|
| 485 |
+
font-size: 11px;
|
| 486 |
+
font-weight: 500;
|
| 487 |
+
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
|
| 488 |
+
color: #92400e;
|
| 489 |
+
border: 1px solid rgba(146, 64, 14, 0.2);
|
| 490 |
+
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
| 491 |
+
transition: all 0.2s ease;
|
| 492 |
+
}
|
| 493 |
+
|
| 494 |
+
.task-name:hover {
|
| 495 |
+
transform: translateY(-1px);
|
| 496 |
+
box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
|
| 497 |
+
background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
|
| 498 |
+
}
|
| 499 |
+
|
| 500 |
.links {
|
| 501 |
margin-top: 12px;
|
| 502 |
font-size: 12px;
|
|
|
|
| 608 |
.links {
|
| 609 |
border-top-color: rgba(148, 163, 184, 0.3);
|
| 610 |
}
|
| 611 |
+
|
| 612 |
+
.task-names {
|
| 613 |
+
border-top-color: rgba(148, 163, 184, 0.25);
|
| 614 |
+
}
|
| 615 |
+
|
| 616 |
+
.task-names-label {
|
| 617 |
+
color: #94a3b8;
|
| 618 |
+
}
|
| 619 |
+
|
| 620 |
+
.task-name {
|
| 621 |
+
background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
|
| 622 |
+
color: #fbbf24;
|
| 623 |
+
border-color: rgba(146, 64, 14, 0.3);
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
.task-name:hover {
|
| 627 |
+
background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
|
| 628 |
+
box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
|
| 629 |
+
}
|
| 630 |
+
|
| 631 |
+
.task-names-summary {
|
| 632 |
+
background: rgba(148, 163, 184, 0.15);
|
| 633 |
+
color: #94a3b8;
|
| 634 |
+
}
|
| 635 |
+
|
| 636 |
+
.task-names-summary:hover {
|
| 637 |
+
background: rgba(148, 163, 184, 0.25);
|
| 638 |
+
color: #cbd5e1;
|
| 639 |
+
}
|
| 640 |
+
|
| 641 |
+
.task-names-remaining {
|
| 642 |
+
border-top-color: rgba(148, 163, 184, 0.25);
|
| 643 |
+
}
|
| 644 |
}
|
| 645 |
|
| 646 |
/* apply */
|
|
|
|
| 674 |
"""
|
| 675 |
|
| 676 |
with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
|
|
|
|
| 677 |
with gr.Row():
|
| 678 |
with gr.Column():
|
| 679 |
gr.Markdown(
|
|
|
|
| 684 |
)
|
| 685 |
task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
|
| 686 |
|
|
|
|
| 687 |
with gr.Row(equal_height=False):
|
| 688 |
with gr.Column(scale=2):
|
| 689 |
gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
|
| 690 |
+
search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="")
|
|
|
|
|
|
|
|
|
|
| 691 |
with gr.Group():
|
| 692 |
gr.Markdown("**Languages**")
|
| 693 |
show_all_langs = gr.Checkbox(label="Show all languages", value=False)
|
| 694 |
+
lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
|
| 695 |
with gr.Group():
|
| 696 |
gr.Markdown("**Benchmark type**")
|
| 697 |
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
|
| 698 |
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
|
|
|
|
| 699 |
gr.Markdown("Tip: use the filters and search together. Results update live.")
|
| 700 |
|
| 701 |
with gr.Column(scale=5):
|
| 702 |
cards = gr.HTML()
|
|
|
|
| 703 |
cards.value = "<div style='padding:18px'>Loading tasks…</div>"
|
| 704 |
|
|
|
|
|
|
|
| 705 |
show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
|
|
|
|
| 706 |
show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
|
|
|
|
|
|
|
|
|
|
| 707 |
search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 708 |
lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 709 |
tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
|
| 710 |
|
|
|
|
| 711 |
initial_tasks = filter_tasks([], [], "")
|
| 712 |
cards.value = render_cards(initial_tasks)
|
| 713 |
|
| 714 |
|
| 715 |
+
if __name__ == "__main__":
|
| 716 |
+
demo.launch()
|
requirements.txt
CHANGED
|
@@ -1 +1,2 @@
|
|
| 1 |
-
git_python
|
|
|
|
|
|
| 1 |
+
git_python
|
| 2 |
+
git+https://github.com/huggingface/lighteval.git#egg=lighteval[dev]
|