Linker1907 committed on
Commit
728f913
·
1 Parent(s): aa1085f
Files changed (2) hide show
  1. app.py +244 -168
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,52 +1,26 @@
1
  """
2
  Gradio dashboard to explore Lighteval tasks.
3
 
4
- Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
5
- for module-level docstrings with this format:
6
-
7
- name: <task display name>
8
- dataset: <dataset id(s)>
9
- abstract: <free text>
10
- languages: <comma/newline separated language codes or names>
11
- tags: <comma/newline separated tags>
12
- paper: <url>
13
-
14
- This file stays outside the lighteval src tree, per request.
15
  """
16
 
17
- import ast
18
- import os
19
  import re
20
  from collections import Counter
21
- from git import Repo # pip install gitpython
22
- from dataclasses import dataclass
23
 
24
  import gradio as gr
 
25
 
26
 
27
- git_url = "https://github.com/huggingface/lighteval.git"
28
- repo_dir = "./lighteval"
29
-
30
- if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
31
- print(f"Pulling latest changes from {git_url}...")
32
- repo = Repo(repo_dir)
33
- repo.remotes.origin.pull()
34
- else:
35
- print(f"Cloning {git_url} to {repo_dir}...")
36
- Repo.clone_from(git_url, repo_dir)
37
-
38
-
39
- REPO_ROOT = "."
40
- TASK_DIRS = [
41
- os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
42
- os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
43
- ]
44
 
45
 
46
  star_benchmarks = [
47
  "aime",
48
  "mmlu_pro",
49
- "gpqa:diamond",
50
  "hle",
51
  "arc_agi_2",
52
  "ifbench",
@@ -62,7 +36,6 @@ star_benchmarks = [
62
 
63
  @dataclass
64
  class TaskDoc:
65
- file_path: str
66
  module: str
67
  abstract: str
68
  languages: list[str]
@@ -70,108 +43,59 @@ class TaskDoc:
70
  paper: str | None
71
  dataset: str | None
72
  name: str | None = None
 
73
 
74
 
75
- def read_file_text(path: str) -> str | None:
76
- try:
77
- with open(path, "r", encoding="utf-8") as f:
78
- return f.read()
79
- except Exception:
80
- return None
81
-
82
-
83
- def parse_module_docstring(text: str) -> str | None:
84
- try:
85
- mod = ast.parse(text)
86
- return ast.get_docstring(mod)
87
- except Exception:
88
- # Fallback: naive regex for triple-quoted string at top
89
- m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
90
- return m.group(2).strip() if m else None
91
-
92
-
93
def parse_sections(doc: str) -> dict[str, str]:
    """Split a task docstring into its known sections.

    A line consisting solely of ``name:``, ``dataset:``, ``abstract:``,
    ``languages:``, ``tags:`` or ``paper:`` (case-insensitive, surrounding
    whitespace ignored) opens that section. Following lines are appended to
    it, newline-joined, until the next header. Lines before the first header
    are dropped. Every known key is present in the result, defaulting to "".
    """
    section_names = ("name", "dataset", "abstract", "languages", "tags", "paper")
    headers = {"name:", "dataset:", "abstract:", "languages:", "tags:", "paper:"}
    sections: dict[str, str] = {key: "" for key in section_names}
    active: str | None = None

    for raw in doc.splitlines():
        trimmed = raw.rstrip()
        is_header = trimmed.endswith(":") and trimmed.strip().lower() in headers
        if is_header:
            active = trimmed[:-1].strip().lower()
        elif active is not None:
            existing = sections[active]
            joiner = "\n" if existing else ""
            # The strip after every append also drops trailing blank lines.
            sections[active] = (existing + joiner + trimmed).strip()
    return sections
107
-
108
-
109
def split_list_field(value: str) -> list[str]:
    """Split a comma- and/or newline-separated field into trimmed, non-empty tokens."""
    if not value:
        return []
    trimmed = (piece.strip() for piece in re.split(r"[\n,]", value))
    return [token for token in trimmed if token]
121
-
122
-
123
def discover_task_files() -> list[str]:
    """Collect candidate task source files under every directory in ``TASK_DIRS``.

    For each base directory that exists, this gathers the entries ending in
    ``.py`` listed directly in it, plus any ``main.py`` found in a nested
    directory. The distinct paths are returned in sorted order.
    """
    print(f"Discovering task files in: {TASK_DIRS}")
    found: list[str] = []
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Entries sitting directly in the base directory.
        found.extend(
            os.path.join(base, entry)
            for entry in os.listdir(base)
            if entry.endswith(".py")
        )
        # main.py files below the base; the base itself is skipped.
        found.extend(
            os.path.join(folder, "main.py")
            for folder, _subdirs, filenames in os.walk(base)
            if folder != base and "main.py" in filenames
        )
    # Sorting the distinct paths equals de-duplicating in order and then sorting.
    return sorted(set(found))
149
 
150
 
151
  def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
 
152
  docs: list[TaskDoc] = []
153
  language_counts: Counter = Counter()
154
  tag_set: set = set()
155
- for path in discover_task_files():
156
- text = read_file_text(path)
157
- if not text:
158
- continue
159
- doc = parse_module_docstring(text)
160
- if not doc:
161
- continue
162
- sections = parse_sections(doc)
163
- abstract = sections.get("abstract", "").strip()
164
- langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
165
- tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
166
- paper = sections.get("paper", "").strip() or None
167
- dataset = sections.get("dataset", "").strip() or None
168
- name = sections.get("name", "").strip() or None
 
 
 
 
 
 
 
169
  for lang in langs:
170
  language_counts[lang] += 1
171
  for t in tgs:
172
  tag_set.add(t)
173
- module = os.path.relpath(path, REPO_ROOT)
174
- docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
 
 
 
 
 
 
 
 
 
 
175
  languages_sorted = [
176
  lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
177
  ]
@@ -179,39 +103,30 @@ def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
179
  return docs, languages_sorted, tags_sorted
180
 
181
 
182
def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Build the task index by delegating to ``index_tasks`` and returning its result."""
    index = index_tasks()
    return index
184
-
185
-
186
- ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
187
- TOP_LANGS = ALL_LANGS[:8] # show more by default
188
 
189
 
190
  def normalize_name_for_matching(name: str) -> str:
191
- # Normalize for comparison: lowercase, remove underscores/spaces/colons
192
  return re.sub(r"[_\s:]+", "", name.lower())
193
 
194
 
195
  def is_starred_benchmark(td: TaskDoc) -> bool:
196
- # Check multiple possible identifiers
197
- parts = td.module.replace("\\", "/").split("/")
198
- base_no_ext = parts[-1].rsplit(".", 1)[0]
199
- fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
200
 
201
- # Normalize all possible identifiers
202
  task_name_raw = (td.name or "").lower().strip()
203
  task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
204
  normalized_task_display = normalize_name_for_matching(task_name_display)
205
  normalized_module = normalize_name_for_matching(base_no_ext)
206
  normalized_name = normalize_name_for_matching(task_name_raw)
207
-
208
- # Also check dataset if available
209
  normalized_dataset = normalize_name_for_matching(td.dataset or "")
210
 
211
- # Check against star_benchmarks list - try multiple matching strategies
212
  for star_name in star_benchmarks:
213
  normalized_star = normalize_name_for_matching(star_name)
214
- # Try exact match or substring match on various fields
215
  if (normalized_star == normalized_task_display or
216
  normalized_star == normalized_module or
217
  normalized_star == normalized_name or
@@ -225,27 +140,29 @@ def is_starred_benchmark(td: TaskDoc) -> bool:
225
 
226
 
227
  def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
 
228
  selected_langs = [lang.lower() for lang in (languages or [])]
229
  selected_tags = [t.lower() for t in (tags or [])]
230
  search_lc = (search or "").strip().lower()
231
  out: list[TaskDoc] = []
 
232
  for td in ALL_TASKS:
233
  if selected_langs and not any(lang in td.languages for lang in selected_langs):
234
  continue
235
  if selected_tags and not any(t in td.tags for t in selected_tags):
236
  continue
237
  if search_lc:
238
- # Search module path, abstract, tags, and dataset names
239
  hay = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
240
  if search_lc not in hay:
241
  continue
242
  out.append(td)
243
- # Sort: starred benchmarks first, then by name
244
  out.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
245
  return out
246
 
247
 
248
  def truncate_text(text: str, max_length: int = 250) -> str:
 
249
  if len(text) <= max_length:
250
  return text
251
  truncated = text[:max_length]
@@ -255,16 +172,55 @@ def truncate_text(text: str, max_length: int = 250) -> str:
255
  return truncated + "..."
256
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def render_cards(tasks: list[TaskDoc]) -> str:
259
- # Responsive grid of pretty cards; show all details without clicks
260
  items: list[str] = []
261
  for t in tasks:
262
- parts = t.module.replace("\\", "/").split("/")
263
- base_no_ext = parts[-1].rsplit(".", 1)[0]
264
- fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
 
265
  task_name = (t.name or fallback_name).replace("_", " ").title()
266
- mod_path = t.module.replace("\\", "/")
267
- mod_path = mod_path.split("/", 1)[1]
 
268
  source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
269
  paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
270
  tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
@@ -285,6 +241,31 @@ def render_cards(tasks: list[TaskDoc]) -> str:
285
  dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
286
  dataset_html = " ".join(dataset_links) if dataset_links else ""
287
  star_icon = "⭐ " if is_starred_benchmark(t) else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  items.append(
289
  f"""
290
  <article class="card" tabindex="0" aria-label="Task {task_name}">
@@ -292,6 +273,7 @@ def render_cards(tasks: list[TaskDoc]) -> str:
292
  {chips_tags_html}
293
  {chips_langs_html}
294
  <div class="abstract">{abstract_html}</div>
 
295
  <div class="links">{links_html}</div>
296
  </article>
297
  """
@@ -318,13 +300,12 @@ def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags:
318
 
319
 
320
  def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
321
- # Only toggle visibility; preserve current tag selections and keep them active in filtering
322
  tags_value: list[str] = selected_tags or []
323
  tasks = filter_tasks(languages, tags_value, search)
324
  count = len(tasks)
325
  total = len(ALL_TASKS)
326
  counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
327
- # keep selections when showing; when hiding we keep value but component hidden (so filter still uses them)
328
  return gr.update(visible=show, value=tags_value), counter_text, render_cards(tasks)
329
 
330
 
@@ -440,6 +421,82 @@ custom_css = """
440
  min-height: 48px;
441
  }
442
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  .links {
444
  margin-top: 12px;
445
  font-size: 12px;
@@ -551,6 +608,39 @@ custom_css = """
551
  .links {
552
  border-top-color: rgba(148, 163, 184, 0.3);
553
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
554
  }
555
 
556
  /* apply */
@@ -584,7 +674,6 @@ body {
584
  """
585
 
586
  with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
587
- # Header / hero
588
  with gr.Row():
589
  with gr.Column():
590
  gr.Markdown(
@@ -595,46 +684,33 @@ with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
595
  )
596
  task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
597
 
598
- # Controls and results in two columns (left: controls, right: cards)
599
  with gr.Row(equal_height=False):
600
  with gr.Column(scale=2):
601
  gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
602
- # Search with interactive debounce
603
- search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
604
- # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
605
- # Filters
606
  with gr.Group():
607
  gr.Markdown("**Languages**")
608
  show_all_langs = gr.Checkbox(label="Show all languages", value=False)
609
- lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[]) # default none selected
610
  with gr.Group():
611
  gr.Markdown("**Benchmark type**")
612
  show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
613
  tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
614
- # small hint
615
  gr.Markdown("Tip: use the filters and search together. Results update live.")
616
 
617
  with gr.Column(scale=5):
618
  cards = gr.HTML()
619
- # put an initially visible loading placeholder
620
  cards.value = "<div style='padding:18px'>Loading tasks…</div>"
621
 
622
- # Wire interactions
623
- # Toggle expand/collapse language choices
624
  show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
625
- # Toggle tag filter visibility (keeps values)
626
  show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
627
-
628
- # Live filtering: wire change events on controls to update cards.
629
- # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
630
  search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
631
  lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
632
  tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
633
 
634
- # Initial load: display all tasks (starred benchmarks first)
635
  initial_tasks = filter_tasks([], [], "")
636
  cards.value = render_cards(initial_tasks)
637
 
638
 
639
- # Run with `python benchmark_finder/app.py`
640
- demo.launch()
 
1
  """
2
  Gradio dashboard to explore Lighteval tasks.
3
 
4
+ Loads tasks from the lighteval Registry and displays them in a searchable,
5
+ filterable interface.
 
 
 
 
 
 
 
 
 
6
  """
7
 
 
 
8
  import re
9
  from collections import Counter
10
+ from dataclasses import dataclass, field
 
11
 
12
  import gradio as gr
13
+ from lighteval.tasks.registry import Registry
14
 
15
 
16
+ registry = Registry(custom_tasks=None, load_multilingual=True)
17
+ modules_data = registry.get_tasks_dump()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  star_benchmarks = [
21
  "aime",
22
  "mmlu_pro",
23
+ "gpqa",
24
  "hle",
25
  "arc_agi_2",
26
  "ifbench",
 
36
 
37
  @dataclass
38
  class TaskDoc:
 
39
  module: str
40
  abstract: str
41
  languages: list[str]
 
43
  paper: str | None
44
  dataset: str | None
45
  name: str | None = None
46
+ task_names: list[str] = field(default_factory=list)
47
 
48
 
49
+ def _module_to_github_path(module: str) -> str:
50
+ """Convert module path to GitHub source URL path."""
51
+ if module.startswith("lighteval."):
52
+ mod_path_parts = module[len("lighteval."):].split(".")
53
+ return "src/lighteval/" + "/".join(mod_path_parts) + ".py"
54
+ return "src/lighteval/" + module.replace(".", "/") + ".py"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
58
+ """Load tasks from registry and build index."""
59
  docs: list[TaskDoc] = []
60
  language_counts: Counter = Counter()
61
  tag_set: set = set()
62
+
63
+ for entry in modules_data:
64
+ docstring = entry.get("docstring", {})
65
+ module = entry.get("module", "")
66
+
67
+ # Extract fields from docstring
68
+ abstract = docstring.get("abstract", "").strip()
69
+ langs = [lang.lower() for lang in docstring.get("languages", [])]
70
+ tgs = [t.lower() for t in docstring.get("tags", [])]
71
+ paper = docstring.get("paper", "").strip() or None
72
+ name = docstring.get("name", "").strip() or None
73
+
74
+ # Convert dataset array to comma-separated string
75
+ dataset_list = docstring.get("dataset", [])
76
+ dataset = ", ".join(dataset_list) if dataset_list else None
77
+
78
+ # Extract task names from tasks array
79
+ tasks_list = entry.get("tasks", [])
80
+ task_names = [task.get("name", "") for task in tasks_list if task.get("name")]
81
+
82
+ # Update counters
83
  for lang in langs:
84
  language_counts[lang] += 1
85
  for t in tgs:
86
  tag_set.add(t)
87
+
88
+ docs.append(TaskDoc(
89
+ module=module,
90
+ abstract=abstract,
91
+ languages=langs,
92
+ tags=tgs,
93
+ paper=paper,
94
+ dataset=dataset,
95
+ name=name,
96
+ task_names=task_names
97
+ ))
98
+
99
  languages_sorted = [
100
  lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
101
  ]
 
103
  return docs, languages_sorted, tags_sorted
104
 
105
 
106
+ ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
107
+ TOP_LANGS = ALL_LANGS[:8]
 
 
 
 
108
 
109
 
110
def normalize_name_for_matching(name: str) -> str:
    """Return ``name`` lowercased with every underscore, whitespace, and colon removed."""
    lowered = name.lower()
    separators = re.compile(r"[_\s:]+")
    return separators.sub("", lowered)
113
 
114
 
115
  def is_starred_benchmark(td: TaskDoc) -> bool:
116
+ """Check if task is a starred benchmark."""
117
+ module_parts = td.module.split(".")
118
+ base_no_ext = module_parts[-1] if module_parts else ""
119
+ fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
120
 
 
121
  task_name_raw = (td.name or "").lower().strip()
122
  task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
123
  normalized_task_display = normalize_name_for_matching(task_name_display)
124
  normalized_module = normalize_name_for_matching(base_no_ext)
125
  normalized_name = normalize_name_for_matching(task_name_raw)
 
 
126
  normalized_dataset = normalize_name_for_matching(td.dataset or "")
127
 
 
128
  for star_name in star_benchmarks:
129
  normalized_star = normalize_name_for_matching(star_name)
 
130
  if (normalized_star == normalized_task_display or
131
  normalized_star == normalized_module or
132
  normalized_star == normalized_name or
 
140
 
141
 
142
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Return the tasks from ``ALL_TASKS`` that match the given filters.

    A task is kept when it shares at least one language with ``languages``
    (if any are given), shares at least one tag with ``tags`` (if any are
    given), and contains the search text (if non-empty) in its module,
    abstract, tags, or dataset string. All comparisons use lowercased filter
    values. Starred benchmarks come first, then tasks by name or module.
    """
    wanted_langs = [code.lower() for code in (languages or [])]
    wanted_tags = [label.lower() for label in (tags or [])]
    needle = (search or "").strip().lower()

    def _matches(doc: TaskDoc) -> bool:
        # An empty filter list means "no restriction" for that dimension.
        if wanted_langs and not any(code in doc.languages for code in wanted_langs):
            return False
        if wanted_tags and not any(label in doc.tags for label in wanted_tags):
            return False
        if not needle:
            return True
        haystack = " ".join([doc.module, doc.abstract, ", ".join(doc.tags), (doc.dataset or "")]).lower()
        return needle in haystack

    matches: list[TaskDoc] = [doc for doc in ALL_TASKS if _matches(doc)]
    # False sorts before True, so starred benchmarks lead the list.
    matches.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
    return matches
162
 
163
 
164
  def truncate_text(text: str, max_length: int = 250) -> str:
165
+ """Truncate text to max_length, breaking at word boundary if possible."""
166
  if len(text) <= max_length:
167
  return text
168
  truncated = text[:max_length]
 
172
  return truncated + "..."
173
 
174
 
175
def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Collapse task names that share a prefix (the part before the first colon).

    When two or more names share a prefix, only the prefix is shown, at the
    position of its first occurrence. A name whose prefix is unique, or which
    has no colon, is shown in full. Each label appears at most once, so a
    standalone name equal to a collapsed prefix (for example ``"mmlu"`` next
    to ``"mmlu:a"`` and ``"mmlu:b"``) is not repeated.

    Args:
        task_names: Task names in their original order.

    Returns:
        Display labels in order of first appearance, without duplicates.
    """
    # First pass: count how many colon-qualified names fall under each prefix.
    prefix_counts = Counter(
        name.split(":", 1)[0] for name in task_names if ":" in name
    )

    labels: list[str] = []
    seen: set[str] = set()
    # Second pass: emit one label per name, skipping labels already emitted.
    for name in task_names:
        label = name
        if ":" in name:
            prefix = name.split(":", 1)[0]
            if prefix_counts[prefix] > 1:
                label = prefix
        if label not in seen:
            seen.add(label)
            labels.append(label)
    return labels
210
+
211
+
212
  def render_cards(tasks: list[TaskDoc]) -> str:
213
+ """Render task cards as HTML."""
214
  items: list[str] = []
215
  for t in tasks:
216
+ # Get display name
217
+ module_parts = t.module.split(".")
218
+ base_no_ext = module_parts[-1] if module_parts else ""
219
+ fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
220
  task_name = (t.name or fallback_name).replace("_", " ").title()
221
+
222
+ # Build source link
223
+ mod_path = _module_to_github_path(t.module)
224
  source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
225
  paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
226
  tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
 
241
  dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
242
  dataset_html = " ".join(dataset_links) if dataset_links else ""
243
  star_icon = "⭐ " if is_starred_benchmark(t) else ""
244
+
245
+ # Display evaluation task names (max 3 visible, with dropdown for more)
246
+ # Group task names by prefix to collapse shared prefixes
247
+ task_names_html = ""
248
+ if t.task_names:
249
+ grouped_names = group_task_names_by_prefix(t.task_names)
250
+ visible_names = grouped_names[:3]
251
+ remaining_names = grouped_names[3:]
252
+ visible_html = " ".join([f'<span class="task-name">{name}</span>' for name in visible_names])
253
+
254
+ if remaining_names:
255
+ remaining_html = " ".join([f'<span class="task-name">{name}</span>' for name in remaining_names])
256
+ task_names_html = f'''
257
+ <div class="task-names">
258
+ <div class="task-names-label">Run using lighteval:</div>
259
+ <div class="task-names-list">{visible_html}</div>
260
+ <details class="task-names-details">
261
+ <summary class="task-names-summary">Show {len(remaining_names)} more</summary>
262
+ <div class="task-names-list task-names-remaining">{remaining_html}</div>
263
+ </details>
264
+ </div>
265
+ '''
266
+ else:
267
+ task_names_html = f'<div class="task-names"><div class="task-names-label">Run using lighteval:</div><div class="task-names-list">{visible_html}</div></div>'
268
+
269
  items.append(
270
  f"""
271
  <article class="card" tabindex="0" aria-label="Task {task_name}">
 
273
  {chips_tags_html}
274
  {chips_langs_html}
275
  <div class="abstract">{abstract_html}</div>
276
+ {task_names_html}
277
  <div class="links">{links_html}</div>
278
  </article>
279
  """
 
300
 
301
 
302
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkbox group without discarding the current selection.

    The selected tags are passed to the filter regardless of ``show``.
    Returns a tuple of the component update (visibility plus unchanged
    value), the counter markdown text, and the rendered cards HTML.
    """
    active_tags: list[str] = selected_tags or []
    matching = filter_tasks(languages, active_tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return gr.update(visible=show, value=active_tags), counter_text, render_cards(matching)
310
 
311
 
 
421
  min-height: 48px;
422
  }
423
 
424
+ .task-names {
425
+ margin-top: 10px;
426
+ padding-top: 8px;
427
+ border-top: 1px solid rgba(148, 163, 184, 0.15);
428
+ }
429
+
430
+ .task-names-label {
431
+ font-size: 11px;
432
+ font-weight: 600;
433
+ color: #64748b;
434
+ margin-bottom: 6px;
435
+ text-transform: uppercase;
436
+ letter-spacing: 0.5px;
437
+ }
438
+
439
+ .task-names-list {
440
+ display: flex;
441
+ flex-wrap: wrap;
442
+ gap: 6px;
443
+ }
444
+
445
+ .task-names-remaining {
446
+ margin-top: 8px;
447
+ padding-top: 8px;
448
+ border-top: 1px solid rgba(148, 163, 184, 0.15);
449
+ }
450
+
451
+ .task-names-details {
452
+ margin-top: 8px;
453
+ }
454
+
455
+ .task-names-summary {
456
+ font-size: 11px;
457
+ font-weight: 600;
458
+ color: #64748b;
459
+ cursor: pointer;
460
+ user-select: none;
461
+ padding: 4px 8px;
462
+ border-radius: 4px;
463
+ display: inline-block;
464
+ transition: all 0.2s ease;
465
+ background: rgba(148, 163, 184, 0.1);
466
+ }
467
+
468
+ .task-names-summary:hover {
469
+ background: rgba(148, 163, 184, 0.2);
470
+ color: #475569;
471
+ }
472
+
473
+ .task-names-summary::-webkit-details-marker {
474
+ display: none;
475
+ }
476
+
477
+ .task-names-details[open] .task-names-summary {
478
+ margin-bottom: 8px;
479
+ }
480
+
481
+ .task-name {
482
+ display: inline-block;
483
+ padding: 3px 8px;
484
+ border-radius: 6px;
485
+ font-size: 11px;
486
+ font-weight: 500;
487
+ background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
488
+ color: #92400e;
489
+ border: 1px solid rgba(146, 64, 14, 0.2);
490
+ font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
491
+ transition: all 0.2s ease;
492
+ }
493
+
494
+ .task-name:hover {
495
+ transform: translateY(-1px);
496
+ box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
497
+ background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
498
+ }
499
+
500
  .links {
501
  margin-top: 12px;
502
  font-size: 12px;
 
608
  .links {
609
  border-top-color: rgba(148, 163, 184, 0.3);
610
  }
611
+
612
+ .task-names {
613
+ border-top-color: rgba(148, 163, 184, 0.25);
614
+ }
615
+
616
+ .task-names-label {
617
+ color: #94a3b8;
618
+ }
619
+
620
+ .task-name {
621
+ background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
622
+ color: #fbbf24;
623
+ border-color: rgba(146, 64, 14, 0.3);
624
+ }
625
+
626
+ .task-name:hover {
627
+ background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
628
+ box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
629
+ }
630
+
631
+ .task-names-summary {
632
+ background: rgba(148, 163, 184, 0.15);
633
+ color: #94a3b8;
634
+ }
635
+
636
+ .task-names-summary:hover {
637
+ background: rgba(148, 163, 184, 0.25);
638
+ color: #cbd5e1;
639
+ }
640
+
641
+ .task-names-remaining {
642
+ border-top-color: rgba(148, 163, 184, 0.25);
643
+ }
644
  }
645
 
646
  /* apply */
 
674
  """
675
 
676
  with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
 
677
  with gr.Row():
678
  with gr.Column():
679
  gr.Markdown(
 
684
  )
685
  task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
686
 
 
687
  with gr.Row(equal_height=False):
688
  with gr.Column(scale=2):
689
  gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
690
+ search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="")
 
 
 
691
  with gr.Group():
692
  gr.Markdown("**Languages**")
693
  show_all_langs = gr.Checkbox(label="Show all languages", value=False)
694
+ lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
695
  with gr.Group():
696
  gr.Markdown("**Benchmark type**")
697
  show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
698
  tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
 
699
  gr.Markdown("Tip: use the filters and search together. Results update live.")
700
 
701
  with gr.Column(scale=5):
702
  cards = gr.HTML()
 
703
  cards.value = "<div style='padding:18px'>Loading tasks…</div>"
704
 
 
 
705
  show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
 
706
  show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
 
 
 
707
  search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
708
  lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
709
  tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
710
 
 
711
  initial_tasks = filter_tasks([], [], "")
712
  cards.value = render_cards(initial_tasks)
713
 
714
 
715
+ if __name__ == "__main__":
716
+ demo.launch()
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- git_python
 
 
1
+ git_python
2
+ git+https://github.com/huggingface/lighteval.git#egg=lighteval[dev]