yangzhitao committed
Commit 6c930b9 · Parent: c842956

refactor: reorganize schema definitions and enhance data loading functions for improved clarity and maintainability

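In short, the commit moves the Pydantic schema classes out of src/prepare.py into a new src/schemas package and splits the snapshot downloads into dedicated helpers. A minimal sketch of the resulting import surface (the call sites below are hypothetical; module paths and names are taken from the diffs that follow):

```python
# Sketch only: how downstream code can use the relocated schemas after this commit.
from src.prepare import load_display_toml, load_meta_toml, prepare_space
from src.schemas.display_toml import DisplayToml
from src.schemas.meta_toml import MetaToml, MetaToml_Benchmark

prepare_space()                              # download queue/results once, then warm the TOML caches
meta: MetaToml = load_meta_toml()            # parsed once per process via lru_cache(maxsize=1)
display: DisplayToml = load_display_toml()
# "vsi_bench" is an assumed key; any benchmark key present in meta.toml works here.
bench: MetaToml_Benchmark = meta.benchmark_key_to_benchmark["vsi_bench"]
```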
src/about.py CHANGED
@@ -7,56 +7,11 @@ from loguru import logger
 from src.prepare import load_display_toml, load_meta_toml, prepare_space
 
 if typing.TYPE_CHECKING:
-    from src.prepare import MetaToml_Benchmark
+    from src.schemas.meta_toml import MetaToml_Benchmark
 
 prepare_space()
 
 
-# class _Task(BaseModel):
-#     benchmark: Annotated[str, Field(description="The benchmark name")]
-#     metric: Annotated[str, Field(description="The metric name")]
-#     col_name: Annotated[str, Field(description="The column name")]
-
-
-# Select your tasks here
-# ---------------------------------------------------
-# class _Tasks(Enum):
-#     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-
-#     # acc
-#     task1_1 = _Task(benchmark="MindCube", metric="acc", col_name="MindCube(acc)")
-#     task2_1 = _Task(benchmark="MMSI", metric="acc", col_name="MMSI(acc)")
-#     task3_1 = _Task(benchmark="Omni", metric="acc", col_name="Omni(acc)")
-#     task4_1 = _Task(benchmark="Core", metric="acc", col_name="Core(acc)")
-#     task5_1 = _Task(benchmark="SpatialViz", metric="acc", col_name="SpatialViz(acc)")
-#     task6_1 = _Task(benchmark="STARE", metric="acc", col_name="STARE(acc)")
-#     task7_1 = _Task(benchmark="SITEBench", metric="acc", col_name="SITEBench(acc)")
-#     task8_1 = _Task(benchmark="VSI (MCQ)", metric="acc", col_name="VSI (MCQ)(acc)")
-
-#     # caa
-#     task1_2 = _Task(benchmark="MindCube", metric="caa", col_name="MindCube(caa)")
-#     task2_2 = _Task(benchmark="MMSI", metric="caa", col_name="MMSI(caa)")
-#     task3_2 = _Task(benchmark="Omni", metric="caa", col_name="Omni(caa)")
-#     task4_2 = _Task(benchmark="Core", metric="caa", col_name="Core(caa)")
-#     task5_2 = _Task(benchmark="SpatialViz", metric="caa", col_name="SpatialViz(caa)")
-#     task6_2 = _Task(benchmark="STARE", metric="caa", col_name="STARE(caa)")
-#     task7_2 = _Task(benchmark="SITEBench", metric="caa", col_name="SITEBench(caa)")
-#     task8_2 = _Task(benchmark="VSI (MCQ)", metric="caa", col_name="VSI (MCQ)(caa)")
-
-#     # rand
-#     task1_3 = _Task(benchmark="MindCube", metric="rand", col_name="MindCube(rand)")
-#     task2_3 = _Task(benchmark="MMSI", metric="rand", col_name="MMSI(rand)")
-#     task3_3 = _Task(benchmark="Omni", metric="rand", col_name="Omni(rand)")
-#     task4_3 = _Task(benchmark="Core", metric="rand", col_name="Core(rand)")
-#     task5_3 = _Task(benchmark="SpatialViz", metric="rand", col_name="SpatialViz(rand)")
-#     task6_3 = _Task(benchmark="STARE", metric="rand", col_name="STARE(rand)")
-#     task7_3 = _Task(benchmark="SITEBench", metric="rand", col_name="SITEBench(rand)")
-#     task8_3 = _Task(benchmark="VSI (MCQ)", metric="rand", col_name="VSI (MCQ)(rand)")
-
-
-# BENCHMARKS = {m.value.benchmark for m in Tasks}
-# METRICS = {m.value.metric for m in Tasks}
-# COL_NAMES = {m.value.col_name for m in Tasks}
 @lru_cache(maxsize=1)
 def get_benchmarks() -> list["MetaToml_Benchmark"]:
     meta_toml = load_meta_toml()
@@ -81,22 +36,22 @@ NUM_FEWSHOT = 0 # Change with your few shot
 TITLE = """<h1 align="center" id="space-title">EASI Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
+INTRODUCTION_TEXT = dedent("""
 **EASI: Holistic Evaluation of Multimodal LLMs on Spatial Intelligence**
 
 EASI conceptualizes a comprehensive taxonomy of spatial tasks that unifies existing benchmarks and a standardized protocol for the fair evaluation of state-of-the-art proprietary and open-source models.
-"""
+""")
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = """
+LLM_BENCHMARKS_TEXT = dedent("""
 ## Leaderboard
 
 You can find the documentation of EASI here: [EvolvingLMMs-Lab/EASI](https://github.com/EvolvingLMMs-Lab/EASI).
 
 And the dataset for this leaderboard: [lmms-lab-si/EASI-Leaderboard-Data](https://huggingface.co/datasets/lmms-lab-si/EASI-Leaderboard-Data)
-"""
+""")
 
-EVALUATION_QUEUE_TEXT = """
+EVALUATION_QUEUE_TEXT = dedent("""
 ## Some good practices before submitting an evaluation with EASI
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
@@ -124,7 +79,7 @@ When we add extra information about models to the leaderboard, it will be automa
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
+""")
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = dedent("""
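The only change to the text constants in src/about.py is wrapping them in dedent(...), matching how CITATION_BUTTON_TEXT was already defined. A small, self-contained illustration of what that buys (textwrap.dedent is the stdlib helper the module presumably imports):

```python
from textwrap import dedent

# dedent() strips the whitespace prefix shared by all lines, so triple-quoted
# markdown stays clean even if the constant is ever defined inside an indented block.
INTRO = dedent("""
    **EASI: Holistic Evaluation of Multimodal LLMs on Spatial Intelligence**

    EASI conceptualizes a comprehensive taxonomy of spatial tasks ...
""")
print(INTRO)  # lines come out flush-left, with no stray four-space indentation
```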
src/prepare.py CHANGED
@@ -1,15 +1,14 @@
 import os
 import sys
-from functools import cached_property, lru_cache
+from functools import lru_cache
 from pathlib import Path
-from typing import Annotated
 
 from huggingface_hub import snapshot_download
 from loguru import logger
-from pydantic import BaseModel, ConfigDict, Field
-from typing_extensions import Self
 
 from src.envs import API, settings
+from src.schemas.display_toml import DisplayToml
+from src.schemas.meta_toml import MetaToml
 
 if sys.version_info < (3, 11):
     from tomli import load as toml_load
@@ -22,132 +21,49 @@ PREPARED_FLAG: bool = os.getenv("NO_DOWNLOAD", 0) == 1
 def prepare_space():
     """Space initialisation"""
 
-    def _restart_space():
-        API.restart_space(repo_id=settings.REPO_ID)
-
     global PREPARED_FLAG
     if not PREPARED_FLAG:
-        try:
-            snapshot_download(
-                repo_id=settings.QUEUE_REPO_ID,
-                local_dir=settings.EVAL_REQUESTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                token=settings.HF_TOKEN.get_secret_value(),
-            )
-        except Exception as e:
-            logger.error(f"Error downloading eval queue: {e!s}")
-            _restart_space()
-        try:
-            snapshot_download(
-                repo_id=settings.RESULTS_REPO_ID,
-                local_dir=settings.EVAL_RESULTS_PATH,
-                repo_type="dataset",
-                tqdm_class=None,
-                etag_timeout=30,
-                allow_patterns=["leaderboard/*.toml", "leaderboard/**/*.json"],
-                token=settings.HF_TOKEN.get_secret_value(),
-            )
-        except Exception as e:
-            logger.error(f"Error downloading eval queue: {e!s}")
-            _restart_space()
+        download_results()
+        download_queue()
         PREPARED_FLAG = True
 
     load_meta_toml()
     load_display_toml()
 
 
-class MetaToml(BaseModel):
-    model_config = ConfigDict(extra="allow", frozen=True)
-
-    models: list["MetaToml_Model"]
-
-    @cached_property
-    def model_key_to_model(self) -> dict[str, "MetaToml_Model"]:
-        return {model.key: model for model in self.models}
-
-    @cached_property
-    def model_title_to_model(self) -> dict[str, "MetaToml_Model"]:
-        """Model title (lower case) to model mapping"""
-        return {model.title.lower(): model for model in self.models}
-
-    benchmarks: list["MetaToml_Benchmark"]
-
-    @cached_property
-    def benchmark_key_to_benchmark(self) -> dict[str, "MetaToml_Benchmark"]:
-        return {benchmark.key: benchmark for benchmark in self.benchmarks}
-
-    model_repos: list["MetaToml_ModelRepo"]
-
-    @cached_property
-    def model_key_to_repo(self) -> dict[str, "MetaToml_ModelRepo"]:
-        return {repo.key: repo for repo in self.model_repos}
-
-    # --- Helper properties ---
-    @cached_property
-    def model_title_to_repo(self) -> dict[str, "MetaToml_ModelRepo"]:
-        """Model title (lower case) to model repo mapping"""
-        mapping = {}
-        for model in self.models:
-            title = model.title.lower()
-            key = model.key
-            repo = self.model_key_to_repo.get(key)
-            if repo:
-                mapping[title] = repo
-        return mapping
-
-    @cached_property
-    def model_title_to_key(self) -> dict[str, str]:
-        return {model.title.lower(): model.key for model in self.models}
-
-    @cached_property
-    def benchmark_title_to_key(self) -> dict[str, str]:
-        return {benchmark.title.lower(): benchmark.key for benchmark in self.benchmarks}
-
-    @cached_property
-    def model_key_to_repo_id(self) -> dict[str, str]:
-        return {model.key: model.repo_id for model in self.model_repos if model.repo_id is not None}
-
-
-class _HashableComparableMixin(BaseModel):
-    model_config = ConfigDict(extra="allow", frozen=True)
-
-    key: str
-    title: str
-
-    def __hash__(self) -> int:
-        return hash(self.key)
-
-    def __eq__(self, other: Self) -> bool:
-        return (self.key, self.title) == (other.key, other.title)
-
-    def __lt__(self, other: Self) -> bool:
-        return (self.key, self.title) < (other.key, other.title)
-
-    def __gt__(self, other: Self) -> bool:
-        return (self.key, self.title) > (other.key, other.title)
-
-    def __le__(self, other: Self) -> bool:
-        return (self.key, self.title) <= (other.key, other.title)
-
-    def __ge__(self, other: Self) -> bool:
-        return (self.key, self.title) >= (other.key, other.title)
-
-
-class MetaToml_Benchmark(_HashableComparableMixin):
-    disabled: bool = False
-
-
-class MetaToml_Model(_HashableComparableMixin): ...
-
-
-class MetaToml_ModelRepo(BaseModel):
-    model_config = ConfigDict(extra="allow", frozen=True)
-
-    key: str
-    repo_id: str | None = None
-    link: str | None = None
+def _restart_space():
+    API.restart_space(repo_id=settings.REPO_ID)
+
+
+def download_results():
+    try:
+        snapshot_download(
+            repo_id=settings.QUEUE_REPO_ID,
+            local_dir=settings.EVAL_REQUESTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            token=settings.HF_TOKEN.get_secret_value(),
+        )
+    except Exception as e:
+        logger.error(f"Error downloading eval queue: {e!s}")
+        _restart_space()
+
+
+def download_queue():
+    try:
+        snapshot_download(
+            repo_id=settings.RESULTS_REPO_ID,
+            local_dir=settings.EVAL_RESULTS_PATH,
+            repo_type="dataset",
+            tqdm_class=None,
+            etag_timeout=30,
+            allow_patterns=["leaderboard/*.toml", "leaderboard/**/*.json"],
+            token=settings.HF_TOKEN.get_secret_value(),
+        )
+    except Exception as e:
+        logger.error(f"Error downloading eval queue: {e!s}")
+        _restart_space()
+
 
 
 @lru_cache(maxsize=1)
@@ -162,25 +78,6 @@ def load_meta_toml() -> MetaToml:
     return meta_toml
 
 
-class DisplayToml(BaseModel):
-    model_config = ConfigDict(extra="allow", frozen=True)
-
-    version: Annotated[str, Field(..., description="The version of the results.")]
-    benchmarks_order: Annotated[
-        list[str],
-        Field(
-            default_factory=lambda: [
-                "vsi_bench",
-                "mmsi_bench",
-                "mindcube_tiny",
-                "viewspatial",
-                "site",
-            ],
-            description="The predefined order of the benchmarks.",
-        ),
-    ]
-
-
 @lru_cache(maxsize=1)
 def load_display_toml() -> DisplayToml:
     display_toml_path = Path(settings.EVAL_RESULTS_PATH) / "leaderboard" / "display.toml"
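Aside from splitting the two snapshot_download calls into download_results() and download_queue(), src/prepare.py keeps its cached TOML loaders. For reference, a minimal sketch of that loader pattern under the same Python-version gate (the file path and return type below are placeholders, not the module's real ones):

```python
import sys
from functools import lru_cache
from pathlib import Path

# Python 3.11+ ships tomllib; older interpreters fall back to the tomli backport,
# mirroring the version gate kept in src/prepare.py.
if sys.version_info < (3, 11):
    from tomli import load as toml_load
else:
    from tomllib import load as toml_load


@lru_cache(maxsize=1)
def load_some_toml(path: str = "leaderboard/meta.toml") -> dict:
    # maxsize=1 means the file is read and parsed once per process;
    # subsequent calls return the same cached object.
    with Path(path).open("rb") as f:
        return toml_load(f)
```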
src/schemas/display_toml.py ADDED
@@ -0,0 +1,22 @@
+from typing import Annotated
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class DisplayToml(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    version: Annotated[str, Field(..., description="The version of the results.")]
+    benchmarks_order: Annotated[
+        list[str],
+        Field(
+            default_factory=lambda: [
+                "vsi_bench",
+                "mmsi_bench",
+                "mindcube_tiny",
+                "viewspatial",
+                "site",
+            ],
+            description="The predefined order of the benchmarks.",
+        ),
+    ]
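A hypothetical use of the new DisplayToml model (not part of the commit), showing the default_factory fallback for benchmarks_order and the frozen config:

```python
from src.schemas.display_toml import DisplayToml

cfg = DisplayToml(version="v1")  # benchmarks_order omitted -> default_factory list is used
print(cfg.benchmarks_order)
# ['vsi_bench', 'mmsi_bench', 'mindcube_tiny', 'viewspatial', 'site']

# extra="allow" keeps unknown keys from display.toml; frozen=True makes instances
# immutable, so a later `cfg.version = "v2"` would raise a ValidationError.
cfg2 = DisplayToml(version="v1", benchmarks_order=["site", "vsi_bench"], theme="dark")
```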
src/schemas/__init__.py ADDED
File without changes
src/schemas/meta_toml.py ADDED
@@ -0,0 +1,96 @@
+from functools import cached_property
+
+from pydantic import BaseModel, ConfigDict
+from typing_extensions import Self
+
+
+class MetaToml(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    models: list["MetaToml_Model"]
+
+    @cached_property
+    def model_key_to_model(self) -> dict[str, "MetaToml_Model"]:
+        return {model.key: model for model in self.models}
+
+    @cached_property
+    def model_title_to_model(self) -> dict[str, "MetaToml_Model"]:
+        """Model title (lower case) to model mapping"""
+        return {model.title.lower(): model for model in self.models}
+
+    benchmarks: list["MetaToml_Benchmark"]
+
+    @cached_property
+    def benchmark_key_to_benchmark(self) -> dict[str, "MetaToml_Benchmark"]:
+        return {benchmark.key: benchmark for benchmark in self.benchmarks}
+
+    model_repos: list["MetaToml_ModelRepo"]
+
+    @cached_property
+    def model_key_to_repo(self) -> dict[str, "MetaToml_ModelRepo"]:
+        return {repo.key: repo for repo in self.model_repos}
+
+    # --- Helper properties ---
+    @cached_property
+    def model_title_to_repo(self) -> dict[str, "MetaToml_ModelRepo"]:
+        """Model title (lower case) to model repo mapping"""
+        mapping = {}
+        for model in self.models:
+            title = model.title.lower()
+            key = model.key
+            repo = self.model_key_to_repo.get(key)
+            if repo:
+                mapping[title] = repo
+        return mapping
+
+    @cached_property
+    def model_title_to_key(self) -> dict[str, str]:
+        return {model.title.lower(): model.key for model in self.models}
+
+    @cached_property
+    def benchmark_title_to_key(self) -> dict[str, str]:
+        return {benchmark.title.lower(): benchmark.key for benchmark in self.benchmarks}
+
+    @cached_property
+    def model_key_to_repo_id(self) -> dict[str, str]:
+        return {model.key: model.repo_id for model in self.model_repos if model.repo_id is not None}
+
+
+class _HashableComparableMixin(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    key: str
+    title: str
+
+    def __hash__(self) -> int:
+        return hash(self.key)
+
+    def __eq__(self, other: Self) -> bool:
+        return (self.key, self.title) == (other.key, other.title)
+
+    def __lt__(self, other: Self) -> bool:
+        return (self.key, self.title) < (other.key, other.title)
+
+    def __gt__(self, other: Self) -> bool:
+        return (self.key, self.title) > (other.key, other.title)
+
+    def __le__(self, other: Self) -> bool:
+        return (self.key, self.title) <= (other.key, other.title)
+
+    def __ge__(self, other: Self) -> bool:
+        return (self.key, self.title) >= (other.key, other.title)
+
+
+class MetaToml_Benchmark(_HashableComparableMixin):
+    disabled: bool = False
+
+
+class MetaToml_Model(_HashableComparableMixin): ...
+
+
+class MetaToml_ModelRepo(BaseModel):
+    model_config = ConfigDict(extra="allow", frozen=True)
+
+    key: str
+    repo_id: str | None = None
+    link: str | None = None
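And a hypothetical use of the relocated meta-schema classes (again, not part of the commit; keys, titles, and the repo id are made up), illustrating the lazily built lookup tables and the hashable/comparable mixin:

```python
from src.schemas.meta_toml import MetaToml, MetaToml_Benchmark

# Nested dicts are validated into MetaToml_Model / MetaToml_Benchmark / MetaToml_ModelRepo.
meta = MetaToml(
    models=[{"key": "gpt_4o", "title": "GPT-4o"}],
    benchmarks=[{"key": "vsi_bench", "title": "VSI (MCQ)"}],
    model_repos=[{"key": "gpt_4o", "repo_id": "openai/gpt-4o"}],
)
assert meta.benchmark_key_to_benchmark["vsi_bench"].title == "VSI (MCQ)"  # cached_property table
assert meta.model_title_to_key["gpt-4o"] == "gpt_4o"                      # titles are lower-cased
assert meta.model_key_to_repo_id == {"gpt_4o": "openai/gpt-4o"}

# _HashableComparableMixin hashes on `key` and orders on (key, title),
# so benchmark objects can be used as dict keys and sorted deterministically.
b = MetaToml_Benchmark(key="vsi_bench", title="VSI (MCQ)")
assert hash(b) == hash("vsi_bench") and not b.disabled
```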