Spaces:

librarian-bots
/

metadata_explorer

Sleeping

App Files Files Community

davanstrien HF Staff committed on Jan 20, 2023

Commit

52415b9

1 Parent(s): 63242ee

Upload 2 files

Browse files

Files changed (2) hide show

app.py +50 -9
data.parquet +2 -2

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ import numpy as np
 from itertools import combinations
 from toolz import unique
 import pandas as pd
 pd.options.plotting.backend = "plotly"
@@ -37,13 +38,22 @@ def prep_dataset():
     df['languages'] = df['languages'].apply(_clean_tags)
     df['datasets'] = df['datasets'].apply(_clean_tags)
     df['tags'] = df['tags'].apply(_clean_tags)
     df = df.drop(columns=['Unnamed: 0'])
     df.to_parquet("data.parquet")
     return df
 def load_data():
-    return pd.read_parquet("data.parquet")
 def filter_df_by_library(filter='transformers'):
@@ -64,6 +74,7 @@ def get_all_tags():
     tags = df['tags'].to_list()
     return list(concat(tags))
 @lru_cache()
 def get_case_sensitive_duplicate_tags():
     tags = get_all_tags()
@@ -78,6 +89,15 @@ def get_case_sensitive_duplicate_tags():
 def display_case_sensitive_duplicate_tags():
     return pd.DataFrame(get_case_sensitive_duplicate_tags())
 def tag_frequency(case_sensitive=True):
     tags = get_all_tags()
     if not case_sensitive:
@@ -85,15 +105,16 @@ def tag_frequency(case_sensitive=True):
     tags_frequencies = dict(frequencies(tags))
     df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
         by='Count', ascending=False)
-    return df
-def plot_frequency(filter):
-    df = filter_df_by_library(filter)
     tags = concat(df['tags'])
     tags = dict(frequencies(tags))
     df = pd.DataFrame.from_dict(tags, orient='index', columns=['Count']).sort_values(
         by='Count', ascending=False)
-    return df
 def has_model_card_by_library(top_n):
@@ -116,10 +137,15 @@ def model_card_length_by_library(top_n):
     # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
     # return df.to_markdown()
 df = load_data()
 top_n = df.library.value_counts().shape[0]
 with gr.Blocks() as demo:
     gr.Markdown("# 🤗 Hub Metadata Explorer")
     gr.Markdown("Some explanation")
@@ -128,12 +154,27 @@ with gr.Blocks() as demo:
         with gr.Row():
             gr.Markdown("thsh")
         with gr.Row():
-            case_sensitive = gr.Checkbox(False,label=)
-            gr.Plot(tag_frequency())
         with gr.Row():
             gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
-            with gr.Accordion("View duplicate tags", open=False):
                 gr.Dataframe(display_case_sensitive_duplicate_tags())
     with gr.Tab("Model Cards"):
         gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
         information created by a human i.e. outlining the goals behind the creation of the model and information

 from itertools import combinations
 from toolz import unique
 import pandas as pd
+from pathlib import Path
 pd.options.plotting.backend = "plotly"
     df['languages'] = df['languages'].apply(_clean_tags)
     df['datasets'] = df['datasets'].apply(_clean_tags)
     df['tags'] = df['tags'].apply(_clean_tags)
+    df['has_languages'] = df.languages.apply(len) > 0
+    df['has_tags'] = df.tags.apply(len) > 0
+    df['has_dataset'] = df.datasets.apply(len) > 0
+    df['has_co2'] = df.co2.isnull()
+    df['has_co2'] = df.co2.apply(lambda x: x is not None)
     df = df.drop(columns=['Unnamed: 0'])
     df.to_parquet("data.parquet")
     return df
 def load_data():
+    return (
+        pd.read_parquet("data.parquet")
+        if Path('data.parquet').exists()
+        else prep_dataset()
+    )
 def filter_df_by_library(filter='transformers'):
     tags = df['tags'].to_list()
     return list(concat(tags))
 @lru_cache()
 def get_case_sensitive_duplicate_tags():
     tags = get_all_tags()
 def display_case_sensitive_duplicate_tags():
     return pd.DataFrame(get_case_sensitive_duplicate_tags())
+def get_number_of_tags(case_sensitive=True):
+    tags = set(get_all_tags())
+    if case_sensitive:
+        return f"Total number of case sensitive tags: {len(tags)}"
+    tags = {tag.lower() for tag in tags}
+    return f"Total number of case insensitive tags: {len(tags)}"
 def tag_frequency(case_sensitive=True):
     tags = get_all_tags()
     if not case_sensitive:
     tags_frequencies = dict(frequencies(tags))
     df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
         by='Count', ascending=False)
+    return df.reset_index()
+def tag_frequency_by_library(library_filter):
+    df = filter_df_by_library(library_filter)
     tags = concat(df['tags'])
     tags = dict(frequencies(tags))
     df = pd.DataFrame.from_dict(tags, orient='index', columns=['Count']).sort_values(
         by='Count', ascending=False)
+    return df.reset_index()
 def has_model_card_by_library(top_n):
     # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
     # return df.to_markdown()
+def metadata_coverage_by_library(metadata_field):
+    df = load_data()
+    return df.groupby('library')[metadata_field].mean().sort_values().plot.barh()
 df = load_data()
 top_n = df.library.value_counts().shape[0]
+libraries = [library for library in df.library.unique() if library]
+metadata_coverage_columns = [c for c in df.columns if c.startswith("has")]
 with gr.Blocks() as demo:
     gr.Markdown("# 🤗 Hub Metadata Explorer")
     gr.Markdown("Some explanation")
         with gr.Row():
             gr.Markdown("thsh")
         with gr.Row():
+            case_sensitive = gr.Checkbox(True, label="Case sensitive", )
+            mk = gr.Markdown()
+            case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
+        with gr.Accordion("Tag Frequencies", open=False):
+            df = gr.Dataframe()
+            case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
         with gr.Row():
             gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
+        with gr.Row():
+            with gr.Accordion("View case sensitive tag pairs", open=False):
                 gr.Dataframe(display_case_sensitive_duplicate_tags())
+    with gr.Tab("Tags frequencies by library"):
+        gr.Markdown("Tags by library")
+        library_choice = gr.Dropdown(choices=libraries, label="select library")
+        df = gr.Dataframe()
+        library_choice.change(tag_frequency_by_library, [library_choice], df, queue=False)
+    with gr.Tab("Tag health by library"):
+        metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
+        plot = gr.Plot()
+        metadata_field.change(metadata_coverage_by_library, [metadata_field], plot, queue=False)
     with gr.Tab("Model Cards"):
         gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
         information created by a human i.e. outlining the goals behind the creation of the model and information

data.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32f51e54683c1fa3390bc4e318e1008d686844bb451b82c3c1a91787e2b986d9
-size 3765676

 version https://git-lfs.github.com/spec/v1
+oid sha256:c1c592d270fa95ba0309a163e8300e1eab3378d6fefa60ac7ef7accb6772e9e6
+size 3802125