davanstrien HF Staff committed on
Commit
52415b9
·
1 Parent(s): 63242ee

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +50 -9
  2. data.parquet +2 -2
app.py CHANGED
@@ -10,6 +10,7 @@ import numpy as np
10
  from itertools import combinations
11
  from toolz import unique
12
  import pandas as pd
 
13
 
14
  pd.options.plotting.backend = "plotly"
15
 
@@ -37,13 +38,22 @@ def prep_dataset():
37
  df['languages'] = df['languages'].apply(_clean_tags)
38
  df['datasets'] = df['datasets'].apply(_clean_tags)
39
  df['tags'] = df['tags'].apply(_clean_tags)
 
 
 
 
 
40
  df = df.drop(columns=['Unnamed: 0'])
41
  df.to_parquet("data.parquet")
42
  return df
43
 
44
 
45
  def load_data():
46
- return pd.read_parquet("data.parquet")
 
 
 
 
47
 
48
 
49
  def filter_df_by_library(filter='transformers'):
@@ -64,6 +74,7 @@ def get_all_tags():
64
  tags = df['tags'].to_list()
65
  return list(concat(tags))
66
 
 
67
  @lru_cache()
68
  def get_case_sensitive_duplicate_tags():
69
  tags = get_all_tags()
@@ -78,6 +89,15 @@ def get_case_sensitive_duplicate_tags():
78
  def display_case_sensitive_duplicate_tags():
79
  return pd.DataFrame(get_case_sensitive_duplicate_tags())
80
 
 
 
 
 
 
 
 
 
 
81
  def tag_frequency(case_sensitive=True):
82
  tags = get_all_tags()
83
  if not case_sensitive:
@@ -85,15 +105,16 @@ def tag_frequency(case_sensitive=True):
85
  tags_frequencies = dict(frequencies(tags))
86
  df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
87
  by='Count', ascending=False)
88
- return df
 
89
 
90
- def plot_frequency(filter):
91
- df = filter_df_by_library(filter)
92
  tags = concat(df['tags'])
93
  tags = dict(frequencies(tags))
94
  df = pd.DataFrame.from_dict(tags, orient='index', columns=['Count']).sort_values(
95
  by='Count', ascending=False)
96
- return df
97
 
98
 
99
  def has_model_card_by_library(top_n):
@@ -116,10 +137,15 @@ def model_card_length_by_library(top_n):
116
  # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
117
  # return df.to_markdown()
118
 
 
 
 
 
119
 
120
  df = load_data()
121
  top_n = df.library.value_counts().shape[0]
122
-
 
123
  with gr.Blocks() as demo:
124
  gr.Markdown("# 🤗 Hub Metadata Explorer")
125
  gr.Markdown("Some explanation")
@@ -128,12 +154,27 @@ with gr.Blocks() as demo:
128
  with gr.Row():
129
  gr.Markdown("thsh")
130
  with gr.Row():
131
- case_sensitive = gr.Checkbox(False,label=)
132
- gr.Plot(tag_frequency())
 
 
 
 
133
  with gr.Row():
134
  gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
135
- with gr.Accordion("View duplicate tags", open=False):
 
136
  gr.Dataframe(display_case_sensitive_duplicate_tags())
 
 
 
 
 
 
 
 
 
 
137
  with gr.Tab("Model Cards"):
138
  gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
139
  information created by a human i.e. outlining the goals behind the creation of the model and information
 
10
  from itertools import combinations
11
  from toolz import unique
12
  import pandas as pd
13
+ from pathlib import Path
14
 
15
  pd.options.plotting.backend = "plotly"
16
 
 
38
  df['languages'] = df['languages'].apply(_clean_tags)
39
  df['datasets'] = df['datasets'].apply(_clean_tags)
40
  df['tags'] = df['tags'].apply(_clean_tags)
41
+ df['has_languages'] = df.languages.apply(len) > 0
42
+ df['has_tags'] = df.tags.apply(len) > 0
43
+ df['has_dataset'] = df.datasets.apply(len) > 0
44
+ df['has_co2'] = df.co2.isnull()
45
+ df['has_co2'] = df.co2.apply(lambda x: x is not None)
46
  df = df.drop(columns=['Unnamed: 0'])
47
  df.to_parquet("data.parquet")
48
  return df
49
 
50
 
51
  def load_data():
52
+ return (
53
+ pd.read_parquet("data.parquet")
54
+ if Path('data.parquet').exists()
55
+ else prep_dataset()
56
+ )
57
 
58
 
59
  def filter_df_by_library(filter='transformers'):
 
74
  tags = df['tags'].to_list()
75
  return list(concat(tags))
76
 
77
+
78
  @lru_cache()
79
  def get_case_sensitive_duplicate_tags():
80
  tags = get_all_tags()
 
89
  def display_case_sensitive_duplicate_tags():
90
  return pd.DataFrame(get_case_sensitive_duplicate_tags())
91
 
92
+
93
def get_number_of_tags(case_sensitive=True):
    """Return a sentence stating how many distinct tags exist.

    Tags are taken from ``get_all_tags()`` and de-duplicated first.

    Args:
        case_sensitive: When true, tags that differ only by letter case are
            counted separately. When false, every tag is lower-cased before
            counting, so such tags collapse into one.

    Returns:
        A display string containing the count.
    """
    unique_tags = set(get_all_tags())
    if not case_sensitive:
        # Fold case after de-duplication; the set comprehension removes the
        # collisions that lower-casing introduces.
        folded_tags = {tag.lower() for tag in unique_tags}
        return f"Total number of case insensitive tags: {len(folded_tags)}"
    return f"Total number of case sensitive tags: {len(unique_tags)}"
99
+
100
+
101
  def tag_frequency(case_sensitive=True):
102
  tags = get_all_tags()
103
  if not case_sensitive:
 
105
  tags_frequencies = dict(frequencies(tags))
106
  df = pd.DataFrame.from_dict(tags_frequencies, orient='index', columns=['Count']).sort_values(
107
  by='Count', ascending=False)
108
+ return df.reset_index()
109
+
110
 
111
def tag_frequency_by_library(library_filter):
    """Build a tag-frequency table for the rows selected by a library filter.

    Args:
        library_filter: Value forwarded unchanged to
            ``filter_df_by_library`` to select the rows to count over.

    Returns:
        A DataFrame with one row per tag and a ``Count`` column, ordered by
        ``Count`` descending. The index is reset, so the tag labels end up in
        a regular column rather than in the index.
    """
    library_df = filter_df_by_library(library_filter)
    # NOTE(review): `concat` / `frequencies` are presumably toolz helpers
    # (flatten the per-row tag lists, then count occurrences) - their import
    # is not visible in this hunk; confirm.
    counts_by_tag = dict(frequencies(concat(library_df['tags'])))
    table = pd.DataFrame.from_dict(counts_by_tag, orient='index', columns=['Count'])
    ranked = table.sort_values(by='Count', ascending=False)
    return ranked.reset_index()
118
 
119
 
120
  def has_model_card_by_library(top_n):
 
137
  # df['library'] = df.library.apply(lambda library: f"[{library}](https://huggingface.co/models?library={library})")
138
  # return df.to_markdown()
139
 
140
def metadata_coverage_by_library(metadata_field):
    """Plot the per-library mean of a metadata column as horizontal bars.

    The dataset is obtained through ``load_data()`` on every call.

    Args:
        metadata_field: Name of the column to average within each
            ``library`` group. For the boolean ``has_*`` columns the mean is
            the fraction of rows in which the flag is true.

    Returns:
        The object produced by ``.plot.barh()`` for the ascending-sorted
        means. NOTE(review): with the pandas plotting backend set to
        "plotly" at module level this is presumably a plotly figure -
        confirm.
    """
    data = load_data()
    per_library_mean = data.groupby('library')[metadata_field].mean()
    ordered = per_library_mean.sort_values()
    return ordered.plot.barh()
143
+
144
 
145
  df = load_data()
146
  top_n = df.library.value_counts().shape[0]
147
+ libraries = [library for library in df.library.unique() if library]
148
+ metadata_coverage_columns = [c for c in df.columns if c.startswith("has")]
149
  with gr.Blocks() as demo:
150
  gr.Markdown("# 🤗 Hub Metadata Explorer")
151
  gr.Markdown("Some explanation")
 
154
  with gr.Row():
155
  gr.Markdown("thsh")
156
  with gr.Row():
157
+ case_sensitive = gr.Checkbox(True, label="Case sensitive", )
158
+ mk = gr.Markdown()
159
+ case_sensitive.change(get_number_of_tags, [case_sensitive], mk, queue=False)
160
+ with gr.Accordion("Tag Frequencies", open=False):
161
+ df = gr.Dataframe()
162
+ case_sensitive.change(tag_frequency, [case_sensitive], df, queue=False)
163
  with gr.Row():
164
  gr.Markdown(f"Number of tags which are case sensitive {len(get_case_sensitive_duplicate_tags())}")
165
+ with gr.Row():
166
+ with gr.Accordion("View case sensitive tag pairs", open=False):
167
  gr.Dataframe(display_case_sensitive_duplicate_tags())
168
+ with gr.Tab("Tags frequencies by library"):
169
+ gr.Markdown("Tags by library")
170
+ library_choice = gr.Dropdown(choices=libraries, label="select library")
171
+ df = gr.Dataframe()
172
+ library_choice.change(tag_frequency_by_library, [library_choice], df, queue=False)
173
+ with gr.Tab("Tag health by library"):
174
+ metadata_field = gr.Dropdown(choices=metadata_coverage_columns)
175
+ plot = gr.Plot()
176
+ metadata_field.change(metadata_coverage_by_library, [metadata_field], plot, queue=False)
177
+
178
  with gr.Tab("Model Cards"):
179
  gr.Markdown("""Model cards are a key component of metadata for a model. Model cards can include both
180
  information created by a human i.e. outlining the goals behind the creation of the model and information
data.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32f51e54683c1fa3390bc4e318e1008d686844bb451b82c3c1a91787e2b986d9
3
- size 3765676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c592d270fa95ba0309a163e8300e1eab3378d6fefa60ac7ef7accb6772e9e6
3
+ size 3802125