| | from ast import literal_eval |
| |
|
def make_lang_list(row):
    """Return the languages stored in ``row["languages"]`` as a Python object.

    The sentinel string ``"none"`` stands for "no languages" and yields an
    empty list. Any other value is treated as the textual form of a Python
    literal and parsed with ``ast.literal_eval``.
    """
    raw = row["languages"]
    return [] if raw == "none" else literal_eval(raw)
| |
|
def language_count(row):
    """Return the number of entries in ``row["languages"]`` (its ``len``)."""
    languages = row["languages"]
    return len(languages)
| |
|
def process_for_lang(data, modality):
    """Prepare a model table for per-language statistics.

    Optionally restricts the rows to one modality, parses the ``languages``
    column into Python lists via ``make_lang_list``, and adds the
    ``language_count`` and ``multilingual`` columns.

    Args:
        data: pandas-style DataFrame with a ``languages`` column (and a
            ``modality`` column when a modality filter is requested). The
            caller's frame is not modified; all work happens on a copy.
        modality: ``"NLP"``, ``"Audio"`` or ``"Multimodal"`` keeps only rows
            whose ``modality`` is ``"nlp"``, ``"audio"`` or ``"multimodal"``
            respectively. Any other value keeps every row.

    Returns:
        A 4-tuple ``(data, no_lang_count, total_langs, unique_langs)``:
        the processed copy, the number of rows without language information,
        the number of distinct language tags, and the distinct tags.
    """
    if modality == "NLP":
        data = data[data["modality"] == "nlp"]
    elif modality == "Audio":
        data = data[data["modality"] == "audio"]
    elif modality == "Multimodal":
        data = data[data["modality"] == "multimodal"]

    # Work on an independent copy. Without it the assignments below either
    # write into a slice of the caller's frame (SettingWithCopy warning) or,
    # when no filter matched, mutate the caller's frame directly -- which
    # makes a second call on the same frame fail, because ``languages`` would
    # already hold parsed lists instead of strings.
    data = data.copy()

    # Normalise the "no language information" markers to missing values.
    data.loc[data["languages"] == "False", "languages"] = None
    data.loc[data["languages"] == {}, "languages"] = None

    # Count rows without languages before the gaps are filled in below.
    no_lang_count = data["languages"].isna().sum()

    # Replace missing values with the "none" sentinel understood by
    # make_lang_list, then parse every row into a list.
    # result_type="reduce" keeps the row-wise apply result a Series even when
    # the frame has no rows (the default would hand back a DataFrame, which
    # cannot be assigned to a single column).
    data["languages"] = data["languages"].fillna("none")
    data["languages"] = data.apply(make_lang_list, axis=1, result_type="reduce")
    data["language_count"] = data.apply(
        language_count, axis=1, result_type="reduce"
    )

    # Flatten the per-model lists into one Series of tags and drop empty-dict
    # entries before collecting the distinct tags.
    models_with_langs = data[data["language_count"] > 0]
    langs = models_with_langs["languages"].explode()
    langs = langs[langs != {}]
    unique_langs = langs.unique()
    total_langs = len(unique_langs)

    # 1 when the model carries the explicit "multilingual" tag, else 0.
    data["multilingual"] = data.apply(
        lambda row: int("multilingual" in row["languages"]),
        axis=1,
        result_type="reduce",
    )

    return data, no_lang_count, total_langs, unique_langs
| |
|
def filter_multilinguality(data, linguality):
    """Select rows of ``data`` according to the ``linguality`` option.

    * ``"Just Multilingual"``: rows where ``multilingual`` equals 1 or
      ``language_count`` is greater than 1.
    * ``"Three or more languages"``: rows where ``language_count`` is at
      least 3.
    * any other value: ``data`` is returned unchanged.
    """
    if linguality == "Three or more languages":
        return data[data["language_count"] >= 3]

    if linguality != "Just Multilingual":
        return data

    # Either the explicit tag or more than one listed language qualifies.
    keep = (data["multilingual"] == 1) | (data["language_count"] > 1)
    return data[keep]