Spaces:
Sleeping
Sleeping
File size: 7,933 Bytes
4cd4e6d 9daa7e5 dab38fa 4cd4e6d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import re, os, tempfile, math
import gradio as gr
import pandas as pd
import torch
print(f"Using torch version: {torch.__version__}")
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from functools import lru_cache
from transformers import pipeline
import emoji
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0 # deterministic language detection
# ----------------------
# Models and pipelines
# ----------------------
# Display name -> Hugging Face model id for the sentiment dropdown.
# The label set each checkpoint emits is noted inline.
SENTIMENT_CHOICES = {
"SST-2 (binary, reviews)": "distilbert-base-uncased-finetuned-sst-2-english", # POSITIVE/NEGATIVE
"Twitter (EN, 3-class)": "cardiffnlp/twitter-roberta-base-sentiment-latest", # NEGATIVE/NEUTRAL/POSITIVE
"Twitter (Multilingual, 3-class)": "cardiffnlp/twitter-xlm-roberta-base-sentiment"
}
# 6-way emotion classifier used when the UI mode is "Emotion".
EMOTION_MODEL = "bhadresh-savani/distilbert-base-uncased-emotion" # sadness/joy/love/anger/fear/surprise
@lru_cache(maxsize=8)
def get_pipe(task, model_id):
    """Return a cached transformers pipeline for (task, model_id).

    The tokenizer is always loaded from the same checkpoint as the model.
    The previous conditional (`tokenizer=model_id if "cardiffnlp" in
    model_id else None`) was redundant: passing tokenizer=None makes
    `pipeline` fall back to the model's own tokenizer anyway.
    """
    # NOTE(review): no device/device_map is passed, so this runs wherever
    # transformers defaults to (CPU unless configured otherwise) — confirm
    # if GPU use must be guaranteed in the hosting environment.
    return pipeline(task, model=model_id, tokenizer=model_id)
# ----------------------
# Text utilities
# ----------------------
# Patterns for social-media noise, compiled once at import time.
URL_RE = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE = re.compile(r"@\w+")
HASHTAG_RE = re.compile(r"#(\w+)")
def clean_line(s, demojize=True, strip_social=True, lower=False):
    """Normalize one chat message.

    demojize     -- rewrite emojis as :name: tokens (uses the `emoji` pkg)
    strip_social -- drop URLs and @mentions; keep hashtag words minus '#'
    lower        -- lowercase the final result
    """
    if demojize:
        s = emoji.demojize(s, language='en')
    if strip_social:
        for noise in (URL_RE, MENTION_RE):
            s = noise.sub("", s)
        # '#hashtag' -> 'hashtag': the word itself often carries sentiment.
        s = HASHTAG_RE.sub(r"\1", s)
    s = s.strip()
    return s.lower() if lower else s
def detect_langs(lines, probe=30):
    """Probe the language mix of a message list on a small sample.

    Runs langdetect on up to `probe` non-empty lines and returns
    (counts, share_en): counts maps ISO language code -> hits, with
    "unk" for lines langdetect cannot classify, and share_en is the
    fraction of probed lines detected as English (0.0 when nothing
    was probed — the `or 1` guard avoids division by zero).
    """
    sample = [l for l in lines if l.strip()][:probe]
    counts = {}
    for s in sample:
        try:
            code = detect(s)
        # Was a bare `except:` — that also swallowed SystemExit and
        # KeyboardInterrupt. Narrow to Exception (langdetect raises
        # LangDetectException on too-short/ambiguous text).
        except Exception:
            code = "unk"
        counts[code] = counts.get(code, 0) + 1
    total = sum(counts.values()) or 1
    share_en = counts.get("en", 0) / total
    return counts, share_en
# ----------------------
# Core analyzer
# ----------------------
def run_analysis(
    text_block, file_obj, text_col, mode, sentiment_model_choice, auto_model, demojize_opt,
    strip_social_opt, lower_opt, batch_size
):
    """Classify sentiment or emotion over pasted text and/or an uploaded CSV.

    Returns a 4-tuple matching the Gradio outputs:
      (per-message DataFrame, matplotlib Figure, CSV file path (or gr.update),
       language-probe summary string).
    Errors are reported as a one-row {"error": ...} DataFrame instead of
    raising, so the UI always receives well-formed outputs.
    """
    # ---- Collect input lines from the textbox and/or uploaded CSV ----
    lines = []
    if text_block:
        lines.extend([l.rstrip() for l in text_block.splitlines() if l.strip()])
    if file_obj is not None:
        try:
            df_in = pd.read_csv(file_obj.name)
            use_col = text_col if (text_col and text_col in df_in.columns) else None
            if not use_col:
                # Naive auto-pick of a likely text column.
                for c in ["text", "message", "msg", "content", "body"]:
                    if c in df_in.columns:
                        use_col = c
                        break
            if not use_col:
                return (pd.DataFrame([{"error": "CSV loaded, but no text column selected/found."}]),
                        plt.figure(), gr.update(value=None), "No language info")
            # astype(str) already yields strings; the old extra
            # `str(x) for x in ...` pass was redundant.
            lines.extend(df_in[use_col].astype(str).tolist())
        except Exception as e:
            return (pd.DataFrame([{"error": f"Failed to read CSV: {e}"}]),
                    plt.figure(), gr.update(value=None), "No language info")
    if not lines:
        return (pd.DataFrame([{"error": "Enter text or upload CSV with a text column."}]),
                plt.figure(), gr.update(value=None), "No language info")
    # ---- Preprocess (classification runs on the cleaned text) ----
    proc = [clean_line(l, demojize=demojize_opt, strip_social=strip_social_opt, lower=lower_opt) for l in lines]
    # ---- Language probe; may drive the EN vs multilingual model choice ----
    lang_counts, share_en = detect_langs(proc, probe=min(30, len(proc)))
    lang_info = f"Lang probe (top): {dict(sorted(lang_counts.items(), key=lambda x: -x[1])[:3])}, EN share≈{round(share_en,2)}"
    # ---- Choose pipeline ----
    if mode == "Sentiment":
        if auto_model:
            # >= 60% English -> tweet-aware EN model, else multilingual.
            model_id = SENTIMENT_CHOICES["Twitter (EN, 3-class)"] if share_en >= 0.6 else SENTIMENT_CHOICES["Twitter (Multilingual, 3-class)"]
        else:
            model_id = SENTIMENT_CHOICES[sentiment_model_choice]
        pipe = get_pipe("sentiment-analysis", model_id)
    else:
        pipe = get_pipe("text-classification", EMOTION_MODEL)
    # ---- Batched inference ----
    # Gradio sliders can deliver floats; range() needs an int step.
    batch_size = max(1, int(batch_size))
    outputs = []
    for i in range(0, len(proc), batch_size):
        batch = proc[i:i + batch_size]
        outs = pipe(batch, batch_size=batch_size, truncation=True)
        # Normalize pipeline output (dict or [dict]) to flat label/score dicts.
        for out in outs:
            out0 = out[0] if isinstance(out, list) else out
            outputs.append({"label": out0["label"], "score": float(out0["score"])})
    # ---- Results table (shows the ORIGINAL text, not the cleaned form) ----
    rows = [
        {"idx": idx, "text": raw, "label": out["label"], "score": round(out["score"], 4)}
        for idx, (raw, out) in enumerate(zip(lines, outputs), 1)
    ]
    df = pd.DataFrame(rows)
    # ---- Label-distribution bar chart ----
    counts = df["label"].value_counts().sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(6.5, 3.2))
    counts.plot(kind="bar", ax=ax, color="#4C78A8")
    ax.set_title("Label Distribution")
    ax.set_xlabel("Label")
    ax.set_ylabel("Count")
    plt.tight_layout()
    # ---- Export CSV for the download widget ----
    # Close the handle before writing via pandas: leaving it open leaked the
    # descriptor and blocks re-opening the path on Windows.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
    tmp.close()
    df.to_csv(tmp.name, index=False)
    return df, fig, tmp.name, lang_info
# ----------------------
# UI
# ----------------------
# Gradio UI: left column collects input and options, right column shows results.
with gr.Blocks(title="Chat Mood Analyzer — Ultimate") as demo:
    gr.Markdown("## Chat Mood Analyzer — Ultimate Edition")
    with gr.Row():
        with gr.Column():
            # Input sources: free-form textbox and/or a single CSV upload.
            txt = gr.Textbox(lines=10, label="Paste chat (one message per line)")
            file_in = gr.File(label="Or upload CSV", file_types=[".csv"], file_count="single")
            # Blank -> run_analysis auto-detects a text-like column.
            text_col = gr.Textbox(value="", label="CSV text column (auto-detect if blank)")
            mode = gr.Radio(["Sentiment", "Emotion"], value="Sentiment", label="Analysis mode")
            with gr.Accordion("Sentiment model settings", open=False):
                # When checked, the language probe picks EN vs multilingual;
                # the dropdown below is only consulted when this is OFF.
                auto_model = gr.Checkbox(value=True, label="Auto-pick tweet-aware EN vs Multilingual")
                sentiment_model = gr.Dropdown(
                    choices=list(SENTIMENT_CHOICES.keys()),
                    value="Twitter (EN, 3-class)",
                    label="Manual model (used if Auto is OFF)"
                )
                gr.Markdown("Tip: Twitter models understand slang/emojis better than SST‑2 review models.")
            with gr.Accordion("Preprocessing", open=False):
                # These map 1:1 onto clean_line()'s keyword arguments.
                demojize_opt = gr.Checkbox(value=True, label="Convert emojis to text (:face_with_tears_of_joy:)")
                strip_social_opt = gr.Checkbox(value=True, label="Strip URLs/@mentions/#hashtags")
                lower_opt = gr.Checkbox(value=False, label="Lowercase text")
            batch_size = gr.Slider(1, 64, value=16, step=1, label="Batch size")
            run = gr.Button("Analyze", variant="primary")
            clear = gr.ClearButton([txt, file_in])
        with gr.Column():
            # Outputs: results table, label histogram, CSV download, probe text.
            out_table = gr.Dataframe(label="Per-message results", wrap=True)
            out_plot = gr.Plot(label="Label distribution")
            download = gr.File(label="Download results (.csv)")
            lang_probe = gr.Markdown()
    # Wire the button to the analyzer; input order must match
    # run_analysis's parameter order, outputs match its return tuple.
    evt = run.click(
        fn=run_analysis,
        inputs=[txt, file_in, text_col, mode, sentiment_model, auto_model,
                demojize_opt, strip_social_opt, lower_opt, batch_size],
        outputs=[out_table, out_plot, download, lang_probe],
        concurrency_limit=4,
        show_progress=True
    )
# queue() caps concurrent jobs; share=True exposes a public Gradio link.
demo.queue(max_size=64, default_concurrency_limit=2)
demo.launch(share=True)
|