Spaces:

manhteky123
/

seminar-chuyen-de

Sleeping

seminar-chuyen-de / app /nlp_processor.py

manhteky123

Initial commit: Vietnamese Sentiment Analysis App with Flask and Transformers

244aaa1 24 days ago

4.53 kB

	import os
	os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
	os.environ['TRANSFORMERS_NO_TF'] = '1'

	from transformers import pipeline
	import re

	# Underthesea is an optional dependency (the assignment brief makes it
	# optional). USE_UNDERTHESEA records whether its word_tokenize could be
	# imported; when it could not, a warning is printed at import time.
	try:
	    from underthesea import word_tokenize
	except ImportError:
	    USE_UNDERTHESEA = False
	    print("⚠️ Underthesea không được cài đặt, sẽ dùng chuẩn hóa cơ bản")
	else:
	    USE_UNDERTHESEA = True

	class SentimentProcessor:
	    """Sentiment classifier wrapping a Hugging Face ``sentiment-analysis`` pipeline.

	    The processor normalizes short (mostly Vietnamese) input text, runs it
	    through the pipeline, and maps the predicted label onto one of
	    ``POSITIVE``, ``NEUTRAL`` or ``NEGATIVE``.
	    """

	    # Inputs whose stripped length is below this many characters are rejected.
	    MIN_LENGTH = 5
	    # Inputs are cut to this many characters after whitespace is collapsed.
	    MAX_LENGTH = 50
	    # Predictions scoring below this are reported as NEUTRAL.
	    MIN_CONFIDENCE = 0.5

	    # Generic label ids, matched exactly. LABEL_0 / LABEL_2 keep the meaning
	    # the previous substring mapping gave them; LABEL_1 is taken to be the
	    # middle (neutral) class -- NOTE(review): confirm against the model config
	    # if a model with generic ids is ever used.
	    _GENERIC_LABELS = {
	        'LABEL_0': 'NEGATIVE',
	        'LABEL_1': 'NEUTRAL',
	        'LABEL_2': 'POSITIVE',
	    }

	    def __init__(self):
	        """Load the sentiment pipeline and build the abbreviation table.

	        Raises:
	            Exception: whatever ``pipeline(...)`` raised while loading the
	                model; the error is printed and then re-raised unchanged.
	        """
	        print("\n🔄 Đang tải model Transformer...")
	        try:
	            # Multilingual XLM-RoBERTa checkpoint already fine-tuned for
	            # sentiment; device=-1 keeps inference on the CPU.
	            self.classifier = pipeline(
	                "sentiment-analysis",
	                model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
	                device=-1,
	            )
	            print("✅ Đã tải model XLM-RoBERTa Multilingual Sentiment")
	        except Exception as e:
	            print(f"❌ Lỗi khi tải model: {e}")
	            raise

	        # Informal abbreviation -> canonical word, applied per
	        # whitespace-separated token by preprocess_text().
	        self.normalization_dict = {
	            'rat': 'rất',
	            'k': 'không',
	            'ko': 'không',
	            'hok': 'không',
	            'dc': 'được',
	            'dk': 'được',
	            'ms': 'mới',
	            'mk': 'mình',
	            'mik': 'mình',
	            'cx': 'cũng',
	            'j': 'gì',
	            'vs': 'với',
	            'tks': 'cảm ơn',
	            'thank': 'cảm ơn',
	        }

	    @classmethod
	    def _normalize_label(cls, raw_label):
	        """Map a raw pipeline label onto POSITIVE / NEUTRAL / NEGATIVE.

	        Recognized forms, in order: exact generic ids (``LABEL_0..2``),
	        names containing ``POS`` / ``NEG``, and labels that start with a
	        digit 1-5 (star ratings such as ``"4 stars"``). Anything else is
	        NEUTRAL.
	        """
	        label = str(raw_label).strip().upper()

	        # Exact match first: a substring test on digits would treat
	        # LABEL_1 as a "1 star" rating and report it as NEGATIVE.
	        if label in cls._GENERIC_LABELS:
	            return cls._GENERIC_LABELS[label]

	        if 'POS' in label:
	            return 'POSITIVE'
	        if 'NEG' in label:
	            return 'NEGATIVE'

	        # Star-rating style labels: 4-5 positive, 1-2 negative, 3 neutral.
	        if label and label[0] in '12345':
	            stars = int(label[0])
	            if stars >= 4:
	                return 'POSITIVE'
	            if stars <= 2:
	                return 'NEGATIVE'

	        return 'NEUTRAL'

	    def preprocess_text(self, text):
	        """Normalize raw input text before classification.

	        Steps: validate the minimum length, collapse whitespace, cap the
	        length at ``MAX_LENGTH`` characters, lowercase, expand known
	        abbreviations, and (when Underthesea is available) word-tokenize.

	        Args:
	            text: the raw input string.

	        Returns:
	            The normalized text.

	        Raises:
	            ValueError: if ``text`` is empty or shorter than ``MIN_LENGTH``
	                characters once surrounding whitespace is stripped.
	        """
	        if not text or len(text.strip()) < self.MIN_LENGTH:
	            raise ValueError("Câu quá ngắn hoặc rỗng")

	        # Collapse whitespace BEFORE applying the length cap so that padding
	        # and repeated spaces do not eat into the character budget.
	        text = ' '.join(text.split())
	        if len(text) > self.MAX_LENGTH:
	            text = text[:self.MAX_LENGTH]

	        # Lowercase and expand abbreviations token by token; split/join also
	        # drops a trailing space the cut above may have left behind.
	        expanded = [
	            self.normalization_dict.get(word, word)
	            for word in text.lower().split()
	        ]
	        text = ' '.join(expanded)

	        if USE_UNDERTHESEA:
	            try:
	                text = word_tokenize(text, format="text")
	            except Exception:
	                # Tokenization is best-effort: keep the untokenized text.
	                # (Not a bare except, so Ctrl-C / SystemExit still propagate.)
	                pass

	        return text

	    def classify_sentiment(self, text):
	        """Classify the sentiment of ``text``.

	        Args:
	            text: the raw input string.

	        Returns:
	            A dict with ``text`` (the original input), ``sentiment``
	            (``POSITIVE`` / ``NEUTRAL`` / ``NEGATIVE``) and ``score``
	            (the pipeline score rounded to two decimals).

	        Raises:
	            ValueError: propagated unchanged, e.g. for too-short input.
	            Exception: any other failure, wrapped with a descriptive
	                message and chained to the original error.
	        """
	        try:
	            processed_text = self.preprocess_text(text)

	            # The pipeline returns a list of predictions; use the first.
	            result = self.classifier(processed_text)[0]
	            score = result['score']
	            sentiment = self._normalize_label(result['label'])

	            # Low-confidence predictions fall back to NEUTRAL.
	            if score < self.MIN_CONFIDENCE:
	                sentiment = 'NEUTRAL'

	            return {
	                'text': text,
	                'sentiment': sentiment,
	                'score': round(score, 2),
	            }

	        except ValueError:
	            raise
	        except Exception as e:
	            raise Exception(f"Lỗi khi phân loại: {str(e)}") from e

	# Module-level cache holding the single shared SentimentProcessor; it stays
	# None until get_processor() is called for the first time.
	_processor = None

	def get_processor():
	    """Return the shared SentimentProcessor, creating it on first use."""
	    global _processor
	    if _processor is not None:
	        return _processor
	    _processor = SentimentProcessor()
	    return _processor