import os
# Both variables are set before `transformers` is imported below.
# Presumably they silence TensorFlow's native logging and keep transformers
# from loading its TensorFlow backend -- confirm against the library docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TRANSFORMERS_NO_TF'] = '1'

from transformers import pipeline
import re

# Underthesea is an optional dependency: USE_UNDERTHESEA records whether the
# import succeeded, and a notice is printed when it did not.
try:
    from underthesea import word_tokenize
    USE_UNDERTHESEA = True
except ImportError:
    USE_UNDERTHESEA = False
    print("⚠️  Underthesea không được cài đặt, sẽ dùng chuẩn hóa cơ bản")

class SentimentProcessor:
    """Classify short sentences as POSITIVE, NEUTRAL or NEGATIVE.

    Wraps a Hugging Face ``sentiment-analysis`` pipeline with a small
    preprocessing step (length limit, whitespace cleanup, abbreviation
    expansion, optional Underthesea tokenization) and normalizes the
    pipeline's label into one of three sentiment names.
    """

    # Inputs are cut to this many characters before classification.
    MAX_TEXT_LENGTH = 50
    # Inputs whose stripped length is below this are rejected.
    MIN_TEXT_LENGTH = 5
    # Predictions scored below this fall back to NEUTRAL.
    MIN_CONFIDENCE = 0.5

    # Generic index labels, following the LABEL_0 = negative /
    # LABEL_2 = positive convention this class has always assumed.
    _INDEXED_LABELS = {
        'LABEL_0': 'NEGATIVE',
        'LABEL_1': 'NEUTRAL',
        'LABEL_2': 'POSITIVE',
    }

    def __init__(self):
        """Load the sentiment pipeline and build the abbreviation table.

        Raises:
            Exception: whatever ``pipeline(...)`` raises while loading the
                model; the error is printed and then re-raised unchanged.
        """
        print("\n🔄 Đang tải model Transformer...")
        try:
            # Multilingual XLM-RoBERTa checkpoint already fine-tuned for
            # sentiment classification.
            self.classifier = pipeline(
                "sentiment-analysis",
                model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
                device=-1  # CPU
            )
            print("✅ Đã tải model XLM-RoBERTa Multilingual Sentiment")
        except Exception as e:
            print(f"❌ Lỗi khi tải model: {e}")
            raise

        # Common chat abbreviations mapped to their full spelling.
        self.normalization_dict = {
            'rat': 'rất',
            'k': 'không',
            'ko': 'không',
            'hok': 'không',
            'dc': 'được',
            'dk': 'được',
            'ms': 'mới',
            'mk': 'mình',
            'mik': 'mình',
            'cx': 'cũng',
            'j': 'gì',
            'vs': 'với',
            'tks': 'cảm ơn',
            'thank': 'cảm ơn',
        }

    def preprocess_text(self, text):
        """Clean a raw sentence before it is sent to the classifier.

        Steps: validate, collapse whitespace, truncate to
        ``MAX_TEXT_LENGTH`` characters, lowercase, expand abbreviations
        found in ``normalization_dict`` and, when Underthesea is
        available, run its word tokenizer.

        Args:
            text: the raw sentence.

        Returns:
            The normalized, lowercased sentence.

        Raises:
            ValueError: if ``text`` is not a string, is empty, or has
                fewer than ``MIN_TEXT_LENGTH`` characters once stripped.
        """
        if not isinstance(text, str) or len(text.strip()) < self.MIN_TEXT_LENGTH:
            raise ValueError("Câu quá ngắn hoặc rỗng")

        # Collapse whitespace BEFORE truncating so padding does not eat
        # into the character budget (heavily padded input used to pass
        # validation and then be truncated down to almost nothing).
        text = ' '.join(text.split())
        text = text[:self.MAX_TEXT_LENGTH]

        # Expand abbreviations token by token; unknown tokens pass through.
        # The second split also drops a trailing space left by truncation.
        tokens = text.lower().split()
        text = ' '.join(self.normalization_dict.get(tok, tok) for tok in tokens)

        if USE_UNDERTHESEA:
            try:
                text = word_tokenize(text, format="text")
            except Exception:
                # Tokenization is best-effort: keep the normalized text.
                pass

        return text

    @staticmethod
    def _map_label(raw_label):
        """Map a pipeline label to 'POSITIVE', 'NEUTRAL' or 'NEGATIVE'.

        Recognizes, in order: exact ``LABEL_0/1/2`` names, labels that
        contain ``POS`` or ``NEG``, and labels starting with a star digit
        ``1``-``5`` (1-2 negative, 3 neutral, 4-5 positive). Anything else
        is treated as NEUTRAL.
        """
        label = str(raw_label).strip().upper()

        # Exact lookup first: substring digit checks would otherwise read
        # the "1" in LABEL_1 as a one-star (negative) rating.
        indexed = SentimentProcessor._INDEXED_LABELS.get(label)
        if indexed is not None:
            return indexed

        if 'POS' in label:
            return 'POSITIVE'
        if 'NEG' in label:
            return 'NEGATIVE'

        stars = re.match(r'([1-5])(?!\d)', label)
        if stars:
            rating = int(stars.group(1))
            if rating >= 4:
                return 'POSITIVE'
            if rating <= 2:
                return 'NEGATIVE'

        return 'NEUTRAL'

    def classify_sentiment(self, text):
        """Classify the sentiment of ``text``.

        Args:
            text: the raw sentence.

        Returns:
            A dict with ``'text'`` (the original input), ``'sentiment'``
            (``'POSITIVE'``, ``'NEUTRAL'`` or ``'NEGATIVE'``) and
            ``'score'`` (the pipeline score rounded to two decimals).
            Predictions scored below ``MIN_CONFIDENCE`` are reported as
            ``'NEUTRAL'``.

        Raises:
            ValueError: propagated unchanged (e.g. input too short).
            Exception: any other failure, wrapped with a descriptive
                message and chained to the original error.
        """
        try:
            processed_text = self.preprocess_text(text)

            result = self.classifier(processed_text)[0]
            score = result['score']
            sentiment = self._map_label(result['label'])

            # Low-confidence predictions default to NEUTRAL.
            if score < self.MIN_CONFIDENCE:
                sentiment = 'NEUTRAL'

            return {
                'text': text,
                'sentiment': sentiment,
                'score': round(score, 2)
            }

        except ValueError:
            raise
        except Exception as e:
            raise Exception(f"Lỗi khi phân loại: {str(e)}") from e

# Module-wide cache for the single shared SentimentProcessor (None until first use).
_processor = None

def get_processor():
    """Return the shared SentimentProcessor, creating it on the first call."""
    global _processor
    if _processor is not None:
        return _processor
    _processor = SentimentProcessor()
    return _processor