import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time

print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")

# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")

# --- Load Model and Tokenizer ---
model = None
tokenizer = None
model_load_error = None

try:
    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
    if DEVICE == "cuda":
        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
        # Note: BitsAndBytesConfig only exposes a compute dtype for 4-bit loading
        # (bnb_4bit_compute_dtype); a "bnb_8bit_compute_dtype" kwarg is not a
        # recognized option, so it has been dropped. load_in_8bit=True is enough here.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto",  # Let accelerate handle device mapping for GPU
            trust_remote_code=True
        )
        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
    else:  # CPU
        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # Use float32 for CPU for broader compatibility
            trust_remote_code=True,
            low_cpu_mem_usage=True  # Helpful for larger models on CPU
        )
        # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
        model.to(DEVICE)
        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")

    model.eval()
    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}")  # Useful if available
except Exception as e:
    model_load_error = str(e)
    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
    import traceback
    traceback.print_exc()
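
# --- Optional diagnostics (editor sketch, safe to remove) ---
# Assumptions, not part of the original script: the 8-bit GPU path above needs
# the accelerate and bitsandbytes packages installed (the CPU path does not),
# and get_memory_footprint() is the standard transformers PreTrainedModel
# helper; it is guarded with hasattr() in case the loaded class lacks it.
if model is not None and hasattr(model, "get_memory_footprint"):
    print(f"[{time.time()}] Approx. model memory footprint: {model.get_memory_footprint() / 1e6:.1f} MB")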

# --- Chat Function (remains the same as your previous version) ---
def generate_chat_response(message, history):
    print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
    if model_load_error or not model or not tokenizer:
        error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
        print(f"[{time.time()}] {error_msg}")
        return error_msg

    prompt_parts = []
    for user_msg, bot_msg in history:
        prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
    prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
    prompt = "\n".join(prompt_parts)

    try:
        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
        # Move inputs to the model's device if not using device_map="auto" or if it's explicitly CPU
        if DEVICE == "cpu":  # Or check model.device directly
            inputs = inputs.to(model.device)
        # If device_map="auto" was used (GPU case), inputs are often handled by accelerate

        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
        with torch.no_grad():
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.95,
                top_k=50,
                temperature=0.7
            )

        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        response_text = response_text.strip()
        print(f"[{time.time()}] Raw generated text: '{response_text}'")

        if not response_text:
            response_text = "I'm not sure how to respond to that right now."
        return response_text
    except Exception as e:
        print(f"[{time.time()}] Error during text generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Sorry, I encountered an error while generating a response: {e}"
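
# --- Optional helper (editor sketch, not used by the app) ---
# Alternative prompt construction, assuming a transformers version that provides
# tokenizer.apply_chat_template and that the model repo ships a chat template:
# let the tokenizer format the conversation instead of hand-writing the
# "### Instruction:/### Response:" blocks built in generate_chat_response above.
def build_prompt_with_chat_template(message, history):
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )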


# --- Gradio Interface (remains the same) ---
if __name__ == "__main__":
    print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")

    interface_title = "Chat with LLM (deepseek-coder-1.3B)"
    interface_description = f"""
    This app runs **{MODEL_NAME}** directly in this Space. Model loading might take a few minutes.
    Running on: **{DEVICE.upper()}**. Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
    """
    if model_load_error:
        interface_description += f"\n\nMODEL LOADING FAILED: {model_load_error}"
    elif not model or not tokenizer:
        interface_description += "\n\nWarning: Model or tokenizer not available. Chat may not function."

    chat_interface = gr.ChatInterface(
        fn=generate_chat_response,
        title=interface_title,
        description=interface_description,
        examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
        cache_examples=False,
    )

    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
    try:
        chat_interface.queue().launch(debug=True)
        print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
    except Exception as e:
        print(f"[{time.time()}] FATAL ERROR during launch: {e}")
        with open("launch_error.txt", "w") as f_err:
            f_err.write(f"Error during launch: {str(e)}\n")

print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")