import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time

print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")

# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
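# deepseek-coder-1.3b-instruct is the instruction-tuned 1.3B-parameter member of the
# DeepSeek Coder family; it is small enough to run (slowly) on a CPU-only Space and
# comfortably on a modest GPU.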

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")

# --- Load Model and Tokenizer ---
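# Rough memory expectations (estimates for ~1.3B parameters, not measured values):
# 8-bit weights on GPU need on the order of 1.5 GB of VRAM, fp16 about 2.7 GB, and the
# fp32 CPU fallback below roughly 5.5 GB of RAM plus working memory for activations.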
model = None
tokenizer = None
model_load_error = None

try:
    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
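    # Decoder-only chat models such as DeepSeek Coder usually ship without a dedicated
    # pad token; the check below reuses EOS for padding so generate() does not complain.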

    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")

    if DEVICE == "cuda":
        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
        # LLM.int8() handles compute precision internally; BitsAndBytesConfig only exposes
        # a compute-dtype option for 4-bit loading (bnb_4bit_compute_dtype), so none is set here.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto", # Let accelerate handle device mapping for GPU
            trust_remote_code=True
        )
        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
    else: # CPU
        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32, # Use float32 for CPU for broader compatibility
            trust_remote_code=True,
            low_cpu_mem_usage=True # Helpful for larger models on CPU
        )
        # low_cpu_mem_usage only reduces peak RAM while loading; make sure the model
        # actually ends up on the target device.
        model.to(DEVICE) 
        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")

    model.eval() 
    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}") # Useful if available

except Exception as e:
    model_load_error = str(e)
    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
    import traceback
    traceback.print_exc()
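
# Optional smoke test (commented out): a minimal sketch that exercises the raw model once
# before the UI starts, assuming the model and tokenizer loaded and using the same
# "### Instruction / ### Response" format as generate_chat_response below.
# if model is not None and tokenizer is not None:
#     _probe = tokenizer("### Instruction:\nSay hello.\n### Response:", return_tensors="pt").to(model.device)
#     with torch.no_grad():
#         _out = model.generate(**_probe, max_new_tokens=16, pad_token_id=tokenizer.eos_token_id)
#     print(f"[{time.time()}] Smoke test output: "
#           f"{tokenizer.decode(_out[0][_probe['input_ids'].shape[-1]:], skip_special_tokens=True)!r}")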


# --- Chat Function ---
def generate_chat_response(message, history):
    print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")

    if model_load_error or not model or not tokenizer:
        error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
        print(f"[{time.time()}] {error_msg}")
        return error_msg

    prompt_parts = []
    for user_msg, bot_msg in history:
        prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
    prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
    prompt = "\n".join(prompt_parts)
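    # With one prior exchange, the assembled prompt looks like:
    #   ### Instruction:
    #   <earlier user message>
    #   ### Response:
    #   <earlier bot reply>
    #   ### Instruction:
    #   <current message>
    #   ### Response:
    # This hand-built prompt mirrors the instruction format DeepSeek Coder's instruct models
    # were tuned with; tokenizer.apply_chat_template offers an alternative that uses the
    # model's bundled chat template.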

    try:
        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
        
        # Move inputs to wherever the model's weights ended up: cuda:0 under device_map="auto"
        # on GPU, plain "cpu" otherwise. Moving unconditionally is safe in both cases.
        inputs = inputs.to(model.device)

        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")

        with torch.no_grad(): 
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,                   # cap on reply length
                pad_token_id=tokenizer.eos_token_id,  # no dedicated pad token, reuse EOS
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,                       # sample instead of greedy decoding
                top_p=0.95,                           # nucleus sampling
                top_k=50,
                temperature=0.7
            )
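        # output_sequences holds the prompt tokens followed by the newly generated ones,
        # so the prompt length is sliced off before decoding.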
        
        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        response_text = response_text.strip() 
        
        print(f"[{time.time()}] Raw generated text: '{response_text}'")
        if not response_text:
            response_text = "I'm not sure how to respond to that right now."
        return response_text
    except Exception as e:
        print(f"[{time.time()}] Error during text generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Sorry, I encountered an error while generating a response: {e}"

# --- Gradio Interface ---
if __name__ == "__main__":
    print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
    interface_title = "Chat with LLM (deepseek-coder-1.3B)"
    interface_description = f"""
    This app runs **{MODEL_NAME}** directly in this Space.
    Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
    Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
    """
    if model_load_error:
        interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
    elif not model or not tokenizer:
        interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"

    chat_interface = gr.ChatInterface(
        fn=generate_chat_response,
        title=interface_title,
        description=interface_description,
        examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
        cache_examples=False,
    )
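    # With Gradio's default (tuple-style) chat history, generate_chat_response receives
    # history as a list of [user_message, bot_message] pairs, which is what its loop expects.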
    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
    try:
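        # queue() lines requests up for the single model instance instead of running them
        # concurrently; debug=True keeps errors visible in the Space's container logs.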
        chat_interface.queue().launch(debug=True) 
        print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
    except Exception as e:
        print(f"[{time.time()}] FATAL ERROR during launch: {e}")
        with open("launch_error.txt", "w") as f_err: 
            f_err.write(f"Error during launch: {str(e)}\n")
print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")