import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time

print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")

# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")

# --- Load Model and Tokenizer ---
model = None
tokenizer = None
model_load_error = None

try:
    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
    if DEVICE == "cuda":
        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
        # Note: BitsAndBytesConfig only exposes a compute dtype for 4-bit loading
        # (bnb_4bit_compute_dtype); a "bnb_8bit_compute_dtype" kwarg is not a
        # recognized option, so it has been dropped. load_in_8bit=True is enough here.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto",  # Let accelerate handle device mapping for GPU
            trust_remote_code=True
        )
        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
    else:  # CPU
        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # Use float32 for CPU for broader compatibility
            trust_remote_code=True,
            low_cpu_mem_usage=True  # Helpful for larger models on CPU
        )
        # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
        model.to(DEVICE)
        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")

    model.eval()
    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}")  # Useful if available
except Exception as e:
    model_load_error = str(e)
    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
    import traceback
    traceback.print_exc()
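
# --- Optional diagnostics (editor sketch, safe to remove) ---
# Assumptions, not part of the original script: the 8-bit GPU path above needs
# the accelerate and bitsandbytes packages installed (the CPU path does not),
# and get_memory_footprint() is the standard transformers PreTrainedModel
# helper; it is guarded with hasattr() in case the loaded class lacks it.
if model is not None and hasattr(model, "get_memory_footprint"):
    print(f"[{time.time()}] Approx. model memory footprint: {model.get_memory_footprint() / 1e6:.1f} MB")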

# --- Chat Function (remains the same as your previous version) ---
def generate_chat_response(message, history):
    print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
    if model_load_error or not model or not tokenizer:
        error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
        print(f"[{time.time()}] {error_msg}")
        return error_msg

    prompt_parts = []
    for user_msg, bot_msg in history:
        prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
    prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
    prompt = "\n".join(prompt_parts)

    try:
        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
        # Move inputs to the model's device if not using device_map="auto" or if it's explicitly CPU
        if DEVICE == "cpu":  # Or check model.device directly
            inputs = inputs.to(model.device)
        # If device_map="auto" was used (GPU case), inputs are often handled by accelerate

        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
        with torch.no_grad():
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.95,
                top_k=50,
                temperature=0.7
            )

        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        response_text = response_text.strip()
        print(f"[{time.time()}] Raw generated text: '{response_text}'")

        if not response_text:
            response_text = "I'm not sure how to respond to that right now."
        return response_text
    except Exception as e:
        print(f"[{time.time()}] Error during text generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Sorry, I encountered an error while generating a response: {e}"
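
# --- Optional helper (editor sketch, not used by the app) ---
# Alternative prompt construction, assuming a transformers version that provides
# tokenizer.apply_chat_template and that the model repo ships a chat template:
# let the tokenizer format the conversation instead of hand-writing the
# "### Instruction:/### Response:" blocks built in generate_chat_response above.
def build_prompt_with_chat_template(message, history):
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )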


# --- Gradio Interface (remains the same) ---
if __name__ == "__main__":
    print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")

    interface_title = "Chat with LLM (deepseek-coder-1.3B)"
    interface_description = f"""
    This app runs **{MODEL_NAME}** directly in this Space. Model loading might take a few minutes.
    Running on: **{DEVICE.upper()}**. Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
    """
    if model_load_error:
        interface_description += f"\n\nMODEL LOADING FAILED: {model_load_error}"
    elif not model or not tokenizer:
        interface_description += "\n\nWarning: Model or tokenizer not available. Chat may not function."

    chat_interface = gr.ChatInterface(
        fn=generate_chat_response,
        title=interface_title,
        description=interface_description,
        examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
        cache_examples=False,
    )

    print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
    try:
        chat_interface.queue().launch(debug=True)
        print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
    except Exception as e:
        print(f"[{time.time()}] FATAL ERROR during launch: {e}")
        with open("launch_error.txt", "w") as f_err:
            f_err.write(f"Error during launch: {str(e)}\n")

print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")