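# app.py for a Hugging Face Space: a Gradio chat UI around deepseek-ai/deepseek-coder-1.3b-instruct,
# loading the model with 8-bit (bitsandbytes) quantization on GPU and in plain FP32 on CPU.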
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time
print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")
# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")
# --- Load Model and Tokenizer ---
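# Quantization is conditional: 8-bit bitsandbytes loading is only attempted when CUDA is
# available; on CPU the model is loaded unquantized to avoid bitsandbytes CPU issues.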
model = None
tokenizer = None
model_load_error = None
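# Loading happens once at import time; model_load_error captures any failure so the chat
# handler and the Gradio description can report it instead of crashing the Space.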
try:
print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
if tokenizer and tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")
print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
if DEVICE == "cuda":
print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
quantization_config = BitsAndBytesConfig(
load_in_8bit=True,
bnb_8bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
quantization_config=quantization_config,
device_map="auto", # Let accelerate handle device mapping for GPU
trust_remote_code=True
)
print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
else: # CPU
print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
# When on CPU, load without quantization_config to avoid bitsandbytes issues.
# This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
torch_dtype=torch.float32, # Use float32 for CPU for broader compatibility
trust_remote_code=True,
low_cpu_mem_usage=True # Helpful for larger models on CPU
)
# Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
model.to(DEVICE)
print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")
model.eval()
# print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}") # Useful if available
except Exception as e:
model_load_error = str(e)
print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
import traceback
traceback.print_exc()
# --- Chat Function ---
def generate_chat_response(message, history):
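    # gr.ChatInterface passes the history as (user, assistant) pairs; it is flattened into
    # DeepSeek Coder's instruct prompt format:
    #   ### Instruction:
    #   <user message>
    #   ### Response:
    #   <model reply>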
print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
if model_load_error or not model or not tokenizer:
error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
print(f"[{time.time()}] {error_msg}")
return error_msg
prompt_parts = []
for user_msg, bot_msg in history:
prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
prompt = "\n".join(prompt_parts)
try:
print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
# Move inputs to the model's device if not using device_map="auto" or if it's explicitly CPU
if DEVICE == "cpu": # Or check model.device directly
inputs = inputs.to(model.device)
# If device_map="auto" was used (GPU case), inputs are often handled by accelerate
print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
with torch.no_grad():
output_sequences = model.generate(
input_ids=inputs['input_ids'],
attention_mask=inputs['attention_mask'],
max_new_tokens=200,
pad_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id,
do_sample=True,
top_p=0.95,
top_k=50,
temperature=0.7
)
response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
response_text = response_text.strip()
print(f"[{time.time()}] Raw generated text: '{response_text}'")
if not response_text:
response_text = "I'm not sure how to respond to that right now."
return response_text
except Exception as e:
print(f"[{time.time()}] Error during text generation: {e}")
import traceback
traceback.print_exc()
return f"Sorry, I encountered an error while generating a response: {e}"
# --- Gradio Interface ---
if __name__ == "__main__":
print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
interface_title = f"Chat with LLM (deepseek-coder-1.3B)"
interface_description = f"""
This app runs **{MODEL_NAME}** directly in this Space.
Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
"""
if model_load_error:
interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
elif not model or not tokenizer:
interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
chat_interface = gr.ChatInterface(
fn=generate_chat_response,
title=interface_title,
description=interface_description,
examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
cache_examples=False,
)
print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
try:
chat_interface.queue().launch(debug=True)
print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
except Exception as e:
print(f"[{time.time()}] FATAL ERROR during launch: {e}")
with open("launch_error.txt", "w") as f_err:
f_err.write(f"Error during launch: {str(e)}\n")
print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")