# RAG-PDFChat / app.py
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time
print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")
# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")
# --- Load Model and Tokenizer ---
model = None
tokenizer = None
model_load_error = None
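# Loading strategy used below: on GPU the model is loaded with 8-bit bitsandbytes
# quantization and device_map="auto"; on CPU it is loaded in plain float32 (no
# quantization) with low_cpu_mem_usage=True, trading RAM for stability.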
try:
    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
    if DEVICE == "cuda":
        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
        # Note: BitsAndBytesConfig has no `bnb_8bit_compute_dtype` argument (a compute
        # dtype is only configurable for 4-bit via `bnb_4bit_compute_dtype`), so
        # plain load_in_8bit is used here.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto",  # Let accelerate handle device mapping for GPU
            trust_remote_code=True
        )
        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
    else:  # CPU
        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # Use float32 for CPU for broader compatibility
            trust_remote_code=True,
            low_cpu_mem_usage=True  # Helpful for larger models on CPU
        )
        # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
        model.to(DEVICE)
        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")

    model.eval()
    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}")  # Useful if available
except Exception as e:
    model_load_error = str(e)
    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
    import traceback
    traceback.print_exc()
# --- Chat Function ---
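# The chat function below flattens the Gradio history plus the new message into a
# single DeepSeek-Coder instruct-style prompt, for example (illustrative only):
#
#   ### Instruction:
#   Write a python function to calculate factorial.
#   ### Response:
#
# The model's completion after the final "### Response:" is returned as the reply.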
def generate_chat_response(message, history):
print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
if model_load_error or not model or not tokenizer:
error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
print(f"[{time.time()}] {error_msg}")
return error_msg
prompt_parts = []
for user_msg, bot_msg in history:
prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
prompt = "\n".join(prompt_parts)
    try:
        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
        # Move inputs onto the model's device. With device_map="auto" (GPU case),
        # model.device points at the device holding the first parameters; on CPU this
        # is effectively a no-op.
        inputs = inputs.to(model.device)
        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
        with torch.no_grad():
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.95,
                top_k=50,
                temperature=0.7
            )
        # Decode only the newly generated tokens (everything after the prompt).
        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        response_text = response_text.strip()
        print(f"[{time.time()}] Raw generated text: '{response_text}'")
        if not response_text:
            response_text = "I'm not sure how to respond to that right now."
        return response_text
    except Exception as e:
        print(f"[{time.time()}] Error during text generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Sorry, I encountered an error while generating a response: {e}"
# --- Gradio Interface ---
if __name__ == "__main__":
print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
interface_title = f"Chat with LLM (deepseek-coder-1.3B)"
interface_description = f"""
This app runs **{MODEL_NAME}** directly in this Space.
Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
"""
if model_load_error:
interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
elif not model or not tokenizer:
interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
chat_interface = gr.ChatInterface(
fn=generate_chat_response,
title=interface_title,
description=interface_description,
examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
cache_examples=False,
)
print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
try:
chat_interface.queue().launch(debug=True)
print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
except Exception as e:
print(f"[{time.time()}] FATAL ERROR during launch: {e}")
with open("launch_error.txt", "w") as f_err:
f_err.write(f"Error during launch: {str(e)}\n")
print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")