# RAG-PDFChat / app.py
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import os
import time
print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")
# --- Configuration ---
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[{time.time()}] Using device: {DEVICE}")
print(f"[{time.time()}] PyTorch version: {torch.__version__}")
# --- Load Model and Tokenizer ---
model = None
tokenizer = None
model_load_error = None
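# Loading strategy used below: on GPU the model is loaded with 8-bit bitsandbytes
# quantization and device_map="auto"; on CPU it is loaded in plain float32 (no
# quantization) with low_cpu_mem_usage=True, trading RAM for stability.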
try:
    print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
    if tokenizer and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
    if DEVICE == "cuda":
        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
        # Note: BitsAndBytesConfig has no `bnb_8bit_compute_dtype` argument (a compute
        # dtype is only configurable for 4-bit via `bnb_4bit_compute_dtype`), so
        # plain load_in_8bit is used here.
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto",  # Let accelerate handle device mapping for GPU
            trust_remote_code=True
        )
        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
    else:  # CPU
        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # Use float32 for CPU for broader compatibility
            trust_remote_code=True,
            low_cpu_mem_usage=True  # Helpful for larger models on CPU
        )
        # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
        model.to(DEVICE)
        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")

    model.eval()
    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}")  # Useful if available
except Exception as e:
    model_load_error = str(e)
    print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
    import traceback
    traceback.print_exc()
# --- Chat Function ---
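# The chat function below flattens the Gradio history plus the new message into a
# single DeepSeek-Coder instruct-style prompt, for example (illustrative only):
#
#   ### Instruction:
#   Write a python function to calculate factorial.
#   ### Response:
#
# The model's completion after the final "### Response:" is returned as the reply.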
def generate_chat_response(message, history):
print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
if model_load_error or not model or not tokenizer:
error_msg = f"Model not loaded. Error: {model_load_error if model_load_error else 'Unknown reason.'}"
print(f"[{time.time()}] {error_msg}")
return error_msg
prompt_parts = []
for user_msg, bot_msg in history:
prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
prompt = "\n".join(prompt_parts)
    try:
        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
        # Move inputs onto the model's device. With device_map="auto" (GPU case),
        # model.device points at the device holding the first parameters; on CPU this
        # is effectively a no-op.
        inputs = inputs.to(model.device)
        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
        with torch.no_grad():
            output_sequences = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=200,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=True,
                top_p=0.95,
                top_k=50,
                temperature=0.7
            )
        # Decode only the newly generated tokens (everything after the prompt).
        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        response_text = response_text.strip()
        print(f"[{time.time()}] Raw generated text: '{response_text}'")
        if not response_text:
            response_text = "I'm not sure how to respond to that right now."
        return response_text
    except Exception as e:
        print(f"[{time.time()}] Error during text generation: {e}")
        import traceback
        traceback.print_exc()
        return f"Sorry, I encountered an error while generating a response: {e}"
# --- Gradio Interface ---
if __name__ == "__main__":
print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
interface_title = f"Chat with LLM (deepseek-coder-1.3B)"
interface_description = f"""
This app runs **{MODEL_NAME}** directly in this Space.
Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
"""
if model_load_error:
interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
elif not model or not tokenizer:
interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
chat_interface = gr.ChatInterface(
fn=generate_chat_response,
title=interface_title,
description=interface_description,
examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
cache_examples=False,
)
print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
try:
chat_interface.queue().launch(debug=True)
print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
except Exception as e:
print(f"[{time.time()}] FATAL ERROR during launch: {e}")
with open("launch_error.txt", "w") as f_err:
f_err.write(f"Error during launch: {str(e)}\n")
print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")