Spaces:

Dushyant4342
/

RAG-PDFChat

Sleeping

App Files Files Community

Dushyant4342 committed on May 19

Commit

56ac5db

verified ·

1 Parent(s): 497021f

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -83

app.py CHANGED Viewed

@@ -1,55 +1,73 @@
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import os
 import time
-print(f"[{time.time()}] SCRIPT START: Small Local LLM Chat. PID: {os.getpid()}")
 # --- Configuration ---
-MODEL_NAME = "distilgpt2" # A small and efficient model
-# For slightly larger, try "gpt2" (the smallest version of GPT-2)
-# MODEL_NAME = "gpt2"
-# Determine device: use CUDA if available, otherwise CPU.
-# For small models on typical HF Spaces, CPU is often the only option or more stable.
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[{time.time()}] Using device: {DEVICE}")
 # --- Load Model and Tokenizer ---
-# This section can take some time and memory, especially on first run (downloading model)
 model = None
 tokenizer = None
 model_load_error = None
 try:
     print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
-    # Add a padding token if it doesn't exist (common for GPT-2 models)
     if tokenizer and tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")
-    print(f"[{time.time()}] Loading model {MODEL_NAME} to {DEVICE}...")
-    # For CPU, ensure model is explicitly moved. For 'auto', it might try GPU.
-    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
-    model.eval() # Set model to evaluation mode
-    print(f"[{time.time()}] Model {MODEL_NAME} loaded successfully on {DEVICE}.")
 except Exception as e:
     model_load_error = str(e)
     print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
-    # Fallback or error display will be handled in the Gradio UI
-# --- Chat Function ---
 def generate_chat_response(message, history):
-    """
-    Generates a response from the local LLM.
-    'message' is the user's new input.
-    'history' is a list of previous [user, bot] pairs.
-    """
     print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
     if model_load_error or not model or not tokenizer:
@@ -57,99 +75,76 @@ def generate_chat_response(message, history):
         print(f"[{time.time()}] {error_msg}")
         return error_msg
-    # Basic conversation history formatting (can be improved)
-    # We'll prepend the history to the current message to give some context.
-    # Keep history short to avoid exceeding max input length for small models.
-    prompt = ""
-    # Limit history to last 2 turns to keep prompt short
-    for user_msg, bot_msg in history[-2:]:
-        prompt += f"User: {user_msg}\nBot: {bot_msg}\n"
-    prompt += f"User: {message}\nBot:"
     try:
-        print(f"[{time.time()}] Encoding prompt for model...")
-        # Ensure padding_side is set correctly if using padding during generation (though not typical for single prompt generation)
-        # tokenizer.padding_side = "left" # Important for decoder-only models if batching
-        inputs = tokenizer.encode_plus(
-            prompt,
-            return_tensors="pt",
-            padding=True, # Pad to max length of batch (or model if single)
-            truncation=True,
-            max_length=512 # Max input length for the model (distilgpt2 is 1024, but keep it reasonable)
-        ).to(DEVICE)
-        input_ids = inputs["input_ids"]
-        attention_mask = inputs["attention_mask"]
-        print(f"[{time.time()}] Generating response... Input ID length: {input_ids.shape[1]}")
-        # Generate response
-        # `max_length` here is the total length of input + output
-        # `max_new_tokens` is usually preferred for controlling output length specifically
-        with torch.no_grad(): # Disable gradient calculations for inference
             output_sequences = model.generate(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                max_new_tokens=60,  # Max number of new tokens to generate
-                num_return_sequences=1,
-                pad_token_id=tokenizer.pad_token_id, # Use the pad token ID from tokenizer
                 eos_token_id=tokenizer.eos_token_id,
-                do_sample=True, # Enable sampling for more diverse outputs
-                top_k=50,       # Consider top_k tokens for sampling
-                top_p=0.95,     # Use nucleus sampling
-                temperature=0.8 # Controls randomness
             )
-        # Decode the generated sequence
-        response_text = tokenizer.decode(output_sequences[0][input_ids.shape[-1]:], skip_special_tokens=True)
-        # Basic post-processing: remove potential artifacts or incomplete sentences if needed
-        response_text = response_text.strip()
         print(f"[{time.time()}] Raw generated text: '{response_text}'")
         if not response_text:
             response_text = "I'm not sure how to respond to that right now."
         return response_text
     except Exception as e:
         print(f"[{time.time()}] Error during text generation: {e}")
-        return f"Error generating response: {e}"
-# --- Gradio Interface ---
 if __name__ == "__main__":
-    print(f"[{time.time()}] MAIN: Building Gradio interface (Small Local LLM Chat)...")
-    interface_title = f"Chat with Small Local LLM ({MODEL_NAME})"
     interface_description = f"""
-    This app runs a small language model ({MODEL_NAME}) directly in this Space.
-    Responses might be slow and simple due to the model's size and CPU processing.
     """
     if model_load_error:
         interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
     elif not model or not tokenizer:
         interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
     chat_interface = gr.ChatInterface(
         fn=generate_chat_response,
         title=interface_title,
         description=interface_description,
-        examples=[["Hello, who are you?"], ["What is 1+1?"]],
-        cache_examples=False, # Disable caching for dynamic model responses
-        retry_btn="Retry",
-        undo_btn="Delete last",
-        clear_btn="Clear chat",
     )
     print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
     try:
-        chat_interface.queue().launch(debug=True) # queue() for better handling, debug=True for logs
         print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
     except Exception as e:
         print(f"[{time.time()}] FATAL ERROR during launch: {e}")
-        with open("launch_error.txt", "w") as f_err: # Fallback error logging
-            f_err.write(f"Error during Small LLM Chat launch: {str(e)}\n")
-print(f"[{time.time()}] SCRIPT END: Small Local LLM Chat app.py has finished initial setup.")

 import gradio as gr
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import os
 import time
+print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")
 # --- Configuration ---
+MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"[{time.time()}] Using device: {DEVICE}")
+print(f"[{time.time()}] PyTorch version: {torch.__version__}")
 # --- Load Model and Tokenizer ---
 model = None
 tokenizer = None
 model_load_error = None
 try:
     print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
     print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")
     if tokenizer and tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
         print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")
+    print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
+    if DEVICE == "cuda":
+        print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+            bnb_8bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            quantization_config=quantization_config,
+            device_map="auto", # Let accelerate handle device mapping for GPU
+            trust_remote_code=True
+        )
+        print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
+    else: # CPU
+        print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
+        # When on CPU, load without quantization_config to avoid bitsandbytes issues.
+        # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME,
+            torch_dtype=torch.float32, # Use float32 for CPU for broader compatibility
+            trust_remote_code=True,
+            low_cpu_mem_usage=True # Helpful for larger models on CPU
+        )
+        # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
+        model.to(DEVICE)
+        print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")
+    model.eval()
+    # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}") # Useful if available
 except Exception as e:
     model_load_error = str(e)
     print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
+    import traceback
+    traceback.print_exc()
+# --- Chat Function (builds an "### Instruction / ### Response" prompt from the full chat history) ---
 def generate_chat_response(message, history):
     print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")
     if model_load_error or not model or not tokenizer:
         print(f"[{time.time()}] {error_msg}")
         return error_msg
+    prompt_parts = []
+    for user_msg, bot_msg in history:
+        prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
+    prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
+    prompt = "\n".join(prompt_parts)
     try:
+        print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)
+        # Move inputs to the model's device if not using device_map="auto" or if it's explicitly CPU
+        if DEVICE == "cpu": # Or check model.device directly
+            inputs = inputs.to(model.device)
+        # If device_map="auto" was used (GPU case), inputs are often handled by accelerate
+        print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")
+        with torch.no_grad():
             output_sequences = model.generate(
+                input_ids=inputs['input_ids'],
+                attention_mask=inputs['attention_mask'],
+                max_new_tokens=200,
+                pad_token_id=tokenizer.eos_token_id,
                 eos_token_id=tokenizer.eos_token_id,
+                do_sample=True,
+                top_p=0.95,
+                top_k=50,
+                temperature=0.7
             )
+        response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
+        response_text = response_text.strip()
         print(f"[{time.time()}] Raw generated text: '{response_text}'")
         if not response_text:
             response_text = "I'm not sure how to respond to that right now."
         return response_text
     except Exception as e:
         print(f"[{time.time()}] Error during text generation: {e}")
+        import traceback
+        traceback.print_exc()
+        return f"Sorry, I encountered an error while generating a response: {e}"
+# --- Gradio Interface (ChatInterface with example prompts; example caching disabled) ---
 if __name__ == "__main__":
+    print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
+    interface_title = f"Chat with LLM ({MODEL_NAME} - Conditional Quantization)"
     interface_description = f"""
+    This app runs **{MODEL_NAME}** directly in this Space.
+    Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
+    Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
     """
     if model_load_error:
         interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
     elif not model or not tokenizer:
         interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"
     chat_interface = gr.ChatInterface(
         fn=generate_chat_response,
         title=interface_title,
         description=interface_description,
+        examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
+        cache_examples=False,
     )
     print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
     try:
+        chat_interface.queue().launch(debug=True)
         print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
     except Exception as e:
         print(f"[{time.time()}] FATAL ERROR during launch: {e}")
+        with open("launch_error.txt", "w") as f_err:
+            f_err.write(f"Error during launch: {str(e)}\n")
+print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")