Dushyant4342 committed
Commit 56ac5db · verified · 1 Parent(s): 497021f

Update app.py

Files changed (1):
  app.py  +78 -83
app.py CHANGED
Before (removed lines are prefixed with "-"):

@@ -1,55 +1,73 @@
  import gradio as gr
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import os
  import time

- print(f"[{time.time()}] SCRIPT START: Small Local LLM Chat. PID: {os.getpid()}")

  # --- Configuration ---
- MODEL_NAME = "distilgpt2" # A small and efficient model
- # For slightly larger, try "gpt2" (the smallest version of GPT-2)
- # MODEL_NAME = "gpt2"

- # Determine device: use CUDA if available, otherwise CPU.
- # For small models on typical HF Spaces, CPU is often the only option or more stable.
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"[{time.time()}] Using device: {DEVICE}")

  # --- Load Model and Tokenizer ---
- # This section can take some time and memory, especially on first run (downloading model)
  model = None
  tokenizer = None
  model_load_error = None

  try:
      print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
      print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")

-     # Add a padding token if it doesn't exist (common for GPT-2 models)
      if tokenizer and tokenizer.pad_token is None:
          tokenizer.pad_token = tokenizer.eos_token
          print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

-     print(f"[{time.time()}] Loading model {MODEL_NAME} to {DEVICE}...")
-     # For CPU, ensure model is explicitly moved. For 'auto', it might try GPU.
-     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
-     model.eval() # Set model to evaluation mode
-     print(f"[{time.time()}] Model {MODEL_NAME} loaded successfully on {DEVICE}.")

  except Exception as e:
      model_load_error = str(e)
      print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
-     # Fallback or error display will be handled in the Gradio UI

- # --- Chat Function ---
  def generate_chat_response(message, history):
-     """
-     Generates a response from the local LLM.
-     'message' is the user's new input.
-     'history' is a list of previous [user, bot] pairs.
-     """
      print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")

      if model_load_error or not model or not tokenizer:
@@ -57,99 +75,76 @@ def generate_chat_response(message, history):
          print(f"[{time.time()}] {error_msg}")
          return error_msg

-     # Basic conversation history formatting (can be improved)
-     # We'll prepend the history to the current message to give some context.
-     # Keep history short to avoid exceeding max input length for small models.
-     prompt = ""
-     # Limit history to last 2 turns to keep prompt short
-     for user_msg, bot_msg in history[-2:]:
-         prompt += f"User: {user_msg}\nBot: {bot_msg}\n"
-     prompt += f"User: {message}\nBot:"

      try:
-         print(f"[{time.time()}] Encoding prompt for model...")
-         # Ensure padding_side is set correctly if using padding during generation (though not typical for single prompt generation)
-         # tokenizer.padding_side = "left" # Important for decoder-only models if batching

-         inputs = tokenizer.encode_plus(
-             prompt,
-             return_tensors="pt",
-             padding=True, # Pad to max length of batch (or model if single)
-             truncation=True,
-             max_length=512 # Max input length for the model (distilgpt2 is 1024, but keep it reasonable)
-         ).to(DEVICE)
-
-         input_ids = inputs["input_ids"]
-         attention_mask = inputs["attention_mask"]

-         print(f"[{time.time()}] Generating response... Input ID length: {input_ids.shape[1]}")

-         # Generate response
-         # `max_length` here is the total length of input + output
-         # `max_new_tokens` is usually preferred for controlling output length specifically
-         with torch.no_grad(): # Disable gradient calculations for inference
              output_sequences = model.generate(
-                 input_ids=input_ids,
-                 attention_mask=attention_mask,
-                 max_new_tokens=60, # Max number of new tokens to generate
-                 num_return_sequences=1,
-                 pad_token_id=tokenizer.pad_token_id, # Use the pad token ID from tokenizer
                  eos_token_id=tokenizer.eos_token_id,
-                 do_sample=True, # Enable sampling for more diverse outputs
-                 top_k=50, # Consider top_k tokens for sampling
-                 top_p=0.95, # Use nucleus sampling
-                 temperature=0.8 # Controls randomness
              )

-         # Decode the generated sequence
-         response_text = tokenizer.decode(output_sequences[0][input_ids.shape[-1]:], skip_special_tokens=True)
-
-         # Basic post-processing: remove potential artifacts or incomplete sentences if needed
-         response_text = response_text.strip()

          print(f"[{time.time()}] Raw generated text: '{response_text}'")
          if not response_text:
              response_text = "I'm not sure how to respond to that right now."
-
          return response_text
-
      except Exception as e:
          print(f"[{time.time()}] Error during text generation: {e}")
-         return f"Error generating response: {e}"

- # --- Gradio Interface ---
  if __name__ == "__main__":
-     print(f"[{time.time()}] MAIN: Building Gradio interface (Small Local LLM Chat)...")
-
-     interface_title = f"Chat with Small Local LLM ({MODEL_NAME})"
      interface_description = f"""
-     This app runs a small language model ({MODEL_NAME}) directly in this Space.
-     Responses might be slow and simple due to the model's size and CPU processing.
      """
      if model_load_error:
          interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
      elif not model or not tokenizer:
          interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"

-
      chat_interface = gr.ChatInterface(
          fn=generate_chat_response,
          title=interface_title,
          description=interface_description,
-         examples=[["Hello, who are you?"], ["What is 1+1?"]],
-         cache_examples=False, # Disable caching for dynamic model responses
-         retry_btn="Retry",
-         undo_btn="Delete last",
-         clear_btn="Clear chat",
      )
-
      print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
      try:
-         chat_interface.queue().launch(debug=True) # queue() for better handling, debug=True for logs
          print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
      except Exception as e:
          print(f"[{time.time()}] FATAL ERROR during launch: {e}")
-         with open("launch_error.txt", "w") as f_err: # Fallback error logging
-             f_err.write(f"Error during Small LLM Chat launch: {str(e)}\n")
-
-     print(f"[{time.time()}] SCRIPT END: Small Local LLM Chat app.py has finished initial setup.")
 
After (added lines are prefixed with "+"):

  import gradio as gr
  import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
  import os
  import time

+ print(f"[{time.time()}] SCRIPT START: DeepSeek Coder 1.3B Chat (Conditional Quantization). PID: {os.getpid()}")

  # --- Configuration ---
+ MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"

  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
  print(f"[{time.time()}] Using device: {DEVICE}")
+ print(f"[{time.time()}] PyTorch version: {torch.__version__}")

  # --- Load Model and Tokenizer ---
  model = None
  tokenizer = None
  model_load_error = None

  try:
      print(f"[{time.time()}] Loading tokenizer for {MODEL_NAME}...")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
      print(f"[{time.time()}] Tokenizer loaded. Vocab size: {tokenizer.vocab_size if tokenizer else 'N/A'}")

      if tokenizer and tokenizer.pad_token is None:
          tokenizer.pad_token = tokenizer.eos_token
          print(f"[{time.time()}] Set pad_token to eos_token: {tokenizer.pad_token}")

+     print(f"[{time.time()}] Attempting to load model {MODEL_NAME}...")
+
+     if DEVICE == "cuda":
+         print(f"[{time.time()}] Configuring 8-bit quantization for GPU...")
+         quantization_config = BitsAndBytesConfig(
+             load_in_8bit=True,
+             bnb_8bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+         )
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_NAME,
+             quantization_config=quantization_config,
+             device_map="auto", # Let accelerate handle device mapping for GPU
+             trust_remote_code=True
+         )
+         print(f"[{time.time()}] Model {MODEL_NAME} loaded with 8-bit quantization on GPU.")
+     else: # CPU
+         print(f"[{time.time()}] Loading model {MODEL_NAME} on CPU without bitsandbytes quantization.")
+         # When on CPU, load without quantization_config to avoid bitsandbytes issues.
+         # This will use more RAM but is more stable if bitsandbytes CPU support is problematic.
+         model = AutoModelForCausalLM.from_pretrained(
+             MODEL_NAME,
+             torch_dtype=torch.float32, # Use float32 for CPU for broader compatibility
+             trust_remote_code=True,
+             low_cpu_mem_usage=True # Helpful for larger models on CPU
+         )
+         # Explicitly move to CPU if not already (low_cpu_mem_usage might handle parts of this)
+         model.to(DEVICE)
+         print(f"[{time.time()}] Model {MODEL_NAME} loaded on CPU (FP32 precision).")
+
+     model.eval()
+     # print(f"[{time.time()}] Model footprint: {model.get_memory_footprint()}") # Useful if available

  except Exception as e:
      model_load_error = str(e)
      print(f"[{time.time()}] CRITICAL ERROR loading model or tokenizer: {e}")
+     import traceback
+     traceback.print_exc()

+
+ # --- Chat Function (remains the same as your previous version) ---
  def generate_chat_response(message, history):
      print(f"[{time.time()}] generate_chat_response called. Message: '{message}'")

      if model_load_error or not model or not tokenizer:
          print(f"[{time.time()}] {error_msg}")
          return error_msg

+     prompt_parts = []
+     for user_msg, bot_msg in history:
+         prompt_parts.append(f"### Instruction:\n{user_msg}\n### Response:\n{bot_msg}")
+     prompt_parts.append(f"### Instruction:\n{message}\n### Response:")
+     prompt = "\n".join(prompt_parts)

      try:
+         print(f"[{time.time()}] Encoding prompt for model (length: {len(prompt)} chars)...")
+         inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1500)

+         # Move inputs to the model's device if not using device_map="auto" or if it's explicitly CPU
+         if DEVICE == "cpu": # Or check model.device directly
+             inputs = inputs.to(model.device)
+         # If device_map="auto" was used (GPU case), inputs are often handled by accelerate

+         print(f"[{time.time()}] Generating response... Input token length: {inputs['input_ids'].shape[1]}")

+         with torch.no_grad():
              output_sequences = model.generate(
+                 input_ids=inputs['input_ids'],
+                 attention_mask=inputs['attention_mask'],
+                 max_new_tokens=200,
+                 pad_token_id=tokenizer.eos_token_id,
                  eos_token_id=tokenizer.eos_token_id,
+                 do_sample=True,
+                 top_p=0.95,
+                 top_k=50,
+                 temperature=0.7
              )

+         response_text = tokenizer.decode(output_sequences[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
+         response_text = response_text.strip()

          print(f"[{time.time()}] Raw generated text: '{response_text}'")
          if not response_text:
              response_text = "I'm not sure how to respond to that right now."
          return response_text
      except Exception as e:
          print(f"[{time.time()}] Error during text generation: {e}")
+         import traceback
+         traceback.print_exc()
+         return f"Sorry, I encountered an error while generating a response: {e}"

+ # --- Gradio Interface (remains the same) ---
  if __name__ == "__main__":
+     print(f"[{time.time()}] MAIN: Building Gradio interface (DeepSeek Coder - Conditional Quantization)...")
+     interface_title = f"Chat with LLM ({MODEL_NAME} - Conditional Quantization)"
      interface_description = f"""
+     This app runs **{MODEL_NAME}** directly in this Space.
+     Model loading might take a few minutes. Running on: **{DEVICE.upper()}**.
+     Quantization is attempted on GPU, bypassed on CPU to avoid `bitsandbytes` issues.
      """
      if model_load_error:
          interface_description += f"\n\n<h3 style='color:red;'>MODEL LOADING FAILED: {model_load_error}</h3>"
      elif not model or not tokenizer:
          interface_description += "\n\n<h3 style='color:orange;'>Warning: Model or tokenizer not available. Chat may not function.</h3>"

      chat_interface = gr.ChatInterface(
          fn=generate_chat_response,
          title=interface_title,
          description=interface_description,
+         examples=[["Hello, what can you do?"], ["Write a python function to calculate factorial."]],
+         cache_examples=False,
      )
      print(f"[{time.time()}] MAIN: Attempting to launch Gradio app...")
      try:
+         chat_interface.queue().launch(debug=True)
          print(f"[{time.time()}] MAIN: Gradio app launch() called. Monitor logs for 'Application startup complete'.")
      except Exception as e:
          print(f"[{time.time()}] FATAL ERROR during launch: {e}")
+         with open("launch_error.txt", "w") as f_err:
+             f_err.write(f"Error during launch: {str(e)}\n")
+     print(f"[{time.time()}] SCRIPT END: DeepSeek Coder app.py (Conditional Quantization) has finished.")
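For reference, a minimal standalone sketch of the conditional-quantization loading pattern introduced above, followed by a one-shot generation using the same "### Instruction: / ### Response:" prompt style. This is an illustrative sketch, not the committed app code: it assumes torch, transformers, accelerate, and (only on GPU) bitsandbytes are installed, and it keeps only the essential arguments (for example, it passes just load_in_8bit=True to BitsAndBytesConfig).

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

if device == "cuda":
    # GPU path: load weights in 8-bit via bitsandbytes, let accelerate place them.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        trust_remote_code=True,
    )
else:
    # CPU path: skip bitsandbytes entirely and load full-precision weights.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).to(device)

model.eval()

# Smoke test with the instruction-style prompt used by generate_chat_response.
prompt = "### Instruction:\nWrite a python function to calculate factorial.\n### Response:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    out = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens, as the app does.
print(tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))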