siddharth786 committed on
Commit
0d541e6
·
1 Parent(s): ce67727

Add training script, train model, and save pipeline

Browse files
Files changed (2) hide show
  1. models.py +161 -71
  2. train.py +114 -0
models.py CHANGED
@@ -3,105 +3,200 @@ import pandas as pd
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.naive_bayes import MultinomialNB
5
  from sklearn.pipeline import Pipeline
6
- from typing import Tuple, Any
7
- from pathlib import Path # <-- Add this import
8
- from utils import clean_text_for_classification # Import cleaning function
 
 
 
 
 
9
 
10
  # --- Constants ---
11
- MODEL_DIR = Path("saved_models") # <-- This line caused the error
12
- MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl" # Example using Path object
13
- VECTORIZER_PATH = MODEL_DIR / "tfidf_vectorizer.joblib"
14
-
15
- # Ensure the model directory exists
16
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # --- Model Loading ---
19
def load_model_and_vectorizer() -> Tuple[Pipeline, Any]:
    """Load the saved classifier and vectorizer from disk.

    Returns:
        A ``(model, vectorizer)`` tuple. Either element is ``None`` when it
        could not be loaded; both are ``None`` when one of the files is
        missing.
    """
    # Guard clause: both artefacts must exist before anything is loaded.
    if not (MODEL_PATH.exists() and VECTORIZER_PATH.exists()):
        print(f"Model ({MODEL_PATH}) or Vectorizer ({VECTORIZER_PATH}) not found.")
        print("Please train and save the model and vectorizer first.")
        return None, None

    classifier = None
    tfidf = None
    try:
        classifier = joblib.load(MODEL_PATH)
        tfidf = joblib.load(VECTORIZER_PATH)
        print("Model and vectorizer loaded successfully.")
    except Exception as exc:
        # Best effort: report the problem and hand back whatever was loaded.
        print(f"Error loading model or vectorizer: {exc}")
    return classifier, tfidf
39
 
40
  # --- Prediction Function ---
41
def predict_category(text: str, model: Any, vectorizer: Any) -> str:
    """
    Classify a masked e-mail with a separately loaded model and vectorizer.

    Args:
        text: The masked email text.
        model: The loaded classification model.
        vectorizer: The loaded text vectorizer.

    Returns:
        The first predicted label, or an error string when the model or
        vectorizer is missing or the prediction raises.
    """
    if not (model and vectorizer):
        return "Error: Model or Vectorizer not loaded"

    try:
        # transform() expects an iterable of documents, hence the one-item list.
        features = vectorizer.transform([clean_text_for_classification(text)])
        # predict() returns an array-like; only one document was passed in.
        return model.predict(features)[0]
    except Exception as exc:
        print(f"Error during prediction: {exc}")
        return "Error during prediction"
 
 
 
 
72
 
 
 
 
 
73
 
74
- # --- Training Function (Example - Run this separately if needed) ---
75
- # You would typically run this in a separate script (e.g., train.py)
76
- # or a Jupyter notebook, not directly within the API server process.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
def train_and_save_model(data, labels):
    """Fit a TF-IDF + Naive Bayes pipeline and save its two steps separately.

    Args:
        data: Iterable of masked e-mail texts.
        labels: Category labels aligned with ``data``.
    """
    print("Starting model training...")

    tfidf_step = ('vectorizer', TfidfVectorizer(stop_words='english', max_features=5000))
    classifier_step = ('classifier', MultinomialNB())
    pipeline = Pipeline([tfidf_step, classifier_step])

    # Apply the same cleaning to every training document before fitting.
    cleaned_data = list(map(clean_text_for_classification, data))

    pipeline.fit(cleaned_data, labels)
    print("Model training complete.")

    # The fitted steps are persisted as two files, not as one pipeline object.
    fitted_steps = pipeline.named_steps
    joblib.dump(fitted_steps['classifier'], MODEL_PATH)
    joblib.dump(fitted_steps['vectorizer'], VECTORIZER_PATH)
    print(f"Model saved to {MODEL_PATH}")
    print(f"Vectorizer saved to {VECTORIZER_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # Example Usage (if you run this file directly for testing/training)
102
  if __name__ == "__main__":
103
- # This block is for testing or initiating training manually.
104
- # Create dummy data for demonstration if needed:
105
  print("Running models.py directly...")
106
  dummy_emails = [
107
  "Subject: Billing Issue My account [full_name] was charged twice for order [order_id]. Please refund.",
@@ -110,17 +205,12 @@ if __name__ == "__main__":
110
  ]
111
  dummy_labels = ["Billing Issues", "Technical Support", "Account Management"]
112
 
113
- # Uncomment to train a dummy model:
114
- # print("Training dummy model...")
115
- # train_and_save_model(dummy_emails, dummy_labels)
116
- # print("-" * 20)
117
-
118
  print("Attempting to load model and predict...")
119
- model, vectorizer = load_model_and_vectorizer()
120
- if model and vectorizer:
121
  test_email = "my login is not working help required email [email]"
122
- category = predict_category(test_email, model, vectorizer)
123
  print(f"Test Email: '{test_email}'")
124
  print(f"Predicted Category: {category}")
125
  else:
126
- print("Cannot perform prediction as model/vectorizer failed to load.")
 
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.naive_bayes import MultinomialNB
5
  from sklearn.pipeline import Pipeline
6
+ from sklearn.model_selection import train_test_split
7
+ from typing import Tuple, Any, Optional, List, Dict
8
+ from pathlib import Path
9
+ import re
10
+ from fastapi import FastAPI, HTTPException
11
+ from pydantic import BaseModel
12
+ from utils import clean_text_for_classification, mask_pii
13
+ from models import MODEL_PATH, load_model_pipeline, predict_category
14
 
15
  # --- Constants ---
16
+ MODEL_DIR = Path("saved_models")
17
+ MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
 
 
 
18
  MODEL_DIR.mkdir(parents=True, exist_ok=True)
19
 
20
+ # --- FastAPI App ---
21
+ app = FastAPI()
22
+
23
+ # --- Pydantic Models for Request/Response ---
24
class EmailInput(BaseModel):
    # Request body accepted by the /classify/ endpoint.
    # email_body: raw e-mail text, before any PII masking is applied.
    email_body: str
26
+
27
class MaskedEntity(BaseModel):
    # One PII item reported by mask_pii, as returned to the API client.
    # position: integer offsets of the entity
    #   (presumably [start, end] in the original text; confirm against mask_pii).
    position: List[int]
    # classification: label of the entity type that was masked.
    classification: str
    # entity: the entity text itself (presumably the original value; confirm).
    entity: str
31
+
32
class ClassificationOutput(BaseModel):
    # Response body of the /classify/ endpoint.
    # input_email_body: the e-mail text exactly as it was submitted.
    input_email_body: str
    # list_of_masked_entities: every PII entity found while masking.
    list_of_masked_entities: List[MaskedEntity]
    # masked_email: the e-mail text after PII masking.
    masked_email: str
    # category_of_the_email: the category returned by predict_category.
    category_of_the_email: str
37
+
38
# --- Model Loading ---
def load_model_pipeline() -> Optional[Pipeline]:
    """Load the trained model pipeline from ``MODEL_PATH``.

    Returns:
        The object stored at ``MODEL_PATH``, or ``None`` when the file does
        not exist or cannot be loaded. Failures are reported with ``print``
        and never raised.
    """
    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}.")
        print("Please train and save the model pipeline first.")
        return None

    try:
        # NOTE: joblib.load unpickles arbitrary objects; only load model
        # files that come from a trusted source.
        loaded_pipeline = joblib.load(MODEL_PATH)
    except Exception as e:
        print(f"Error loading model pipeline from {MODEL_PATH}: {e}")
        return None

    print(f"Model pipeline loaded successfully from {MODEL_PATH}")
    return loaded_pipeline


# --- Load Model at Startup ---
# Load the model pipeline once when the application starts.
# This call has to come after the definition above: a module-level call that
# precedes the `def` fails with NameError when the module is imported.
# NOTE(review): the `from models import MODEL_PATH, load_model_pipeline,
# predict_category` line in this file's import block makes models.py import
# itself; those names are defined in this file, so that import should be
# removed.
model_pipeline: Optional[Pipeline] = load_model_pipeline()
 
 
 
56
 
57
  # --- Prediction Function ---
58
def predict_category(text: str, model_pipeline: Optional[Pipeline]) -> str:
    """
    Predict the e-mail category using the loaded model pipeline.

    Args:
        text: The masked email text.
        model_pipeline: The loaded classification pipeline, or ``None`` when
            loading failed.

    Returns:
        The predicted category as a plain ``str``. Failures are reported as
        strings instead of exceptions: ``"Error: Model Pipeline not loaded"``
        when no pipeline was given and ``"Error: Prediction failed"`` when
        cleaning or prediction raised.
    """
    # Explicit None check: truthiness of a pipeline object depends on its
    # __len__/__bool__, which is not what "was it loaded?" means.
    if model_pipeline is None:
        return "Error: Model Pipeline not loaded"

    try:
        # 1. Clean the masked text (using the function from utils.py)
        cleaned_text = clean_text_for_classification(text)

        # 2. Predict using the pipeline (handles vectorization internally).
        #    predict() expects an iterable of documents, hence the list.
        prediction = model_pipeline.predict([cleaned_text])

        # 3. Return the first prediction as a built-in str so the declared
        #    return type holds even if the stored labels are not plain strings.
        return str(prediction[0])

    except Exception as e:
        print(f"Error during prediction: {e}")
        return "Error: Prediction failed"
85
+
86
+ # --- Training Function ---
87
def train_model(
    data_path: Path,
    model_save_path: Path,
    email_body_column: str = 'body',
    category_column: str = 'category',
) -> None:
    """Load a labelled CSV, train the TF-IDF + Naive Bayes pipeline, save it.

    Missing files, unreadable CSVs, missing columns, empty data and
    cleaning/splitting/training/saving errors are reported with ``print`` and
    end the run early instead of raising.

    Args:
        data_path: CSV file holding the training data.
        model_save_path: File the fitted pipeline is written to with joblib;
            missing parent directories are created.
        email_body_column: Name of the CSV column holding the e-mail text.
        category_column: Name of the CSV column holding the category label.
            The defaults keep the values that used to be hard-coded here;
            train.py in the same commit is configured for 'email' / 'type',
            so pass the names that match the dataset actually used.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for description, column in required_columns:
        if column not in df.columns:
            print(f"Error: {description} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Rows missing either the text or the label cannot be used for training.
    df.dropna(subset=[email_body_column, category_column], inplace=True)
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        X = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return
    y = df[category_column]

    print("Splitting data...")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        # Stratification is rejected when a class is too small to appear in
        # both splits; a plain random split is better than aborting here.
        print(f"Stratified split failed ({e}); falling back to a non-stratified split.")
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
        except ValueError as split_error:
            print(f"Error splitting data: {split_error}")
            return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is only reported; the trained model is still saved.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    try:
        # Directory creation is guarded too, so a permission problem is
        # reported like every other failure in this function.
        model_save_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
166
 
167
+ # --- API Endpoints ---
168
@app.get("/")
def read_root():
    # Root endpoint: confirms the service is up and points to /classify/.
    status_message = "Email Classification API is running. Use the /classify/ endpoint."
    return {"message": status_message}
171
+
172
@app.post("/classify/", response_model=ClassificationOutput)
async def classify_email(email_input: EmailInput):
    """Mask PII in the submitted e-mail and classify the masked text.

    Args:
        email_input: Request body carrying the raw e-mail text.

    Returns:
        A ``ClassificationOutput`` with the submitted text, the masked
        entities, the masked text and the predicted category.

    Raises:
        HTTPException: 503 when no model pipeline was loaded at startup;
            500 when ``predict_category`` reports a failure.
    """
    if model_pipeline is None:
        raise HTTPException(status_code=503, detail="Model not loaded. API is not ready.")

    input_email = email_input.email_body

    # 1. Mask PII. mask_pii is used here as returning the masked text plus a
    #    list of mappings whose keys match MaskedEntity's fields
    #    (assumption based on this call site; confirm against utils.mask_pii).
    masked_text, masked_entities_list = mask_pii(input_email)
    formatted_entities = [MaskedEntity(**entity) for entity in masked_entities_list]

    # 2. Predict the category from the masked text.
    predicted_category = predict_category(masked_text, model_pipeline)

    # predict_category signals failure by returning a string that starts with
    # "Error:". Surface that as a server error so clients never receive an
    # error message disguised as a category in a 200 response.
    if str(predicted_category).startswith("Error:"):
        raise HTTPException(status_code=500, detail=str(predicted_category))

    # 3. Construct and return the response.
    return ClassificationOutput(
        input_email_body=input_email,
        list_of_masked_entities=formatted_entities,
        masked_email=masked_text,
        category_of_the_email=predicted_category,
    )
197
 
198
  # Example Usage (if you run this file directly for testing/training)
199
  if __name__ == "__main__":
 
 
200
  print("Running models.py directly...")
201
  dummy_emails = [
202
  "Subject: Billing Issue My account [full_name] was charged twice for order [order_id]. Please refund.",
 
205
  ]
206
  dummy_labels = ["Billing Issues", "Technical Support", "Account Management"]
207
 
 
 
 
 
 
208
  print("Attempting to load model and predict...")
209
+ model_pipeline = load_model_pipeline()
210
+ if model_pipeline:
211
  test_email = "my login is not working help required email [email]"
212
+ category = predict_category(test_email, model_pipeline)
213
  print(f"Test Email: '{test_email}'")
214
  print(f"Predicted Category: {category}")
215
  else:
216
+ print("Cannot perform prediction as model pipeline failed to load.")
train.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # filepath: /workspaces/internship1/train.py
2
+ import pandas as pd
3
+ import joblib
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.naive_bayes import MultinomialNB
7
+ from sklearn.pipeline import Pipeline
8
+ from pathlib import Path
9
+
10
+ # --- Local Imports ---
11
+ # Ensure utils.py has the clean_text_for_classification function
12
try:
    from utils import clean_text_for_classification
except ImportError:
    print("Error: Could not import clean_text_for_classification from utils.")
    print("Make sure utils.py exists and the function is defined.")

    # Stand-in so the script can still run for testing; the import above is
    # what should be fixed, since this is not the cleaning defined in utils.py.
    def clean_text_for_classification(text: str) -> str:
        """Minimal fallback cleaner: lower-case the text and trim whitespace."""
        lowered = text.lower()
        return lowered.strip()
20
+
21
+ # --- Configuration ---
22
+ # !! ADJUST THESE PATHS AND COLUMN NAMES !!
23
+ DATASET_PATH = Path("combined_emails_with_natural_pii.csv")
24
+ MODEL_DIR = Path("saved_models")
25
+ MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
26
+ email_body_column = 'email' # <<< Ensure this is 'email'
27
+ category_column = 'type' # <<< Ensure this is 'type'
28
+
29
+ # --- Main Training Function ---
30
def train_model(data_path: Path, model_save_path: Path) -> None:
    """Load the dataset, train the TF-IDF + Naive Bayes pipeline and save it.

    The text and label columns are taken from the module-level
    ``email_body_column`` and ``category_column`` settings. Missing files,
    unreadable CSVs, missing columns, empty data and
    cleaning/splitting/training/saving errors are reported with ``print`` and
    end the run early instead of raising.

    Args:
        data_path: CSV file holding the training data.
        model_save_path: File the fitted pipeline is written to with joblib;
            missing parent directories are created.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        # The python engine with on_bad_lines='skip' drops malformed rows
        # instead of aborting the whole load.
        df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
        print("Dataset loaded. Note: Bad lines may have been skipped.")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for description, column in required_columns:
        if column not in df.columns:
            print(f"Error: {description} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Rows missing either the text or the label cannot be used for training.
    df.dropna(subset=[email_body_column, category_column], inplace=True)
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        X = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return
    y = df[category_column]

    print("Splitting data...")
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    except ValueError as e:
        # Stratification is rejected when a class is too small to appear in
        # both splits; a plain random split is better than aborting here.
        print(f"Stratified split failed ({e}); falling back to a non-stratified split.")
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
        except ValueError as split_error:
            print(f"Error splitting data: {split_error}")
            return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is only reported; the trained model is still saved.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    try:
        # Directory creation is guarded too, so a permission problem is
        # reported like every other failure in this function.
        model_save_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
108
+
109
+
110
+ # --- Script Execution ---
111
def main() -> None:
    """Create the model directory and run training with the configured paths."""
    # train_model creates the target directory as well; creating it here keeps
    # MODEL_DIR available even if training stops before the save step.
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    train_model(DATASET_PATH, MODEL_PATH)


if __name__ == "__main__":
    main()