Spaces:

siddharth786
/

email-pii-classifier-v2

Runtime error

App Files Files Community

siddharth786 committed on Apr 17

Commit

0d541e6

1 Parent(s): ce67727

Add training script, train model, and save pipeline

Browse files

Files changed (2) hide show

models.py +161 -71
train.py +114 -0

models.py CHANGED Viewed

@@ -3,105 +3,200 @@ import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
-from typing import Tuple, Any
-from pathlib import Path # <-- Add this import
-from utils import clean_text_for_classification # Import cleaning function
 # --- Constants ---
-MODEL_DIR = Path("saved_models") # <-- This line caused the error
-MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl" # Example using Path object
-VECTORIZER_PATH = MODEL_DIR / "tfidf_vectorizer.joblib"
-# Ensure the model directory exists
 MODEL_DIR.mkdir(parents=True, exist_ok=True)
 # --- Model Loading ---
-def load_model_and_vectorizer() -> Tuple[Pipeline, Any]:
     """Loads the trained model pipeline."""
-    model = None
-    vectorizer = None
-    if MODEL_PATH.exists() and VECTORIZER_PATH.exists():
         try:
-            model = joblib.load(MODEL_PATH)
-            vectorizer = joblib.load(VECTORIZER_PATH)
-            print("Model and vectorizer loaded successfully.")
         except Exception as e:
-            print(f"Error loading model or vectorizer: {e}")
-            # Handle error appropriately, maybe raise it or return None
     else:
-        print(f"Model ({MODEL_PATH}) or Vectorizer ({VECTORIZER_PATH}) not found.")
-        print("Please train and save the model and vectorizer first.")
-        # In a real app, you might trigger training or raise an error
-        # For this template, we'll proceed with None, API will handle it
-    return model, vectorizer
 # --- Prediction Function ---
-def predict_category(text: str, model: Any, vectorizer: Any) -> str:
     """
-    Predicts the email category using the loaded model and vectorizer.
     Args:
         text: The masked email text.
-        model: The loaded classification model.
-        vectorizer: The loaded text vectorizer.
     Returns:
-        The predicted category name (str) or a default/error string.
     """
-    if not model or not vectorizer:
-        return "Error: Model or Vectorizer not loaded"
     try:
-        # 1. Clean the masked text
         cleaned_text = clean_text_for_classification(text)
-        # 2. Vectorize the cleaned text
-        # Note: vectorizer.transform expects an iterable (like a list)
-        vectorized_text = vectorizer.transform([cleaned_text])
-        # 3. Predict using the model
-        prediction = model.predict(vectorized_text)
-        # prediction is likely an array, get the first element
         return prediction[0]
     except Exception as e:
         print(f"Error during prediction: {e}")
-        return "Error during prediction"
-# --- Training Function (Example - Run this separately if needed) ---
-# You would typically run this in a separate script (e.g., train.py)
-# or a Jupyter notebook, not directly within the API server process.
-def train_and_save_model(data, labels):
-    """Example function to train and save a simple model."""
-    print("Starting model training...")
-    # Create a pipeline
     pipeline = Pipeline([
-        ('vectorizer', TfidfVectorizer(stop_words='english', max_features=5000)),
-        ('classifier', MultinomialNB())
     ])
-    # Preprocess data (assuming 'data' is a list/Series of masked emails)
-    cleaned_data = [clean_text_for_classification(text) for text in data]
-    # Train the pipeline
-    pipeline.fit(cleaned_data, labels)
-    print("Model training complete.")
-    # Save the pipeline components
-    joblib.dump(pipeline.named_steps['classifier'], MODEL_PATH)
-    joblib.dump(pipeline.named_steps['vectorizer'], VECTORIZER_PATH)
-    print(f"Model saved to {MODEL_PATH}")
-    print(f"Vectorizer saved to {VECTORIZER_PATH}")
 # Example Usage (if you run this file directly for testing/training)
 if __name__ == "__main__":
-    # This block is for testing or initiating training manually.
-    # Create dummy data for demonstration if needed:
     print("Running models.py directly...")
     dummy_emails = [
         "Subject: Billing Issue My account [full_name] was charged twice for order [order_id]. Please refund.",
@@ -110,17 +205,12 @@ if __name__ == "__main__":
         ]
     dummy_labels = ["Billing Issues", "Technical Support", "Account Management"]
-    # Uncomment to train a dummy model:
-    # print("Training dummy model...")
-    # train_and_save_model(dummy_emails, dummy_labels)
-    # print("-" * 20)
     print("Attempting to load model and predict...")
-    model, vectorizer = load_model_and_vectorizer()
-    if model and vectorizer:
         test_email = "my login is not working help required email [email]"
-        category = predict_category(test_email, model, vectorizer)
         print(f"Test Email: '{test_email}'")
         print(f"Predicted Category: {category}")
     else:
-        print("Cannot perform prediction as model/vectorizer failed to load.")

 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.pipeline import Pipeline
+from sklearn.model_selection import train_test_split
+from typing import Tuple, Any, Optional, List, Dict
+from pathlib import Path
+import re
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from utils import clean_text_for_classification, mask_pii
+from models import MODEL_PATH, load_model_pipeline, predict_category
 # --- Constants ---
+MODEL_DIR = Path("saved_models")
+MODEL_PATH = MODEL_DIR / "email_classifier_pipeline.pkl"
 MODEL_DIR.mkdir(parents=True, exist_ok=True)
+# --- FastAPI App ---
+app = FastAPI()
+# --- Pydantic Models for Request/Response ---
class EmailInput(BaseModel):
    # Request payload of POST /classify/: the email text to mask and classify.
    email_body: str
class MaskedEntity(BaseModel):
    # One element of `list_of_masked_entities` in the /classify/ response.
    # NOTE(review): the field meanings below are inferred from their names;
    # confirm them against what utils.mask_pii actually returns.
    position: List[int]  # presumably [start, end] offsets of the entity
    classification: str  # presumably the PII type label
    entity: str  # presumably the original text that was masked
class ClassificationOutput(BaseModel):
    # Response body of POST /classify/.
    input_email_body: str  # the email text exactly as it was submitted
    list_of_masked_entities: List[MaskedEntity]  # entities reported by mask_pii
    masked_email: str  # the masked text returned by mask_pii
    category_of_the_email: str  # the value returned by predict_category
# --- Model Loading ---
def load_model_pipeline() -> Optional[Pipeline]:
    """Load the serialized classification pipeline from ``MODEL_PATH``.

    Problems are reported on stdout instead of being raised, so callers must
    check the result for ``None``.

    Returns:
        The object deserialized by ``joblib.load``, or ``None`` when the file
        does not exist or loading it fails.
    """
    if not MODEL_PATH.exists():
        print(f"Model pipeline not found at {MODEL_PATH}.")
        print("Please train and save the model pipeline first.")
        return None

    try:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; MODEL_PATH must only ever point at a trusted artifact.
        loaded_pipeline = joblib.load(MODEL_PATH)
    except Exception as e:
        # Best effort by design: a broken model file leaves the API running
        # without a model rather than crashing the import.
        print(f"Error loading model pipeline from {MODEL_PATH}: {e}")
        return None

    print(f"Model pipeline loaded successfully from {MODEL_PATH}")
    return loaded_pipeline


# --- Load Model at Startup ---
# Load the model pipeline once when the application starts. This statement
# must come after the function definition above: module code runs top to
# bottom, so calling the loader before its `def` raises NameError on import.
model_pipeline: Optional[Pipeline] = load_model_pipeline()
 # --- Prediction Function ---
def predict_category(text: str, model_pipeline: Optional[Pipeline]) -> str:
    """Predict the email category using the loaded model pipeline.

    Args:
        text: The masked email text.
        model_pipeline: The loaded classification pipeline, or ``None`` when
            no pipeline could be loaded.

    Returns:
        The predicted category name, or an error string beginning with
        ``"Error:"`` when no pipeline is available or prediction fails.
    """
    # Explicit None check (same test the /classify/ endpoint uses) instead of
    # relying on the truthiness of the pipeline object.
    if model_pipeline is None:
        return "Error: Model Pipeline not loaded"

    try:
        # 1. Clean the masked text (using the function from utils.py)
        cleaned_text = clean_text_for_classification(text)
        # 2. The pipeline vectorizes internally; predict expects an iterable
        #    of documents, hence the one-element list.
        prediction = model_pipeline.predict([cleaned_text])
        # 3. Return the first prediction as a builtin str so the result
        #    always matches the declared return type.
        return str(prediction[0])
    except Exception as e:
        print(f"Error during prediction: {e}")
        return "Error: Prediction failed"
+# --- Training Function ---
def train_model(
    data_path: Path,
    model_save_path: Path,
    email_body_column: str = 'body',
    category_column: str = 'category',
):
    """Load a CSV dataset, train the TF-IDF + Naive Bayes pipeline, and save it.

    Every failure is reported on stdout and ends the run early; nothing is
    raised to the caller.

    Args:
        data_path: CSV file containing the training data.
        model_save_path: File the fitted pipeline is written to. Its parent
            directory is created if it does not exist.
        email_body_column: Name of the CSV column holding the email text.
        category_column: Name of the CSV column holding the category label.

    Returns:
        None.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        df = pd.read_csv(data_path)
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for role, column in required_columns:
        if column not in df.columns:
            print(f"Error: {role} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Handle potential missing values
    df.dropna(subset=[email_body_column, category_column], inplace=True)
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[category_column]
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y  # Use stratify for balanced splits
        )
    except ValueError as e:
        # A stratified split raises ValueError when a class is too small to
        # appear on both sides; report it like every other failure here.
        print(f"Error splitting data: {e}")
        return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is only reported; the trained pipeline is still saved.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    model_save_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    try:
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
+# --- API Endpoints ---
@app.get("/")
def read_root():
    """Landing endpoint: confirm the API is up and point at /classify/."""
    status_message = "Email Classification API is running. Use the /classify/ endpoint."
    return {"message": status_message}
@app.post("/classify/", response_model=ClassificationOutput)
async def classify_email(email_input: EmailInput):
    """Mask PII in the submitted email and classify the masked text.

    Args:
        email_input: Request body carrying the email text in ``email_body``.

    Returns:
        A ``ClassificationOutput`` with the original text, the masked
        entities, the masked text and the predicted category.

    Raises:
        HTTPException: 503 when no model pipeline is loaded; 500 when
            ``predict_category`` reports a failure.
    """
    if model_pipeline is None:
        raise HTTPException(status_code=503, detail="Model not loaded. API is not ready.")

    input_email = email_input.email_body

    # 1. Mask PII
    masked_text, masked_entities_list = mask_pii(input_email)
    # Each item is expanded into MaskedEntity keyword arguments, so its keys
    # must match that model's fields.
    formatted_entities = [MaskedEntity(**entity) for entity in masked_entities_list]

    # 2. Predict Category using the masked text
    predicted_category = str(predict_category(masked_text, model_pipeline))
    # predict_category signals failure with an "Error: ..." string instead of
    # raising; surface that as a server error rather than returning it to the
    # client as if it were a real category.
    if predicted_category.startswith("Error:"):
        raise HTTPException(status_code=500, detail=predicted_category)

    # 3. Construct and return the response
    return ClassificationOutput(
        input_email_body=input_email,
        list_of_masked_entities=formatted_entities,
        masked_email=masked_text,
        category_of_the_email=predicted_category,
    )
 # Example Usage (if you run this file directly for testing/training)
 if __name__ == "__main__":
     print("Running models.py directly...")
     dummy_emails = [
         "Subject: Billing Issue My account [full_name] was charged twice for order [order_id]. Please refund.",
         ]
     dummy_labels = ["Billing Issues", "Technical Support", "Account Management"]
     print("Attempting to load model and predict...")
+    model_pipeline = load_model_pipeline()
+    if model_pipeline:
         test_email = "my login is not working help required email [email]"
+        category = predict_category(test_email, model_pipeline)
         print(f"Test Email: '{test_email}'")
         print(f"Predicted Category: {category}")
     else:
+        print("Cannot perform prediction as model pipeline failed to load.")

train.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# filepath: /workspaces/internship1/train.py
+import pandas as pd
+import joblib
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
+from pathlib import Path
+# --- Local Imports ---
+# Ensure utils.py has the clean_text_for_classification function
try:
    from utils import clean_text_for_classification
except ImportError:
    print(
        "Error: Could not import clean_text_for_classification from utils.",
        "Make sure utils.py exists and the function is defined.",
        sep="\n",
    )

    # Basic stand-in so the script can still run for testing; the real fix is
    # to repair the import above.
    def clean_text_for_classification(text: str) -> str:
        lowered = text.lower()
        return lowered.strip()
# --- Configuration ---
# !! ADJUST THESE PATHS AND COLUMN NAMES !!
# Input CSV, output directory, and the pipeline file written inside it.
DATASET_PATH = Path("combined_emails_with_natural_pii.csv")
MODEL_DIR = Path("saved_models")
MODEL_PATH = MODEL_DIR.joinpath("email_classifier_pipeline.pkl")
# CSV column names read by train_model.
email_body_column = 'email'  # <<< Ensure this is 'email'
category_column = 'type'  # <<< Ensure this is 'type'
+# --- Main Training Function ---
def train_model(data_path: Path, model_save_path: Path):
    """Load the CSV dataset, train the classification pipeline, and save it.

    The email-text and label column names are read from the module-level
    ``email_body_column`` and ``category_column`` settings. Every failure is
    reported on stdout and ends the run early; nothing is raised.

    Args:
        data_path: CSV file containing the training data.
        model_save_path: File the fitted pipeline is written to. Its parent
            directory is created if it does not exist.

    Returns:
        None.
    """
    if not data_path.exists():
        print(f"Error: Dataset not found at {data_path}")
        print("Please make sure the CSV file is uploaded to your Codespace.")
        return

    print(f"Loading dataset from {data_path}...")
    try:
        # Keep the on_bad_lines='skip' if it worked
        df = pd.read_csv(data_path, engine='python', on_bad_lines='skip')
        print("Dataset loaded. Note: Bad lines may have been skipped.")
    except Exception as e:
        print(f"Error loading CSV: {e}")
        return

    # --- Data Validation ---
    required_columns = (
        ("Email body", email_body_column),
        ("Category", category_column),
    )
    for role, column in required_columns:
        if column not in df.columns:
            print(f"Error: {role} column '{column}' not found in the dataset.")
            print(f"Available columns: {df.columns.tolist()}")
            return

    # Handle potential missing values
    df.dropna(subset=[email_body_column, category_column], inplace=True)
    if df.empty:
        print("Error: No valid data remaining after handling missing values.")
        return

    print("Applying text cleaning...")
    try:
        df['cleaned_text'] = df[email_body_column].astype(str).apply(clean_text_for_classification)
    except Exception as e:
        print(f"Error during text cleaning: {e}")
        return

    print("Splitting data...")
    X = df['cleaned_text']
    y = df[category_column]
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y  # Use stratify for balanced splits
        )
    except ValueError as e:
        # A stratified split raises ValueError when a class is too small to
        # appear on both sides; report it like every other failure here.
        print(f"Error splitting data: {e}")
        return

    # --- Model Pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', MultinomialNB())  # Using Naive Bayes as a starting point
    ])

    print("Training model...")
    try:
        pipeline.fit(X_train, y_train)
        print("Training complete.")
    except Exception as e:
        print(f"Error during model training: {e}")
        return

    # --- Evaluation ---
    # A failed evaluation is only reported; the trained pipeline is still saved.
    try:
        accuracy = pipeline.score(X_test, y_test)
        print(f"Model Accuracy on Test Set: {accuracy:.4f}")
    except Exception as e:
        print(f"Error during model evaluation: {e}")

    # --- Save Model ---
    print(f"Saving model pipeline to {model_save_path}...")
    model_save_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure directory exists
    try:
        joblib.dump(pipeline, model_save_path)
        print("Model pipeline saved successfully.")
    except Exception as e:
        print(f"Error saving model pipeline: {e}")
# --- Script Execution ---
if __name__ == "__main__":
    # Create the model directory up front, then train on the configured
    # dataset and write the pipeline to the configured path.
    MODEL_DIR.mkdir(exist_ok=True, parents=True)
    train_model(data_path=DATASET_PATH, model_save_path=MODEL_PATH)