#!/usr/bin/env python3
"""
Hidradenitis Suppurativa (HS) AI System Performance Dashboard
Interactive Streamlit dashboard for analyzing HS test suite results
"""

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import json
import os

# --- Page Configuration ---
st.set_page_config(
    page_title="HS AI Performance Dashboard",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Data Loading and Caching ---
@st.cache_data
def load_and_prepare_data():
    """Load merged dataset and metrics"""
    try:
        df_merged = pd.read_csv('HS_merged_analysis.csv')
        # Load metrics if available
        metrics = {}
        if os.path.exists('HS_metrics_summary.json'):
            with open('HS_metrics_summary.json', 'r') as f:
                metrics = json.load(f)
        return df_merged, metrics
    except FileNotFoundError as e:
        st.error(f"ERROR: Data file not found. Please ensure CSV files are in the correct location. Details: {e}")
        return None, None

# --- Load Data ---
df_merged, metrics = load_and_prepare_data()

# --- Sidebar Navigation ---
st.sidebar.title("📊 HS AI Performance Dashboard")
page = st.sidebar.radio(
    "Navigate to:",
    (
        "Executive Summary & Overview",
        "Test Suite Insights",
        "Categorization Performance",
        "Answer Matching & Quality",
        "Performance by Question Source",
        "Video Insights",
        "Demographic Deep-Dive",
        "Interactive Question Explorer"
    )
)
st.sidebar.markdown("---")
st.sidebar.info("This dashboard analyzes the Mediflix AI system's performance on the Hidradenitis Suppurativa (HS) test suite.")

# Stop if data not loaded
if df_merged is None or df_merged.empty:
    st.error("Data could not be loaded. Please check file paths and ensure CSVs are available.")
    st.stop()

# --- Column name references ---
ai_cat_col = 'AI_Predicted_Category'
gt_cat_col = 'Target_Category_GroundTruth'
match_type_col = 'Answer Match Type'
relevancy_col = 'Advice Video 1 Relevancy Score'
video_id_col = 'Advice Video 1 ID'
video_title_col = 'Advice Video 1 Question Title'
video_text_col = 'Advice Video 1 Answer Text'
source_col = 'Source'
orig_q_resp_col = 'Original_Question_from_AI_Response'
orig_q_suite_col = 'Original_Question_from_Suite'
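# Shape of HS_metrics_summary.json, reconstructed from the keys this script
# reads below (a hedged sketch of the contract with the upstream analysis,
# not a formal schema). The 'rank' and 'video_id' key names in top_20_videos
# are assumptions: only 'frequency' and 'avg_relevancy' are read by name; the
# first two fields are simply renamed to Rank / Video ID when displayed.
#
#   categorization:          {accuracy_pct: float}
#   answer_matching:         {exact_match_pct, combined_match_pct, no_match_pct: float}
#   answer_quality:          {mean_score, high_quality_pct: float}
#   source_performance:      {<source name>: {total_questions: int, exact_match_pct,
#                             combined_match_pct, no_match_pct, avg_relevancy: float}}
#   video_usage:             {unique_videos_as_top_answer, unique_videos_in_top3,
#                             total_video_presentations: int,
#                             top_20_videos: [{rank, video_id, frequency, avg_relevancy}, ...]}
#   demographic_performance: {Women_20to40_Focus: {name: str, total_questions: int,
#                             exact_match_pct, combined_match_pct, avg_relevancy: float},
#                             Black_Patients_Focus: {...same keys...}}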
# =============================================================================
# PAGE 1: EXECUTIVE SUMMARY & OVERVIEW
# =============================================================================
if page == "Executive Summary & Overview":
    st.title("📈 Executive Summary & HS AI Performance Overview")
    st.markdown("---")
    st.markdown("""
This dashboard provides a comprehensive analysis of the Mediflix AI system's performance on
**Hidradenitis Suppurativa (HS)** questions. The test suite contains **431 questions** covering
medical knowledge, daily life management, emotional support, and demographic-specific concerns.

**Test Suite Coverage:**
- 281 Patient Forum questions (real patient language)
- 81 Expert Clinical questions (medical knowledge)
- 33 Women 20-40 Focus questions (reproductive health)
- 15 Black Patients Focus questions (health equity)
- 21 questions from additional sources
""")

    # KPI Calculations
    st.subheader("Key Performance Indicators (KPIs)")

    # Categorization accuracy: prefer the precomputed metric, else derive it
    if metrics and 'categorization' in metrics:
        cat_accuracy = metrics['categorization'].get('accuracy_pct', 0)
    else:
        df_cat = df_merged.dropna(subset=[ai_cat_col, gt_cat_col])
        if not df_cat.empty:
            correct = (df_cat[ai_cat_col] == df_cat[gt_cat_col]).sum()
            total = len(df_cat)
            cat_accuracy = (correct / total * 100) if total > 0 else 0
        else:
            cat_accuracy = 0

    # Match rates
    if metrics and 'answer_matching' in metrics:
        exact_match_pct = metrics['answer_matching'].get('exact_match_pct', 0)
        combined_match_pct = metrics['answer_matching'].get('combined_match_pct', 0)
        no_match_pct = metrics['answer_matching'].get('no_match_pct', 0)
    else:
        df_searchable = df_merged[df_merged[ai_cat_col] == 'searchable']
        df_match = df_searchable.dropna(subset=[match_type_col])
        if not df_match.empty:
            match_counts = df_match[match_type_col].value_counts()
            total = len(df_match)
            exact_match_pct = (match_counts.get('exact-match', 0) / total * 100)
            loose_match_pct = (match_counts.get('loose-match', 0) / total * 100)
            combined_match_pct = exact_match_pct + loose_match_pct
            no_match_pct = (match_counts.get('no-match', 0) / total * 100)
        else:
            exact_match_pct = combined_match_pct = no_match_pct = 0

    # Quality metrics
    if metrics and 'answer_quality' in metrics:
        avg_relevancy = metrics['answer_quality'].get('mean_score', 0)
        high_quality_pct = metrics['answer_quality'].get('high_quality_pct', 0)
    else:
        df_quality = df_merged[df_merged[match_type_col].isin(['exact-match', 'loose-match'])]
        df_quality = df_quality.dropna(subset=[relevancy_col])
        if not df_quality.empty:
            avg_relevancy = df_quality[relevancy_col].mean()
            high_quality_pct = ((df_quality[relevancy_col] >= 0.7).sum() / len(df_quality) * 100)
        else:
            avg_relevancy = high_quality_pct = 0

    # Display KPIs
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(label="Categorization Accuracy", value=f"{cat_accuracy:.2f}%")
    with col2:
        st.metric(label="Combined Match Rate", value=f"{combined_match_pct:.1f}%")
    with col3:
        st.metric(label="Avg Relevancy Score", value=f"{avg_relevancy:.3f}")
    with col4:
        st.metric(label="High Quality Answers (≥0.7)", value=f"{high_quality_pct:.1f}%")

    st.markdown("---")
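    # The three match-rate fallbacks above can be collapsed into one helper;
    # a minimal sketch (not wired into the page), assuming the same column
    # names and the 'exact-match' / 'loose-match' / 'no-match' labels used
    # throughout this script:
    def _match_rates(df):
        """Return exact / combined / no-match percentages for AI-'searchable' rows."""
        scored = df[df[ai_cat_col] == 'searchable'].dropna(subset=[match_type_col])
        if scored.empty:
            return {'exact': 0.0, 'combined': 0.0, 'no_match': 0.0}
        pcts = scored[match_type_col].value_counts(normalize=True) * 100
        exact = pcts.get('exact-match', 0.0)
        return {'exact': exact,
                'combined': exact + pcts.get('loose-match', 0.0),
                'no_match': pcts.get('no-match', 0.0)}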
    # Comparison to Obesity Benchmark
    st.subheader("📊 Performance vs. Obesity Suite Benchmark")
    benchmark_data = {
        'Metric': ['Categorization Accuracy', 'Exact Match Rate', 'Combined Match Rate',
                   'No Match Rate', 'High Quality %'],
        'HS Suite': [f"{cat_accuracy:.2f}%", f"{exact_match_pct:.1f}%",
                     f"{combined_match_pct:.1f}%", f"{no_match_pct:.1f}%",
                     f"{high_quality_pct:.1f}%"],
        'Obesity Benchmark': ['98.86%', '66.1%', '87.9%', '12.1%', '68.4%'],
        'Difference': [
            f"{cat_accuracy - 98.86:.2f}%",
            f"{exact_match_pct - 66.1:.1f}%",
            f"{combined_match_pct - 87.9:.1f}%",
            f"{no_match_pct - 12.1:.1f}%",
            f"{high_quality_pct - 68.4:.1f}%"
        ]
    }
    df_benchmark = pd.DataFrame(benchmark_data)
    st.dataframe(df_benchmark, use_container_width=True)

    st.markdown("---")

    # Key Findings
    st.subheader("📋 Summary of Key Findings")
    st.markdown(f"""
**Strengths:**
- ✅ **Exact Match Rate:** {exact_match_pct:.1f}% (5.4 points better than the obesity benchmark)
- ✅ **Combined Match Rate:** {combined_match_pct:.1f}% (0.4 points better than benchmark)
- ✅ **High Quality Answers:** {high_quality_pct:.1f}% of matched questions score ≥0.7 relevancy
- ✅ **Low No-Match Rate:** {no_match_pct:.1f}% (slightly better than benchmark)

**Areas for Improvement:**
- ⚠️ **Categorization:** {cat_accuracy:.2f}% accuracy with 21 misclassifications (mostly searchable → contact_doctor)
- ⚠️ **Women 20-40 Content:** Lower relevancy scores (0.617), indicating content gaps
- ⚠️ **Black Patient Content:** Lowest relevancy scores (0.588) - urgent priority for health equity
- ⚠️ **Expert Clinical:** 17.3% no-match rate suggests gaps in fundamental medical content

**Overall Grade: B+ (87/100)** - Strong performance with targeted improvement opportunities
""")

    st.markdown("---")

    # Top Recommendations
    st.subheader("🎯 Top Recommendations")
    st.markdown("""
1. **Review the 21 Misclassified Questions** - Determine whether conservative routing is appropriate or new content is needed
2. **Create 15-20 Women's Health Videos** - Address menstruation, pregnancy, birth control, and intimacy topics
3. **Create 10-15 Black Patient Videos** - Focus on genetics, disparities, and culturally competent care
4. **Address Expert Clinical Gaps** - Reduce the 17.3% no-match rate in clinical questions
5. **Review Low-Quality Catchalls** - Audit videos 4827, 4967, and 4895 (high use, low relevancy)
6. **Process Remaining Questions** - Complete the full 500-question suite for comprehensive coverage
""")
# =============================================================================
# PAGE 2: TEST SUITE INSIGHTS
# =============================================================================
elif page == "Test Suite Insights":
    st.title("📝 Test Suite Insights")
    st.markdown("This page provides an overview of the test suite used for this evaluation.")
    st.markdown("---")

    # Basic stats
    st.subheader("Test Suite Overview")
    st.write(f"**Total questions in test suite:** {len(df_merged)}")
    st.write("**Expected questions:** 500")
    st.write(f"**Coverage:** {len(df_merged)/500*100:.1f}%")
    st.info("Note: The test suite contains 431 questions (86.2% of the planned 500-question suite). This represents a high-quality subset for initial validation.")

    # First 10 questions
    st.markdown("---")
    st.subheader("Sample Questions from Test Suite")
    display_cols = [orig_q_suite_col, gt_cat_col, source_col, 'Condition']
    display_cols = [col for col in display_cols if col in df_merged.columns]
    st.dataframe(df_merged[display_cols].head(10), use_container_width=True)

    # Question distribution by source
    st.markdown("---")
    st.subheader("Distribution of Questions by Source")
    if source_col in df_merged.columns:
        source_counts = df_merged[source_col].value_counts()
        st.write("**Number of Questions per Source:**")
        st.dataframe(source_counts.to_frame(name='Question Count'), use_container_width=True)

        # Bar chart
        fig_source, ax_source = plt.subplots(figsize=(12, 6))
        source_counts.plot(kind='bar', ax=ax_source, color='dodgerblue')
        ax_source.set_title('Number of Questions per Source in Test Suite', fontsize=14)
        ax_source.set_ylabel('Number of Questions', fontsize=12)
        ax_source.set_xlabel('Source', fontsize=12)
        plt.xticks(rotation=45, ha='right', fontsize=10)
        plt.yticks(fontsize=10)
        ax_source.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()
        st.pyplot(fig_source)
    else:
        st.write("Source column not found in dataset.")

    # Ground truth category distribution
    st.markdown("---")
    st.subheader("Ground Truth Target Categories")
    if gt_cat_col in df_merged.columns:
        gt_counts = df_merged[gt_cat_col].value_counts()
        st.write("**Target Category Distribution:**")
        st.dataframe(gt_counts.to_frame(name='Count'), use_container_width=True)
        st.info("All 431 questions in this test suite are tagged as 'searchable', indicating they should be answerable by the AI system with video content.")

    # Condition distribution
    st.markdown("---")
    st.subheader("Condition Distribution")
    if 'Condition' in df_merged.columns:
        condition_counts = df_merged['Condition'].value_counts()
        st.write("**Conditions Covered:**")
        st.dataframe(condition_counts.to_frame(name='Count'), use_container_width=True)
# =============================================================================
# PAGE 3: CATEGORIZATION PERFORMANCE
# =============================================================================
elif page == "Categorization Performance":
    st.title("📊 AI Question Categorization Performance")
    st.markdown("This section evaluates how accurately the AI system categorizes questions into predefined types.")
    st.markdown("---")

    # Overall accuracy
    st.subheader("Overall Categorization Accuracy")
    df_cat = df_merged.dropna(subset=[ai_cat_col, gt_cat_col])
    total = len(df_cat)  # defined unconditionally so later sections never hit a NameError
    if not df_cat.empty:
        correct = (df_cat[ai_cat_col] == df_cat[gt_cat_col]).sum()
        incorrect = total - correct
        accuracy = (correct / total * 100) if total > 0 else 0

        col1, col2, col3 = st.columns(3)
        col1.metric(label="Total Questions Evaluated", value=total)
        col2.metric(label="Correctly Categorized", value=correct)
        col3.metric(label="Overall Accuracy", value=f"{accuracy:.2f}%")
        if incorrect > 0:
            st.metric(label="Misclassified Questions", value=incorrect,
                      delta=f"-{incorrect}", delta_color="inverse")
    else:
        st.write("No data available for categorization analysis.")

    # Misclassified questions
    st.markdown("---")
    st.subheader("Detailed List of Misclassified Questions")
    df_miscat = df_cat[df_cat[ai_cat_col] != df_cat[gt_cat_col]]
    if not df_miscat.empty:
        st.write(f"**Found {len(df_miscat)} misclassified questions:**")
        display_cols = [orig_q_suite_col, gt_cat_col, ai_cat_col, source_col]
        display_cols = [col for col in display_cols if col in df_miscat.columns]
        st.dataframe(df_miscat[display_cols], use_container_width=True)

        # Misclassification patterns
        st.markdown("**Misclassification Patterns:**")
        miscat_patterns = df_miscat.groupby([gt_cat_col, ai_cat_col]).size().reset_index(name='Count')
        miscat_patterns['Pattern'] = miscat_patterns[gt_cat_col] + ' → ' + miscat_patterns[ai_cat_col]
        st.dataframe(miscat_patterns[['Pattern', 'Count']].sort_values('Count', ascending=False),
                     use_container_width=True)
    else:
        st.success("✅ No misclassified questions found!")

    # Classification report
    st.markdown("---")
    st.subheader("Classification Report (Precision, Recall, F1-Score)")
    if total > 0:
        all_categories = sorted(list(set(df_cat[gt_cat_col].unique()) |
                                     set(df_cat[ai_cat_col].unique()) |
                                     {'searchable', 'contact_doctor', 'emergency', 'unknown'}))
        report_str = classification_report(
            df_cat[gt_cat_col],
            df_cat[ai_cat_col],
            labels=all_categories,
            zero_division=0,
            output_dict=False
        )
        st.text_area("Classification Report:", value=report_str, height=300)
        st.caption("**Interpretation:** Precision (how many selected items are relevant), Recall (how many relevant items are selected), F1-Score (their harmonic balance), Support (number of true instances).")

    # Confusion matrix
    st.markdown("---")
    st.subheader("Confusion Matrix Visualization")
    if total > 0 and len(df_cat[gt_cat_col].unique()) >= 1:
        cm = confusion_matrix(df_cat[gt_cat_col], df_cat[ai_cat_col], labels=all_categories)
        fig_cm, ax_cm = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=all_categories, yticklabels=all_categories, ax=ax_cm)
        ax_cm.set_title('Categorization Confusion Matrix', fontsize=14)
        ax_cm.set_ylabel('True Target Category', fontsize=12)
        ax_cm.set_xlabel('AI Predicted Category', fontsize=12)
        plt.tight_layout()
        st.pyplot(fig_cm)
        st.caption("**Note:** The diagonal shows correct classifications. Off-diagonal values indicate misclassifications.")
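    # How the per-class rows of the report above are derived, as a minimal
    # illustrative sketch (not wired into the page). It assumes a square
    # confusion matrix with rows = true labels and columns = predicted labels:
    def _per_class_precision_recall(cm, class_idx):
        """Precision = TP / column sum; recall = TP / row sum, for one class."""
        tp = cm[class_idx, class_idx]        # true positives sit on the diagonal
        predicted = cm[:, class_idx].sum()   # everything the model called this class
        actual = cm[class_idx, :].sum()      # everything that truly is this class
        precision = tp / predicted if predicted else 0.0
        recall = tp / actual if actual else 0.0
        return precision, recall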
Off-diagonal values indicate misclassifications.") # ============================================================================= # PAGE 4: ANSWER MATCHING & QUALITY # ============================================================================= elif page == "Answer Matching & Quality": st.title("✨ Answer Matching & Quality Analysis") st.markdown("This section assesses the AI system's performance in matching 'searchable' questions to video answers and the quality of those matches.") st.markdown("---") # Overall match type distribution st.subheader("Overall Answer Match Type Distribution") st.markdown("Breakdown of how AI-predicted 'searchable' questions were matched to video content.") df_searchable = df_merged[df_merged[ai_cat_col] == 'searchable'].copy() df_match = df_searchable.dropna(subset=[match_type_col]) if not df_match.empty: total_searchable = len(df_match) st.write(f"**Total AI-predicted 'searchable' questions:** {total_searchable}") match_counts = df_match[match_type_col].value_counts() match_pcts = df_match[match_type_col].value_counts(normalize=True) * 100 col1, col2 = st.columns(2) with col1: st.write("**Counts:**") st.dataframe(match_counts.to_frame(name='Count'), use_container_width=True) with col2: st.write("**Percentages:**") pct_df = match_pcts.round(1).astype(str) + '%' st.dataframe(pct_df.to_frame(name='Percentage'), use_container_width=True) # Bar chart fig_match, ax_match = plt.subplots(figsize=(10, 6)) match_order = ['exact-match', 'loose-match', 'no-match'] colors = ['#2ECC71', '#F1C40F', '#E74C3C'] plot_counts = match_counts.reindex(match_order, fill_value=0) plot_counts.plot(kind='bar', ax=ax_match, color=colors) ax_match.set_title("Answer Match Type Distribution\n(for AI-predicted 'searchable' questions)", fontsize=14) ax_match.set_ylabel("Number of Questions", fontsize=12) ax_match.set_xlabel("Answer Match Type", fontsize=12) plt.xticks(rotation=0, fontsize=10) plt.yticks(fontsize=10) # Add percentage labels for i, count in enumerate(plot_counts): if plot_counts.index[i] in match_pcts.index: pct = match_pcts[plot_counts.index[i]] ax_match.text(i, count + (0.01 * total_searchable), f'{pct:.1f}%', ha='center', va='bottom', fontsize=10) plt.tight_layout() st.pyplot(fig_match) # Key metrics exact_pct = match_pcts.get('exact-match', 0) loose_pct = match_pcts.get('loose-match', 0) no_match_pct = match_pcts.get('no-match', 0) combined_pct = exact_pct + loose_pct col1, col2, col3 = st.columns(3) col1.metric("Exact Match Rate", f"{exact_pct:.1f}%") col2.metric("Combined Match Rate", f"{combined_pct:.1f}%") col3.metric("No Match Rate", f"{no_match_pct:.1f}%") else: st.write("No searchable questions found with match type data.") # Relevancy score analysis st.markdown("---") st.subheader("Answer Relevancy Score Analysis") st.markdown("Analyzes the quality of matched answers using the relevancy score (0-1 scale).") df_matched = df_merged[df_merged[match_type_col].isin(['exact-match', 'loose-match'])].copy() df_quality = df_matched.dropna(subset=[relevancy_col]) if not df_quality.empty: st.write(f"**Analyzing {len(df_quality)} matched questions with relevancy scores:**") scores = df_quality[relevancy_col] # Descriptive statistics col1, col2 = st.columns(2) with col1: st.write("**Descriptive Statistics:**") stats_df = scores.describe().to_frame() stats_df.columns = ['Value'] st.dataframe(stats_df.style.format("{:.3f}"), use_container_width=True) with col2: st.write("**Quality Distribution:**") high_quality = (scores >= 0.7).sum() medium_quality = ((scores >= 0.5) & (scores 
            quality_data = {
                'Quality Tier': ['High (≥0.7)', 'Medium (0.5-0.7)', 'Low (<0.5)'],
                'Count': [high_quality, medium_quality, low_quality],
                'Percentage': [
                    f"{high_quality/len(scores)*100:.1f}%",
                    f"{medium_quality/len(scores)*100:.1f}%",
                    f"{low_quality/len(scores)*100:.1f}%"
                ]
            }
            st.dataframe(pd.DataFrame(quality_data), use_container_width=True)

        # Histogram
        fig_hist, ax_hist = plt.subplots(figsize=(12, 6))
        sns.histplot(scores, kde=True, bins=20, color='teal', ax=ax_hist)
        ax_hist.set_title('Distribution of Relevancy Scores\n(for Exact and Loose Matches)', fontsize=14)
        ax_hist.set_xlabel('Relevancy Score (0-1)', fontsize=12)
        ax_hist.set_ylabel('Number of Questions', fontsize=12)
        ax_hist.axvline(x=0.7, color='red', linestyle='--', label='High Quality Threshold (0.7)')
        ax_hist.legend()
        ax_hist.grid(axis='y', alpha=0.75)
        plt.tight_layout()
        st.pyplot(fig_hist)

        # Key metric
        high_quality_pct = (high_quality / len(scores) * 100)
        st.metric(label="High Quality Answers (≥0.7)",
                  value=f"{high_quality_pct:.1f}%",
                  help=f"{high_quality} out of {len(scores)} matched questions")
    else:
        st.write("No matched questions with relevancy scores to analyze.")

    # Relevancy by match type (stacked histogram)
    st.markdown("---")
    st.subheader("Relevancy Score Distribution by Match Type")
    df_plot = df_searchable.copy()
    # Set no-match scores to 0 for visualization
    df_plot.loc[df_plot[match_type_col] == 'no-match', relevancy_col] = 0.0
    df_plot_clean = df_plot.dropna(subset=[relevancy_col, match_type_col])

    if not df_plot_clean.empty:
        match_order = ['exact-match', 'loose-match', 'no-match']
        palette = {'exact-match': '#2ECC71', 'loose-match': '#F1C40F', 'no-match': '#E74C3C'}
        df_plot_filtered = df_plot_clean[df_plot_clean[match_type_col].isin(match_order)]

        fig_stacked, ax_stacked = plt.subplots(figsize=(14, 8))
        sns.histplot(
            data=df_plot_filtered,
            x=relevancy_col,
            hue=match_type_col,
            hue_order=match_order,
            multiple="stack",
            kde=False,
            bins=np.arange(-0.025, 1.075, 0.05),
            palette=palette,
            edgecolor='black',
            linewidth=0.5,
            ax=ax_stacked
        )
        ax_stacked.set_title('Distribution of Relevancy Scores by Answer Match Type\n(for AI-predicted Searchable Questions)', fontsize=16)
        ax_stacked.set_xlabel('Relevancy Score (0-1; "no-match" at 0)', fontsize=12)
        ax_stacked.set_ylabel('Number of Questions', fontsize=12)
        ax_stacked.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(np.arange(0, 1.1, 0.1), fontsize=10)
        plt.yticks(fontsize=10)
        ax_stacked.set_xlim(-0.05, 1.05)
        plt.tight_layout()
        st.pyplot(fig_stacked)
    else:
        st.write("No data available for relevancy distribution by match type.")
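    # The 0.5 / 0.7 cut points used throughout this page, captured once as a
    # hedged helper sketch (not wired in; shown to make the tier definition
    # explicit in one place):
    def _quality_tier(score):
        """Map a 0-1 relevancy score to the dashboard's quality tier labels."""
        if score >= 0.7:
            return 'High (≥0.7)'
        if score >= 0.5:
            return 'Medium (0.5-0.7)'
        return 'Low (<0.5)'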
# =============================================================================
# PAGE 5: PERFORMANCE BY QUESTION SOURCE
# =============================================================================
elif page == "Performance by Question Source":
    st.title("📂 Performance Analysis by Question Source")
    st.markdown("This page analyzes AI system performance across the different question sources.")
    st.markdown("---")

    # Overall source performance comparison
    st.subheader("Match Rate Performance by Source")
    if metrics and 'source_performance' in metrics:
        source_data = metrics['source_performance']

        # Create comparison dataframe
        source_df = pd.DataFrame([
            {
                'Source': source_name,
                'Total Questions': data['total_questions'],
                'Exact Match %': data['exact_match_pct'],
                'Combined Match %': data['combined_match_pct'],
                'No Match %': data['no_match_pct'],
                'Avg Relevancy': data['avg_relevancy']
            }
            for source_name, data in source_data.items()
        ])

        # Sort by combined match rate
        source_df = source_df.sort_values('Combined Match %', ascending=False)
        st.dataframe(source_df, use_container_width=True)

        # Visualizations
        st.markdown("---")
        st.subheader("Combined Match Rate by Source")
        fig1, ax1 = plt.subplots(figsize=(12, 6))
        colors = ['#2ECC71' if x >= 85 else '#F1C40F' if x >= 75 else '#E74C3C'
                  for x in source_df['Combined Match %']]
        ax1.barh(source_df['Source'], source_df['Combined Match %'], color=colors)
        ax1.set_xlabel('Combined Match Rate (%)', fontsize=12)
        ax1.set_title('Combined Match Rate by Question Source', fontsize=14)
        ax1.axvline(x=87.9, color='gray', linestyle='--', label='Obesity Benchmark (87.9%)')
        ax1.legend()
        ax1.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        st.pyplot(fig1)

        # Relevancy comparison
        st.markdown("---")
        st.subheader("Average Relevancy Score by Source")
        fig2, ax2 = plt.subplots(figsize=(12, 6))
        colors2 = ['#2ECC71' if x >= 0.7 else '#F1C40F' if x >= 0.6 else '#E74C3C'
                   for x in source_df['Avg Relevancy']]
        ax2.barh(source_df['Source'].values[::-1], source_df['Avg Relevancy'].values[::-1],
                 color=colors2[::-1])
        ax2.set_xlabel('Average Relevancy Score', fontsize=12)
        ax2.set_title('Average Relevancy Score by Question Source', fontsize=14)
        ax2.axvline(x=0.725, color='gray', linestyle='--', label='Overall Mean (0.725)')
        ax2.axvline(x=0.7, color='red', linestyle='--', alpha=0.5, label='High Quality Threshold')
        ax2.legend()
        ax2.grid(axis='x', alpha=0.3)
        ax2.set_xlim(0.5, 0.85)
        plt.tight_layout()
        st.pyplot(fig2)

    # No-match examples by source
    st.markdown("---")
    st.subheader("No-Match Questions by Source")
    df_no_match = df_merged[df_merged[match_type_col] == 'no-match'].copy()
    if not df_no_match.empty:
        st.write(f"**Total no-match questions: {len(df_no_match)}**")

        # Group by source
        no_match_by_source = df_no_match.groupby(source_col).size().sort_values(ascending=False)

        col1, col2 = st.columns([1, 2])
        with col1:
            st.write("**No-Match Count by Source:**")
            st.dataframe(no_match_by_source.to_frame(name='Count'), use_container_width=True)
        with col2:
            # Select source to view questions
            selected_source = st.selectbox("Select source to view no-match questions:",
                                           options=no_match_by_source.index.tolist())
            df_source_no_match = df_no_match[df_no_match[source_col] == selected_source]
            st.write(f"**{len(df_source_no_match)} no-match questions from {selected_source}:**")
            display_cols = [orig_q_suite_col, gt_cat_col, source_col]
            display_cols = [col for col in display_cols if col in df_source_no_match.columns]
            st.dataframe(df_source_no_match[display_cols].reset_index(drop=True),
                         use_container_width=True)
    else:
        st.success("✅ No questions with no-match status!")

    # Key insights
    st.markdown("---")
    st.subheader("💡 Key Insights by Source")
    st.markdown("""
**Patient Forums (281 questions):**
- ✅ **Best Performance:** 90.0% combined match rate (highest of all sources)
- ✅ **Good Relevancy:** 0.734 average (slightly above the overall mean)
- 📌 Represents real patient language and concerns
- 📌 Only 28 no-match questions (9.96%)

**Expert Clinical (81 questions):**
- ⚠️ **Lower Match Rate:** 82.7% combined (below benchmark)
- ✅ **Highest Relevancy:** 0.763 average (best quality when matched)
- ⚠️ **14 no-match questions (17.3%)** - suggests gaps in fundamental medical content
- 📌 Prioritize creating expert-level educational content

**Women 20-40 Focus (33 questions):**
- ⚠️ **Second-Lowest Relevancy:** 0.617 average (well below the mean of 0.725)
- ✅ **Decent Match Rate:** 87.9% combined
- 📌 Content exists but quality is lower - review and improve
- 📌 Focus on reproductive health, pregnancy, and intimacy topics

**Black Patients Focus (15 questions):**
- ⚠️ **Lowest Relevancy:** 0.588 average (critical gap)
- ✅ **Decent Match Rate:** 86.7% combined
- 📌 **Urgent Priority:** Create culturally competent content
- 📌 Address genetic factors, disparities, and community concerns
""")
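    # The comparison table above renders only when HS_metrics_summary.json is
    # present. A hedged fallback sketch (not wired in) deriving the same
    # per-source rates directly from the merged frame, using the column names
    # defined at the top of this script:
    def _rates_by_source(df):
        """Per-source exact / combined / no-match percentages and mean relevancy."""
        rows = []
        for source_name, sub in df.dropna(subset=[match_type_col]).groupby(source_col):
            pcts = sub[match_type_col].value_counts(normalize=True) * 100
            exact = pcts.get('exact-match', 0.0)
            rows.append({'Source': source_name,
                         'Total Questions': len(sub),
                         'Exact Match %': exact,
                         'Combined Match %': exact + pcts.get('loose-match', 0.0),
                         'No Match %': pcts.get('no-match', 0.0),
                         'Avg Relevancy': sub[relevancy_col].mean()})
        return pd.DataFrame(rows)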
# =============================================================================
# PAGE 6: VIDEO INSIGHTS
# =============================================================================
elif page == "Video Insights":
    st.title("🎥 Video Library Performance Insights")
    st.markdown("This page analyzes how video content is being utilized to answer HS questions.")
    st.markdown("---")

    # Video usage summary
    st.subheader("Video Library Utilization Summary")
    if metrics and 'video_usage' in metrics:
        video_metrics = metrics['video_usage']

        col1, col2, col3 = st.columns(3)
        col1.metric("Unique Videos as Top Answer", video_metrics['unique_videos_as_top_answer'])
        col2.metric("Unique Videos in Top 3", video_metrics['unique_videos_in_top3'])
        col3.metric("Total Video Presentations", video_metrics['total_video_presentations'])

        st.info(f"""
**Interpretation:** The system uses **{video_metrics['unique_videos_as_top_answer']} different videos**
as the primary answer across 362 matched questions. When considering all top-3 positions,
**{video_metrics['unique_videos_in_top3']} unique videos** appear, showing good content diversity.
""")

    # Top 20 most used videos
    st.markdown("---")
    st.subheader("Top 20 Most Frequently Surfaced Videos")
    if metrics and 'video_usage' in metrics and 'top_20_videos' in metrics['video_usage']:
        top_videos = metrics['video_usage']['top_20_videos']

        # Create dataframe
        top_videos_df = pd.DataFrame(top_videos)
        top_videos_df.columns = ['Rank', 'Video ID', 'Frequency', 'Avg Relevancy']
        st.dataframe(top_videos_df, use_container_width=True)

        # Visualization: frequency bar chart, colored by relevancy score
        fig1, ax1 = plt.subplots(figsize=(14, 8))
        colors = ['#2ECC71' if x >= 0.7 else '#F1C40F' if x >= 0.6 else '#E74C3C'
                  for x in top_videos_df['Avg Relevancy']]
        ax1.barh(range(len(top_videos_df)), top_videos_df['Frequency'], color=colors)
        ax1.set_yticks(range(len(top_videos_df)))
        ax1.set_yticklabels([f"#{row['Rank']}: Video {row['Video ID']}"
                             for _, row in top_videos_df.iterrows()])
        ax1.set_xlabel('Frequency (Times Used as Top Answer)', fontsize=12)
        ax1.set_title('Top 20 Most Frequently Surfaced Videos\n(Color indicates Avg Relevancy: Green ≥0.7, Yellow ≥0.6, Red <0.6)', fontsize=14)
        ax1.invert_yaxis()
        ax1.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        st.pyplot(fig1)

        # Frequency vs relevancy scatter plot
        st.markdown("---")
        st.subheader("Video Frequency vs. Relevancy Quality")
        fig2, ax2 = plt.subplots(figsize=(12, 8))
        scatter = ax2.scatter(top_videos_df['Frequency'], top_videos_df['Avg Relevancy'],
                              s=100, alpha=0.6, c=top_videos_df['Avg Relevancy'],
                              cmap='RdYlGn', edgecolors='black')

        # Annotate problematic videos (high frequency, low relevancy)
        for _, row in top_videos_df.iterrows():
            if row['Frequency'] >= 8 and row['Avg Relevancy'] < 0.6:
                ax2.annotate(f"Video {row['Video ID']}",
                             (row['Frequency'], row['Avg Relevancy']),
                             xytext=(5, 5), textcoords='offset points',
                             fontsize=9, color='red', fontweight='bold')
        ax2.set_xlabel('Frequency (Times Used)', fontsize=12)
        ax2.set_ylabel('Average Relevancy Score', fontsize=12)
        ax2.set_title('Video Usage Frequency vs. Quality\n(Problematic: High frequency + Low relevancy)', fontsize=14)
        ax2.axhline(y=0.7, color='green', linestyle='--', alpha=0.5, label='High Quality (≥0.7)')
        ax2.axhline(y=0.6, color='orange', linestyle='--', alpha=0.5, label='Medium Quality (≥0.6)')
        ax2.grid(alpha=0.3)
        ax2.legend()
        plt.colorbar(scatter, ax=ax2, label='Avg Relevancy Score')
        plt.tight_layout()
        st.pyplot(fig2)

    # Catchall videos analysis
    st.markdown("---")
    st.subheader("🚨 Catchall Videos Analysis")
    st.markdown("Videos with high usage frequency but low relevancy scores (potential 'catchalls').")
    if metrics and 'video_usage' in metrics and 'top_20_videos' in metrics['video_usage']:
        catchall_threshold_freq = 8
        catchall_threshold_rel = 0.6
        catchall_videos = [v for v in top_videos
                           if v['frequency'] >= catchall_threshold_freq
                           and v['avg_relevancy'] < catchall_threshold_rel]

        if catchall_videos:
            st.warning(f"⚠️ **Found {len(catchall_videos)} potential catchall videos** (used ≥{catchall_threshold_freq} times with relevancy <{catchall_threshold_rel})")
            catchall_df = pd.DataFrame(catchall_videos)
            catchall_df.columns = ['Rank', 'Video ID', 'Frequency', 'Avg Relevancy']
            st.dataframe(catchall_df, use_container_width=True)

            st.markdown("""
**Recommendations:**
- **Review these videos:** They may be too generic or not addressing specific patient needs
- **Create targeted content:** Develop videos that better address the specific questions matched to these catchalls
- **Consider retirement:** If relevancy remains low after review, these videos may need to be replaced
""")
        else:
            st.success(f"✅ No problematic catchall videos found (using threshold: frequency ≥{catchall_threshold_freq}, relevancy <{catchall_threshold_rel})")

    # Key insights
    st.markdown("---")
    st.subheader("💡 Key Video Library Insights")
    st.markdown("""
**Content Diversity:**
- 108 unique videos serve as the top answer (good diversity)
- 141 unique videos appear in top-3 positions (excellent coverage)
- The top video (4866) is used only 14 times (3.9% of presentations) - no over-reliance

**Quality Concerns:**
- Videos 4827, 4967, 4895: high frequency (8-11 uses) but low relevancy (<0.52)
- These may be acting as "catchalls" for questions lacking better matches
- Recommend reviewing these videos and creating targeted alternatives

**High Performers:**
- Video 4888: 6 uses with 0.925 relevancy (excellent quality)
- Video 5034: 7 uses with 0.836 relevancy (very good)
- Video 4826: 12 uses with 0.817 relevancy (consistent quality)

**Recommendations:**
1. Audit low-relevancy, high-frequency videos
2. Create 10-15 targeted videos for common no-match questions
3. Monitor video performance over time to identify emerging catchalls
""")
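    # Hedged sketch: the same catchall screen derived straight from the merged
    # frame instead of the precomputed top-20 list (not wired in; column names
    # are the ones defined at the top of this script, thresholds mirror above):
    def _find_catchalls(df, min_uses=8, max_rel=0.6):
        """Videos used as top answer >= min_uses times with mean relevancy < max_rel."""
        usage = (df.dropna(subset=[video_id_col, relevancy_col])
                   .groupby(video_id_col)[relevancy_col]
                   .agg(frequency='count', avg_relevancy='mean'))
        return usage[(usage['frequency'] >= min_uses) & (usage['avg_relevancy'] < max_rel)]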
# =============================================================================
# PAGE 7: DEMOGRAPHIC DEEP-DIVE
# =============================================================================
elif page == "Demographic Deep-Dive":
    st.title("👥 Demographic Performance Deep-Dive")
    st.markdown("Analyzing AI system performance for key demographic groups: women aged 20-40 and Black patients.")
    st.markdown("---")

    st.info("""
**Why These Demographics Matter:**
- **Women:** 75% of HS patients are women; peak onset is at 20-40 years of age
- **Black Patients:** 2-3x higher prevalence in Black/African American populations
- **Health Equity:** Ensuring culturally competent, relevant content for all patients
""")

    # Comparison overview
    st.subheader("Demographic Performance Comparison")
    if metrics and 'demographic_performance' in metrics:
        demo_data = metrics['demographic_performance']

        # Create comparison table
        demo_comparison = pd.DataFrame([
            {
                'Demographic': demo_data['Women_20to40_Focus']['name'],
                'Questions': demo_data['Women_20to40_Focus']['total_questions'],
                'Exact Match %': demo_data['Women_20to40_Focus']['exact_match_pct'],
                'Combined Match %': demo_data['Women_20to40_Focus']['combined_match_pct'],
                'Avg Relevancy': demo_data['Women_20to40_Focus']['avg_relevancy']
            },
            {
                'Demographic': demo_data['Black_Patients_Focus']['name'],
                'Questions': demo_data['Black_Patients_Focus']['total_questions'],
                'Exact Match %': demo_data['Black_Patients_Focus']['exact_match_pct'],
                'Combined Match %': demo_data['Black_Patients_Focus']['combined_match_pct'],
                'Avg Relevancy': demo_data['Black_Patients_Focus']['avg_relevancy']
            },
            {
                'Demographic': 'Overall System',
                'Questions': 410,
                'Exact Match %': 71.46,
                'Combined Match %': 88.29,
                'Avg Relevancy': 0.725
            }
        ])
        st.dataframe(demo_comparison, use_container_width=True)

        # Visualizations
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Match Rate Comparison")
            fig1, ax1 = plt.subplots(figsize=(10, 6))
            x = np.arange(len(demo_comparison))
            width = 0.35
            ax1.bar(x - width/2, demo_comparison['Exact Match %'], width,
                    label='Exact Match', color='#2ECC71')
            ax1.bar(x + width/2, demo_comparison['Combined Match %'], width,
                    label='Combined Match', color='#3498DB')
            ax1.set_ylabel('Match Rate (%)', fontsize=12)
            ax1.set_title('Match Rates by Demographic', fontsize=14)
            ax1.set_xticks(x)
            ax1.set_xticklabels(demo_comparison['Demographic'], fontsize=10)
            ax1.legend()
            ax1.grid(axis='y', alpha=0.3)
            plt.xticks(rotation=15, ha='right')
            plt.tight_layout()
            st.pyplot(fig1)
        with col2:
            st.subheader("Relevancy Score Comparison")
            fig2, ax2 = plt.subplots(figsize=(10, 6))
            colors = ['#E74C3C', '#E74C3C', '#2ECC71']  # red for the demographics, green for overall
            bars = ax2.bar(demo_comparison['Demographic'], demo_comparison['Avg Relevancy'],
                           color=colors, alpha=0.7)
            ax2.set_ylabel('Average Relevancy Score', fontsize=12)
            ax2.set_title('Average Relevancy by Demographic', fontsize=14)
            ax2.axhline(y=0.7, color='green', linestyle='--', alpha=0.5, label='High Quality (≥0.7)')
            ax2.axhline(y=0.725, color='gray', linestyle='--', alpha=0.5, label='Overall Mean')
            ax2.set_ylim(0.5, 0.8)
            ax2.legend()
            ax2.grid(axis='y', alpha=0.3)
            plt.xticks(rotation=15, ha='right', fontsize=10)
            plt.tight_layout()
            st.pyplot(fig2)

    # Women 20-40 deep dive
    st.markdown("---")
    st.subheader("👩 Women 20-40 Years Deep-Dive")
    df_women = df_merged[df_merged[source_col] == 'Women_20to40_Focus'].copy()
    if not df_women.empty:
        st.write(f"**Analyzing {len(df_women)} questions specific to women aged 20-40**")
20-40**") col1, col2, col3 = st.columns(3) col1.metric("Questions", len(df_women)) col2.metric("Avg Relevancy", f"{df_women[relevancy_col].mean():.3f}") col3.metric("No-Match Count", (df_women[match_type_col] == 'no-match').sum()) # Show no-match questions df_women_no_match = df_women[df_women[match_type_col] == 'no-match'] if not df_women_no_match.empty: st.markdown("**No-Match Questions (Content Gaps):**") display_cols = [orig_q_suite_col, match_type_col] display_cols = [col for col in display_cols if col in df_women_no_match.columns] st.dataframe(df_women_no_match[display_cols].reset_index(drop=True), use_container_width=True) # Show low-relevancy matches df_women_matched = df_women[df_women[match_type_col].isin(['exact-match', 'loose-match'])].copy() df_women_low_rel = df_women_matched[df_women_matched[relevancy_col] < 0.6] if not df_women_low_rel.empty: st.markdown("**Low-Relevancy Matches (<0.6) - Quality Issues:**") display_cols = [orig_q_suite_col, relevancy_col, video_id_col] display_cols = [col for col in display_cols if col in df_women_low_rel.columns] st.dataframe(df_women_low_rel[display_cols].reset_index(drop=True), use_container_width=True) st.markdown(""" **Key Findings - Women 20-40:** - ⚠️ **Lowest Relevancy:** 0.617 average (15% below overall mean) - 📌 **Content Gaps:** Reproductive health, pregnancy, menstruation, intimacy - 📌 **Recommendation:** Create 15-20 targeted videos addressing: - HS during pregnancy and postpartum - Birth control and hormonal factors - Menstrual cycle impacts - Dating, intimacy, and relationships - Career and workplace concerns """) # Black patients deep dive st.markdown("---") st.subheader("🤝 Black Patients Deep-Dive") df_black = df_merged[df_merged[source_col] == 'Black_Patients_Focus'].copy() if not df_black.empty: st.write(f"**Analyzing {len(df_black)} questions specific to Black patients**") col1, col2, col3 = st.columns(3) col1.metric("Questions", len(df_black)) col2.metric("Avg Relevancy", f"{df_black[relevancy_col].mean():.3f}") col3.metric("No-Match Count", (df_black[match_type_col] == 'no-match').sum()) # Show no-match questions df_black_no_match = df_black[df_black[match_type_col] == 'no-match'] if not df_black_no_match.empty: st.markdown("**No-Match Questions (Content Gaps):**") display_cols = [orig_q_suite_col, match_type_col] display_cols = [col for col in display_cols if col in df_black_no_match.columns] st.dataframe(df_black_no_match[display_cols].reset_index(drop=True), use_container_width=True) # Show low-relevancy matches df_black_matched = df_black[df_black[match_type_col].isin(['exact-match', 'loose-match'])].copy() df_black_low_rel = df_black_matched[df_black_matched[relevancy_col] < 0.6] if not df_black_low_rel.empty: st.markdown("**Low-Relevancy Matches (<0.6) - Quality Issues:**") display_cols = [orig_q_suite_col, relevancy_col, video_id_col] display_cols = [col for col in display_cols if col in df_black_low_rel.columns] st.dataframe(df_black_low_rel[display_cols].reset_index(drop=True), use_container_width=True) st.markdown(""" **Key Findings - Black Patients:** - 🚨 **Lowest Relevancy:** 0.588 average (19% below overall mean) - 🚨 **Critical Priority:** Health equity issue requiring immediate attention - 📌 **Content Gaps:** Genetic factors, disparities, cultural competency - 📌 **Recommendation:** Create 10-15 targeted videos addressing: - Why HS is more common in Black populations (genetics) - Healthcare disparities and advocacy - Culturally competent care and provider selection - Community support and representation - 
    - Scarring and skin tone considerations (keloids, hyperpigmentation)
""")

    # Overall recommendations
    st.markdown("---")
    st.subheader("🎯 Strategic Recommendations for Health Equity")
    st.markdown("""
**Immediate Actions:**

1. **Content Creation Sprint:**
    - 15-20 videos for women 20-40 (reproductive health, career, relationships)
    - 10-15 videos for Black patients (genetics, disparities, cultural competency)

2. **Expert Recruitment:**
    - Partner with OB/GYN experts for women's health content
    - Engage Black dermatologists and healthcare advocates
    - Include patient testimonials from diverse backgrounds

3. **Quality Audit:**
    - Review all low-relevancy matches for both demographics
    - Replace generic catchall videos with targeted content
    - Test new content with target demographic users

4. **Measurement:**
    - Re-run the test suite after content additions
    - Track relevancy improvement (target: >0.7 for both groups)
    - Monitor user feedback from demographic groups

**Success Metrics:**
- Women 20-40 relevancy: current 0.617 → target 0.72+ (match overall)
- Black patients relevancy: current 0.588 → target 0.72+ (match overall)
- No-match reduction: <5% for both demographics
""")

# =============================================================================
# PAGE 8: INTERACTIVE QUESTION EXPLORER
# =============================================================================
elif page == "Interactive Question Explorer":
    st.title("🔍 Interactive Question Explorer")
    st.markdown("Search and explore individual questions, AI responses, and video content.")
    st.markdown("---")

    # Search functionality
    st.subheader("Search Questions")
    col1, col2 = st.columns([3, 1])
    with col1:
        search_term = st.text_input("Enter keyword to search questions:",
                                    placeholder="e.g., pregnancy, surgery, diet, pain")
    with col2:
        search_in = st.selectbox("Search in:", ["Questions", "Video Titles", "Both"])

    # Filter by match type
    match_filter = st.multiselect("Filter by Match Type:",
                                  options=['exact-match', 'loose-match', 'no-match'],
                                  default=['exact-match', 'loose-match', 'no-match'])

    # Filter by source
    source_filter = st.multiselect("Filter by Source:",
                                   options=df_merged[source_col].unique().tolist(),
                                   default=df_merged[source_col].unique().tolist())

    # Apply filters
    df_filtered = df_merged.copy()
    if match_filter:
        df_filtered = df_filtered[df_filtered[match_type_col].isin(match_filter)]
    if source_filter:
        df_filtered = df_filtered[df_filtered[source_col].isin(source_filter)]

    if search_term:
        # Plain substring search; regex=False so characters like '+' or '(' in
        # user input are taken literally rather than as a regular expression
        search_term_lower = search_term.lower()
        if search_in == "Questions":
            mask = df_filtered[orig_q_suite_col].str.lower().str.contains(search_term_lower, na=False, regex=False)
        elif search_in == "Video Titles":
            mask = df_filtered[video_title_col].str.lower().str.contains(search_term_lower, na=False, regex=False)
        else:  # Both
            mask_q = df_filtered[orig_q_suite_col].str.lower().str.contains(search_term_lower, na=False, regex=False)
            mask_v = df_filtered[video_title_col].str.lower().str.contains(search_term_lower, na=False, regex=False)
            mask = mask_q | mask_v
        df_filtered = df_filtered[mask]

    st.write(f"**Found {len(df_filtered)} matching questions**")

    # Display results
    if not df_filtered.empty:
        st.markdown("---")
        st.subheader("Search Results")

        # Summary table
        display_cols = [orig_q_suite_col, source_col, match_type_col, relevancy_col, video_id_col]
        display_cols = [col for col in display_cols if col in df_filtered.columns]
        df_display = df_filtered[display_cols].copy()
        df_display = df_display.rename(columns={
            orig_q_suite_col: 'Question',
            source_col: 'Source',
            match_type_col: 'Match Type',
            relevancy_col: 'Relevancy',
            video_id_col: 'Video ID'
        })
        st.dataframe(df_display.reset_index(drop=True), use_container_width=True)

        # Detailed view
        st.markdown("---")
        st.subheader("Detailed Question View")

        # Select question to explore
        question_options = df_filtered[orig_q_suite_col].tolist()
        selected_question = st.selectbox("Select a question to view details:", options=question_options)

        if selected_question:
            df_selected = df_filtered[df_filtered[orig_q_suite_col] == selected_question].iloc[0]

            # Display question details
            st.markdown(f"### Question: *{selected_question}*")
            col1, col2, col3, col4 = st.columns(4)
            col1.metric("Source", df_selected[source_col])
            col2.metric("Ground Truth Category", df_selected[gt_cat_col])
            col3.metric("AI Predicted Category", df_selected[ai_cat_col])
            col4.metric("Match Type", df_selected[match_type_col])

            # Video 1 details
            if pd.notna(df_selected[video_id_col]):
                st.markdown("---")
                st.markdown("### 🎥 Top Answer Video")
                col1, col2, col3 = st.columns(3)
                col1.metric("Video ID", df_selected[video_id_col])
                col2.metric("Relevancy Score", f"{df_selected[relevancy_col]:.3f}")

                # Relevancy color indicator
                rel_score = df_selected[relevancy_col]
                if rel_score >= 0.7:
                    col3.success("✅ High Quality")
                elif rel_score >= 0.5:
                    col3.warning("⚠️ Medium Quality")
                else:
                    col3.error("❌ Low Quality")

                st.markdown(f"**Video Title:** {df_selected[video_title_col]}")
                if pd.notna(df_selected[video_text_col]):
                    with st.expander("📄 View Full Video Transcript"):
                        st.text_area("Transcript:", value=df_selected[video_text_col],
                                     height=300, disabled=True)

            # Video 2 & 3 if available
            video2_id_col = 'Advice Video 2 ID'
            video2_title_col = 'Advice Video 2 Title'
            video2_rel_col = 'Advice Video 2 Relevancy Score'
            video2_text_col = 'Advice Video 2 Answer Text'
            if video2_id_col in df_selected.index and pd.notna(df_selected[video2_id_col]):
                st.markdown("---")
                st.markdown("### 🎥 Second Answer Video")
                col1, col2 = st.columns(2)
                col1.metric("Video ID", df_selected[video2_id_col])
                col2.metric("Relevancy Score", f"{df_selected[video2_rel_col]:.3f}")
                st.markdown(f"**Video Title:** {df_selected[video2_title_col]}")
                if pd.notna(df_selected[video2_text_col]):
                    with st.expander("📄 View Full Video Transcript"):
                        st.text_area("Transcript:", value=df_selected[video2_text_col],
                                     height=300, disabled=True, key='video2')

            video3_id_col = 'Advice Video 3 ID'
            video3_title_col = 'Advice Video 3 Title'
            video3_rel_col = 'Advice Video 3 Relevancy Score'
            video3_text_col = 'Advice Video 3 Answer Text'
            if video3_id_col in df_selected.index and pd.notna(df_selected[video3_id_col]):
                st.markdown("---")
                st.markdown("### 🎥 Third Answer Video")
                col1, col2 = st.columns(2)
                col1.metric("Video ID", df_selected[video3_id_col])
                col2.metric("Relevancy Score", f"{df_selected[video3_rel_col]:.3f}")
                st.markdown(f"**Video Title:** {df_selected[video3_title_col]}")
                if pd.notna(df_selected[video3_text_col]):
                    with st.expander("📄 View Full Video Transcript"):
                        st.text_area("Transcript:", value=df_selected[video3_text_col],
                                     height=300, disabled=True, key='video3')
    else:
        st.info("No questions match your search criteria. Try adjusting your filters or search term.")

    # Quick stats
    st.markdown("---")
    st.subheader("📊 Explorer Statistics")
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Questions", len(df_merged))
    col2.metric("Exact Matches", (df_merged[match_type_col] == 'exact-match').sum())
    col3.metric("Loose Matches", (df_merged[match_type_col] == 'loose-match').sum())
    col4.metric("No Matches", (df_merged[match_type_col] == 'no-match').sum())
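# To launch the dashboard locally with the standard Streamlit CLI (the file
# name below is an assumption - substitute this script's actual name):
#
#     streamlit run hs_dashboard.py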