# PCA Visualization Dashboard (Streamlit app).
#
# Lets the user pick one of three scikit-learn toy datasets and a 2D/3D
# projection, standardizes the features, fits a PCA and renders:
#   * explained-variance figures,
#   * a scatter plot of the samples in principal-component space,
#   * a heatmap and a top-5 list of feature loadings per component.
#
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so the flow is intentionally kept as flat top-level code.

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from sklearn.datasets import load_breast_cancer, load_iris, load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Display name -> scikit-learn loader. Insertion order is the order shown in
# the sidebar selectbox.
DATASET_LOADERS = {
    "Iris": load_iris,
    "Wine": load_wine,
    "Breast Cancer": load_breast_cancer,
}

# Bookkeeping columns that load_data() appends after the feature columns.
META_COLUMNS = ['target', 'target_names']

# How many of the strongest loadings to list for each component.
TOP_FEATURES_PER_COMPONENT = 5

APP_DESCRIPTION = """
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
"""

ABOUT_PCA = """
**Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms
high-dimensional data into a lower-dimensional space while preserving as much variance as possible.

**Key Concepts:**
- **Principal Components**: New axes that capture maximum variance in the data
- **Explained Variance Ratio**: Proportion of total variance explained by each component
- **Standardization**: Important preprocessing step to ensure all features contribute equally

**Benefits:**
- Reduces computational complexity
- Removes multicollinearity
- Helps with data visualization
- Can improve model performance by reducing noise

**Applications:**
- Data visualization
- Feature extraction
- Noise reduction
- Data compression
"""


@st.cache_data
def load_data(dataset_name):
    """Load one of the bundled scikit-learn datasets as a DataFrame.

    Args:
        dataset_name: "Iris", "Wine" or "Breast Cancer". Any other value
            falls back to the Breast Cancer dataset.

    Returns:
        A ``(df, target_names)`` tuple. ``df`` holds one column per feature
        plus ``target`` (integer class id) and ``target_names`` (class
        label); ``target_names`` is the dataset's array of class labels.
    """
    loader = DATASET_LOADERS.get(dataset_name, load_breast_cancer)
    data = loader()

    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['target'] = data.target
    # Fancy indexing maps every integer class id to its label in one step.
    df['target_names'] = data.target_names[data.target]
    return df, data.target_names


def build_scatter(df_pca, variance_ratio, dataset_name) -> go.Figure:
    """Build the 2D or 3D scatter plot of samples in PCA space.

    Args:
        df_pca: DataFrame with ``PC1``..``PCn`` columns and a
            ``target_names`` column used for coloring.
        variance_ratio: Explained variance ratio per component; its length
            (2 or 3) selects between the 2D and the 3D plot.
        dataset_name: Dataset display name, used in the figure title.

    Returns:
        The configured Plotly figure.
    """
    n_dims = len(variance_ratio)
    axis_labels = {
        f"PC{i + 1}": f"PC{i + 1} ({ratio:.1%} variance)"
        for i, ratio in enumerate(variance_ratio)
    }
    shared_kwargs = dict(
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset ({n_dims}D)",
        labels=axis_labels,
        hover_data=['target_names'],
    )

    if n_dims == 2:
        fig = px.scatter(df_pca, x='PC1', y='PC2', **shared_kwargs)
        marker_size = 8
    else:
        fig = px.scatter_3d(df_pca, x='PC1', y='PC2', z='PC3', **shared_kwargs)
        # 3D markers are drawn smaller so overlapping points stay readable.
        marker_size = 5

    fig.update_traces(marker=dict(size=marker_size, opacity=0.8))
    fig.update_layout(width=800, height=600, legend_title_text='Classes')
    return fig


# Set page config (must be the first Streamlit command in the script)
st.set_page_config(
    page_title="PCA Visualization App",
    page_icon="📊",
    layout="wide"
)

# Title and description
st.title("📊 PCA Visualization Dashboard")
st.markdown(APP_DESCRIPTION)

# Sidebar controls
st.sidebar.header("🎛️ Controls")

# Dataset selection
dataset_name = st.sidebar.selectbox(
    "Select Dataset",
    tuple(DATASET_LOADERS)
)

# Number of components
n_components = st.sidebar.slider(
    "Number of Components",
    min_value=2,
    max_value=3,
    value=2,
    help="Select 2D or 3D visualization"
)

# Load data
df, target_names = load_data(dataset_name)

# Display dataset info
n_features = df.shape[1] - len(META_COLUMNS)
st.subheader(f"Dataset: {dataset_name}")
st.write(f"Shape: {df.shape[0]} rows, {n_features} features")
st.write(f"Target classes: {', '.join(target_names)}")

# Show raw data toggle
if st.checkbox("Show raw data"):
    st.write(df.head())

# Prepare data for PCA
X = df.drop(META_COLUMNS, axis=1)
y = df['target']

# Standardize the data so every feature contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA results
pca_columns = [f"PC{i + 1}" for i in range(n_components)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca['target'] = y
df_pca['target_names'] = df['target_names']

# Display PCA info
st.subheader("PCA Analysis")
col1, col2 = st.columns(2)

with col1:
    st.write(f"Explained Variance Ratio: {[f'{val:.2%}' for val in pca.explained_variance_ratio_]}")
    st.write(f"Total Variance Explained: {pca.explained_variance_ratio_.sum():.2%}")

with col2:
    st.write(f"Cumulative Variance Explained: {[f'{val:.2%}' for val in pca.explained_variance_ratio_.cumsum()]}")

# Create and display the scatter plot (2D or 3D depending on the slider)
fig = build_scatter(df_pca, pca.explained_variance_ratio_, dataset_name)
st.plotly_chart(fig, use_container_width=True)

# Feature contribution to principal components:
# rows are the original features, columns are the principal components.
st.subheader("Feature Contributions to Principal Components")
feature_importance = pd.DataFrame(
    pca.components_.T,
    columns=pca_columns,
    index=X.columns
)

# Display feature importance as a heatmap (components on the y axis)
fig_importance = px.imshow(
    feature_importance.T,
    labels=dict(x="Features", y="Principal Components", color="Contribution"),
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title="Feature Contributions to Principal Components"
)
fig_importance.update_layout(width=800, height=400)
st.plotly_chart(fig_importance, use_container_width=True)

# Show top contributing features, ranked by absolute loading
st.subheader("Top Contributing Features")
for component in pca_columns:
    st.write(f"**{component}**:")
    ranked = feature_importance[component].abs().sort_values(ascending=False)
    for feature, value in ranked.head(TOP_FEATURES_PER_COMPONENT).items():
        st.write(f"- {feature}: {value:.3f}")
    st.write("")

# Information about PCA
with st.expander("ℹ️ About PCA"):
    st.markdown(ABOUT_PCA)