# PCA Visualization Dashboard — Streamlit app (extracted from a Hugging Face Space page)
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

# Page configuration — must be the first Streamlit call in the script.
st.set_page_config(
    page_title="PCA Visualization App",
    # NOTE(review): the original emoji was mojibake ("π"); 📊 is a best guess — confirm intended icon.
    page_icon="📊",
    layout="wide"
)

# Title and description
st.title("📊 PCA Visualization Dashboard")
st.markdown("""
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
""")

# Sidebar controls
# NOTE(review): original header emoji was mojibake ("ποΈ"); 🎛️ is a best guess — confirm.
st.sidebar.header("🎛️ Controls")

# Dataset selection
dataset_name = st.sidebar.selectbox(
    "Select Dataset",
    ("Iris", "Wine", "Breast Cancer")
)

# Number of principal components to keep (2 -> 2D scatter, 3 -> 3D scatter).
n_components = st.sidebar.slider(
    "Number of Components",
    min_value=2,
    max_value=3,
    value=2,
    help="Select 2D or 3D visualization"
)
| # Load selected dataset | |
@st.cache_data
def load_data(dataset_name):
    """Load one of the bundled scikit-learn datasets as a labeled DataFrame.

    Cached with ``st.cache_data`` so Streamlit does not reload and rebuild the
    dataset on every widget interaction / script rerun.

    Parameters
    ----------
    dataset_name : str
        One of "Iris", "Wine"; any other value falls back to Breast Cancer
        (matches the original if/elif/else behavior).

    Returns
    -------
    (pandas.DataFrame, ndarray)
        The feature DataFrame with extra 'target' and 'target_names' label
        columns, and the array of class names.
    """
    loaders = {"Iris": load_iris, "Wine": load_wine}
    data = loaders.get(dataset_name, load_breast_cancer)()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['target'] = data.target
    # Vectorized label lookup instead of a Python-level list comprehension.
    df['target_names'] = data.target_names[data.target]
    return df, data.target_names
| # Load data | |
# Load data
df, target_names = load_data(dataset_name)

# Display dataset info.
st.subheader(f"Dataset: {dataset_name}")
# Two of the columns ('target', 'target_names') are labels, not features — hence the -2.
st.write(f"Shape: {df.shape[0]} rows, {df.shape[1]-2} features")
st.write(f"Target classes: {', '.join(target_names)}")

# Show raw data toggle
if st.checkbox("Show raw data"):
    st.write(df.head())

# Prepare data for PCA: numeric features only, labels kept aside.
X = df.drop(['target', 'target_names'], axis=1)
y = df['target']
# (Removed a dead reassignment of `target_names` from df['target_names'].unique():
# it was never read afterwards, and .unique() order is not guaranteed to match
# the dataset's canonical class order.)

# Standardize so every feature contributes equally to the components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
| # Apply PCA | |
| pca = PCA(n_components=n_components) | |
| X_pca = pca.fit_transform(X_scaled) | |
| # Create DataFrame with PCA results | |
| pca_columns = [f"PC{i+1}" for i in range(n_components)] | |
| df_pca = pd.DataFrame(X_pca, columns=pca_columns) | |
| df_pca['target'] = y | |
| df_pca['target_names'] = df['target_names'] | |
| # Display PCA info | |
| st.subheader("PCA Analysis") | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.write(f"Explained Variance Ratio: {[f'{val:.2%}' for val in pca.explained_variance_ratio_]}") | |
| st.write(f"Total Variance Explained: {pca.explained_variance_ratio_.sum():.2%}") | |
| with col2: | |
| st.write(f"Cumulative Variance Explained: {[f'{val:.2%}' for val in pca.explained_variance_ratio_.cumsum()]}") | |
| # Create visualization | |
| if n_components == 2: | |
| fig = px.scatter( | |
| df_pca, | |
| x='PC1', | |
| y='PC2', | |
| color='target_names', | |
| title=f"PCA Visualization - {dataset_name} Dataset (2D)", | |
| labels={ | |
| 'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', | |
| 'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)' | |
| }, | |
| hover_data=['target_names'] | |
| ) | |
| fig.update_traces(marker=dict(size=8, opacity=0.8)) | |
| fig.update_layout( | |
| width=800, | |
| height=600, | |
| legend_title_text='Classes' | |
| ) | |
| else: # 3D visualization | |
| fig = px.scatter_3d( | |
| df_pca, | |
| x='PC1', | |
| y='PC2', | |
| z='PC3', | |
| color='target_names', | |
| title=f"PCA Visualization - {dataset_name} Dataset (3D)", | |
| labels={ | |
| 'PC1': f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)', | |
| 'PC2': f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)', | |
| 'PC3': f'PC3 ({pca.explained_variance_ratio_[2]:.1%} variance)' | |
| }, | |
| hover_data=['target_names'] | |
| ) | |
| fig.update_traces(marker=dict(size=5, opacity=0.8)) | |
| fig.update_layout( | |
| width=800, | |
| height=600, | |
| legend_title_text='Classes' | |
| ) | |
| # Display plot | |
| st.plotly_chart(fig, use_container_width=True) | |
| # Feature contribution to principal components | |
| st.subheader("Feature Contributions to Principal Components") | |
| feature_importance = pd.DataFrame( | |
| pca.components_.T, | |
| columns=pca_columns, | |
| index=X.columns | |
| ) | |
| # Display feature importance as a heatmap | |
| fig_importance = px.imshow( | |
| feature_importance.T, | |
| labels=dict(x="Features", y="Principal Components", color="Contribution"), | |
| color_continuous_scale='RdBu_r', | |
| aspect="auto", | |
| title="Feature Contributions to Principal Components" | |
| ) | |
| fig_importance.update_layout( | |
| width=800, | |
| height=400 | |
| ) | |
| st.plotly_chart(fig_importance, use_container_width=True) | |
| # Show top contributing features | |
| st.subheader("Top Contributing Features") | |
| for i in range(n_components): | |
| st.write(f"**PC{i+1}**:") | |
| pc_features = feature_importance[f'PC{i+1}'].abs().sort_values(ascending=False) | |
| top_features = pc_features.head(5) | |
| for feature, value in top_features.items(): | |
| st.write(f"- {feature}: {value:.3f}") | |
| st.write("") | |
| # Information about PCA | |
| with st.expander("βΉοΈ About PCA"): | |
| st.markdown(""" | |
| **Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms | |
| high-dimensional data into a lower-dimensional space while preserving as much variance as possible. | |
| **Key Concepts:** | |
| - **Principal Components**: New axes that capture maximum variance in the data | |
| - **Explained Variance Ratio**: Proportion of total variance explained by each component | |
| - **Standardization**: Important preprocessing step to ensure all features contribute equally | |
| **Benefits:** | |
| - Reduces computational complexity | |
| - Removes multicollinearity | |
| - Helps with data visualization | |
| - Can improve model performance by reducing noise | |
| **Applications:** | |
| - Data visualization | |
| - Feature extraction | |
| - Noise reduction | |
| - Data compression | |
| """) |