import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
# Set page config
# NOTE(review): the emoji below were mojibake ("π", "ποΈ") from a bad
# encoding round-trip; restored to plausible originals — confirm against
# the deployed app if the exact glyphs matter.
st.set_page_config(
page_title="PCA Visualization App",
page_icon="📊",
layout="wide"
)
# Title and description
st.title("📊 PCA Visualization Dashboard")
st.markdown("""
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
""")
# Sidebar controls
st.sidebar.header("🎛️ Controls")
# Dataset selection: keys here must match the dispatch in load_data().
dataset_name = st.sidebar.selectbox(
"Select Dataset",
("Iris", "Wine", "Breast Cancer")
)
# Number of components: limited to 2 or 3 because the app only renders
# 2D/3D scatter plots.
n_components = st.sidebar.slider(
"Number of Components",
min_value=2,
max_value=3,
value=2,
help="Select 2D or 3D visualization"
)
# Load selected dataset
@st.cache_data
def load_data(dataset_name):
    """Load a bundled sklearn dataset as a DataFrame.

    Returns a tuple of (df, target_names) where df holds the feature
    columns plus 'target' (integer label) and 'target_names' (label
    string) columns. Cached by Streamlit so switching datasets back and
    forth does not reload.
    """
    loaders = {"Iris": load_iris, "Wine": load_wine}
    bunch = loaders.get(dataset_name, load_breast_cancer)()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    # Map each integer label to its human-readable class name.
    frame['target_names'] = [bunch.target_names[i] for i in bunch.target]
    return frame, bunch.target_names
# Load data and show a dataset summary
df, target_names = load_data(dataset_name)

st.subheader(f"Dataset: {dataset_name}")
# df.shape[1] includes the two bookkeeping columns added by load_data
# ('target', 'target_names'); subtract them to report true feature count.
st.write(f"Shape: {df.shape[0]} rows, {df.shape[1]-2} features")
st.write(f"Target classes: {', '.join(target_names)}")

# Show raw data toggle
if st.checkbox("Show raw data"):
    st.write(df.head())

# Prepare data for PCA: feature matrix X and integer label vector y.
X = df.drop(['target', 'target_names'], axis=1)
y = df['target']
# NOTE(review): a redundant `target_names = df['target_names'].unique()`
# was removed here — it could reorder classes by first appearance and the
# variable is not used again after this point.
# Standardize the features so each contributes equally, then project
# onto the requested number of principal components.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)
# Assemble the projected coordinates plus labels for plotting.
pca_columns = [f"PC{i+1}" for i in range(n_components)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns)
df_pca['target'] = y
df_pca['target_names'] = df['target_names']
# Summary metrics: per-component, total, and cumulative explained variance.
st.subheader("PCA Analysis")
col1, col2 = st.columns(2)
ratios = pca.explained_variance_ratio_
with col1:
    st.write(f"Explained Variance Ratio: {[f'{r:.2%}' for r in ratios]}")
    st.write(f"Total Variance Explained: {ratios.sum():.2%}")
with col2:
    st.write(f"Cumulative Variance Explained: {[f'{r:.2%}' for r in ratios.cumsum()]}")
# Create visualization.
# Axis labels annotate each PC with the share of variance it explains;
# building them once removes the per-branch duplication the original had.
axis_labels = {
    f"PC{i+1}": f"PC{i+1} ({ratio:.1%} variance)"
    for i, ratio in enumerate(pca.explained_variance_ratio_)
}
if n_components == 2:
    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (2D)",
        labels=axis_labels,
        hover_data=['target_names']
    )
    marker_size = 8  # larger markers read better in 2D
else:  # 3D visualization
    fig = px.scatter_3d(
        df_pca,
        x='PC1',
        y='PC2',
        z='PC3',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (3D)",
        labels=axis_labels,
        hover_data=['target_names']
    )
    marker_size = 5  # smaller markers reduce 3D occlusion
# Shared styling, previously duplicated verbatim in both branches.
fig.update_traces(marker=dict(size=marker_size, opacity=0.8))
fig.update_layout(
    width=800,
    height=600,
    legend_title_text='Classes'
)
# Display plot
st.plotly_chart(fig, use_container_width=True)
# Feature loadings: how much each original feature contributes to each PC.
st.subheader("Feature Contributions to Principal Components")
feature_importance = pd.DataFrame(
    pca.components_.T,
    index=X.columns,
    columns=pca_columns,
)
# Render the loadings matrix as a diverging heatmap (features on x,
# components on y).
fig_importance = px.imshow(
    feature_importance.T,
    labels=dict(x="Features", y="Principal Components", color="Contribution"),
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title="Feature Contributions to Principal Components",
)
fig_importance.update_layout(width=800, height=400)
st.plotly_chart(fig_importance, use_container_width=True)
# For each component, list the five features with the largest absolute
# loading (sign ignored — only magnitude of contribution matters here).
st.subheader("Top Contributing Features")
for idx in range(n_components):
    pc_label = f"PC{idx+1}"
    st.write(f"**{pc_label}**:")
    ranked = feature_importance[pc_label].abs().nlargest(5)
    for feature, value in ranked.items():
        st.write(f"- {feature}: {value:.3f}")
    st.write("")
# Information about PCA
# NOTE(review): the expander label was mojibake ("βΉοΈ"); restored to the
# info emoji. A stray " |" scrape artifact after the closing quotes was
# also removed.
with st.expander("ℹ️ About PCA"):
    st.markdown("""
    **Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms
    high-dimensional data into a lower-dimensional space while preserving as much variance as possible.
    **Key Concepts:**
    - **Principal Components**: New axes that capture maximum variance in the data
    - **Explained Variance Ratio**: Proportion of total variance explained by each component
    - **Standardization**: Important preprocessing step to ensure all features contribute equally
    **Benefits:**
    - Reduces computational complexity
    - Removes multicollinearity
    - Helps with data visualization
    - Can improve model performance by reducing noise
    **Applications:**
    - Data visualization
    - Feature extraction
    - Noise reduction
    - Data compression
    """)