# streamlit-pca / src/streamlit_app.py
# kozo2's picture
# Upload src/streamlit_app.py with huggingface_hub
# d8b0e7f verified
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
# Set page config: page title, page icon, and the wide layout.
st.set_page_config(
    page_title="PCA Visualization App",
    page_icon="📊",  # bar-chart emoji (U+1F4CA)
    layout="wide",
)

# Title and description shown at the top of the main area.
st.title("📊 PCA Visualization Dashboard")
st.markdown("""
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
""")
# Sidebar controls
st.sidebar.header("🎛️ Controls")  # control-knobs emoji (U+1F39B U+FE0F)

# Dataset selection: the chosen label is passed to load_data() below.
dataset_name = st.sidebar.selectbox(
    "Select Dataset",
    ("Iris", "Wine", "Breast Cancer"),
)

# Number of components: 2 yields the 2D scatter plot, 3 the 3D one.
n_components = st.sidebar.slider(
    "Number of Components",
    min_value=2,
    max_value=3,
    value=2,
    help="Select 2D or 3D visualization",
)
# Load selected dataset
@st.cache_data
def load_data(dataset_name):
    """Load one of the bundled scikit-learn datasets as a DataFrame.

    Args:
        dataset_name: "Iris" or "Wine"; any other value loads the
            breast cancer dataset.

    Returns:
        A ``(df, target_names)`` tuple. ``df`` has one column per
        feature plus ``target`` (the class label taken from the dataset)
        and ``target_names`` (the class name looked up for that label).
        ``target_names`` is the dataset's own collection of class names.
    """
    loaders = {"Iris": load_iris, "Wine": load_wine}
    # Unrecognized names fall through to the breast cancer dataset.
    bunch = loaders.get(dataset_name, load_breast_cancer)()

    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    # Translate every class label into its readable class name.
    frame['target_names'] = [bunch.target_names[label] for label in bunch.target]
    return frame, bunch.target_names
# Load data for the dataset picked in the sidebar.
df, target_names = load_data(dataset_name)

# Display dataset info
n_rows, n_columns = df.shape
# Two columns ('target' and 'target_names') are labels rather than features.
n_features = n_columns - 2
class_list = ', '.join(target_names)

st.subheader(f"Dataset: {dataset_name}")
st.write(f"Shape: {n_rows} rows, {n_features} features")
st.write(f"Target classes: {class_list}")

# Show raw data toggle: preview the first rows only when the box is ticked.
show_raw_data = st.checkbox("Show raw data")
if show_raw_data:
    st.write(df.head())
# Prepare data for PCA: separate the feature columns from the label columns.
label_columns = ['target', 'target_names']
X = df.drop(columns=label_columns)
y = df['target']
target_names = df['target_names'].unique()

# Standardize the data before projecting it.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA with the number of components chosen in the sidebar.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA results: one "PCk" column per component, plus
# the label columns needed to colour the plots.
pca_columns = [f"PC{number}" for number in range(1, n_components + 1)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns).assign(
    target=y,
    target_names=df['target_names'],
)
# Display PCA info
st.subheader("PCA Analysis")

# Format the per-component and running-total variance shares up front so the
# column bodies below only render text.
variance_ratios = pca.explained_variance_ratio_
per_component = [f'{val:.2%}' for val in variance_ratios]
cumulative = [f'{val:.2%}' for val in variance_ratios.cumsum()]

col1, col2 = st.columns(2)
with col1:
    st.write(f"Explained Variance Ratio: {per_component}")
    st.write(f"Total Variance Explained: {variance_ratios.sum():.2%}")
with col2:
    st.write(f"Cumulative Variance Explained: {cumulative}")
# Create visualization
# Axis titles carry each component's share of the explained variance.
axis_labels = {
    name: f'{name} ({ratio:.1%} variance)'
    for name, ratio in zip(pca_columns, pca.explained_variance_ratio_)
}

# Only the plotting function, the axes, the title and the marker size differ
# between the 2D and 3D variants; everything else is shared below.
if n_components == 2:
    make_scatter = px.scatter
    axes = {'x': 'PC1', 'y': 'PC2'}
    plot_title = f"PCA Visualization - {dataset_name} Dataset (2D)"
    marker_size = 8
else:  # 3D visualization
    make_scatter = px.scatter_3d
    axes = {'x': 'PC1', 'y': 'PC2', 'z': 'PC3'}
    plot_title = f"PCA Visualization - {dataset_name} Dataset (3D)"
    marker_size = 5

fig = make_scatter(
    df_pca,
    color='target_names',
    title=plot_title,
    labels=axis_labels,
    hover_data=['target_names'],
    **axes,
)
fig.update_traces(marker=dict(size=marker_size, opacity=0.8))
fig.update_layout(
    width=800,
    height=600,
    legend_title_text='Classes',
)

# Display plot
st.plotly_chart(fig, use_container_width=True)
# Feature contribution to principal components
st.subheader("Feature Contributions to Principal Components")

# Label pca.components_ with component names on the rows and feature names on
# the columns, then transpose so features become the rows.
feature_importance = pd.DataFrame(
    pca.components_,
    index=pca_columns,
    columns=X.columns,
).T

# Display feature importance as a heatmap, with components on the y axis and
# features on the x axis.
heatmap_labels = dict(x="Features", y="Principal Components", color="Contribution")
fig_importance = px.imshow(
    feature_importance.T,
    labels=heatmap_labels,
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title="Feature Contributions to Principal Components",
)
fig_importance.update_layout(width=800, height=400)
st.plotly_chart(fig_importance, use_container_width=True)
# Show top contributing features
st.subheader("Top Contributing Features")
for pc_name in pca_columns:
    st.write(f"**{pc_name}**:")
    # Rank features by the magnitude of their value for this component,
    # ignoring sign, and list the five largest.
    ranked = feature_importance[pc_name].abs().sort_values(ascending=False)
    for feature, value in ranked.head(5).items():
        st.write(f"- {feature}: {value:.3f}")
    # Empty write acts as a spacer after each component's list.
    st.write("")
# Information about PCA, shown in a collapsible section.
# Blank lines separate the intro paragraph, each bold heading and each list;
# without them Markdown folds a heading into the preceding paragraph or
# bullet instead of rendering it on its own line.
with st.expander("ℹ️ About PCA"):
    st.markdown("""
**Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms
high-dimensional data into a lower-dimensional space while preserving as much variance as possible.

**Key Concepts:**

- **Principal Components**: New axes that capture maximum variance in the data
- **Explained Variance Ratio**: Proportion of total variance explained by each component
- **Standardization**: Important preprocessing step to ensure all features contribute equally

**Benefits:**

- Reduces computational complexity
- Removes multicollinearity
- Helps with data visualization
- Can improve model performance by reducing noise

**Applications:**

- Data visualization
- Feature extraction
- Noise reduction
- Data compression
""")