File size: 5,919 Bytes
7aa8120
d8b0e7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7aa8120
d8b0e7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris, load_wine, load_breast_cancer
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

# Set page config
# The icon and title emoji were stored as mojibake (UTF-8 bytes decoded with
# the wrong code page, showing up as "πŸ“Š"); they are restored to the
# intended bar-chart emoji here.
st.set_page_config(
    page_title="PCA Visualization App",
    page_icon="📊",
    layout="wide"
)

# Title and description shown at the top of the main page
st.title("📊 PCA Visualization Dashboard")
st.markdown("""
This app demonstrates Principal Component Analysis (PCA) visualization using different datasets.
Use the controls in the sidebar to customize your analysis.
""")

# Sidebar controls
# The header emoji was stored as mojibake ("πŸŽ›οΈ"); restored to the intended
# control-knobs emoji.
st.sidebar.header("🎛️ Controls")

# Dataset selection: the chosen label is passed to load_data() below
dataset_name = st.sidebar.selectbox(
    "Select Dataset",
    ("Iris", "Wine", "Breast Cancer")
)

# Number of components: restricted to 2 or 3 because the script only
# draws a 2D scatter or a 3D scatter of the projected data
n_components = st.sidebar.slider(
    "Number of Components",
    min_value=2,
    max_value=3,
    value=2,
    help="Select 2D or 3D visualization"
)

# Load selected dataset
@st.cache_data
def load_data(dataset_name):
    """Load one of the bundled scikit-learn datasets into a DataFrame.

    Args:
        dataset_name: "Iris" or "Wine" select the matching loader; any
            other value falls back to the breast cancer dataset.

    Returns:
        A ``(frame, target_names)`` tuple. ``frame`` holds one column per
        feature plus ``target`` (the dataset's label array) and
        ``target_names`` (the class name looked up for each row's label).
        ``target_names`` is the dataset's own ``target_names`` attribute.
    """
    # Start from the fallback and override it for the two named datasets.
    loader = load_breast_cancer
    if dataset_name == "Iris":
        loader = load_iris
    elif dataset_name == "Wine":
        loader = load_wine
    bunch = loader()

    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    # Translate each numeric label into its human-readable class name.
    frame['target_names'] = [bunch.target_names[label] for label in bunch.target]
    return frame, bunch.target_names

# Load data
df, target_names = load_data(dataset_name)

# Display dataset info
n_rows, n_cols = df.shape
# The two label columns ('target', 'target_names') are not features.
n_features = n_cols - 2
st.subheader(f"Dataset: {dataset_name}")
st.write(f"Shape: {n_rows} rows, {n_features} features")
st.write(f"Target classes: {', '.join(target_names)}")

# Show raw data toggle
show_raw = st.checkbox("Show raw data")
if show_raw:
    st.write(df.head())

# Prepare data for PCA: split the label columns off from the feature matrix
X = df.drop(columns=['target', 'target_names'])
y = df['target']
target_names = df['target_names'].unique()

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Create DataFrame with PCA results, carrying the labels along for plotting
pca_columns = [f"PC{idx}" for idx in range(1, n_components + 1)]
df_pca = pd.DataFrame(X_pca, columns=pca_columns).assign(
    target=y,
    target_names=df['target_names'],
)

# Display PCA info
st.subheader("PCA Analysis")

# Pre-format the per-component and running-total variance shares.
variance_ratios = pca.explained_variance_ratio_
per_component = [f'{share:.2%}' for share in variance_ratios]
cumulative = [f'{share:.2%}' for share in variance_ratios.cumsum()]
total_explained = variance_ratios.sum()

left_col, right_col = st.columns(2)
with left_col:
    st.write(f"Explained Variance Ratio: {per_component}")
    st.write(f"Total Variance Explained: {total_explained:.2%}")

with right_col:
    st.write(f"Cumulative Variance Explained: {cumulative}")

# Create visualization
# Axis captions are the same in both modes: one entry per fitted component,
# annotated with that component's share of the variance.
axis_labels = {
    name: f'{name} ({ratio:.1%} variance)'
    for name, ratio in zip(pca_columns, pca.explained_variance_ratio_)
}

if n_components == 2:
    marker_size = 8
    fig = px.scatter(
        df_pca,
        x='PC1',
        y='PC2',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (2D)",
        labels=axis_labels,
        hover_data=['target_names'],
    )
else:  # 3D visualization
    marker_size = 5
    fig = px.scatter_3d(
        df_pca,
        x='PC1',
        y='PC2',
        z='PC3',
        color='target_names',
        title=f"PCA Visualization - {dataset_name} Dataset (3D)",
        labels=axis_labels,
        hover_data=['target_names'],
    )

# Only the marker size differs between the 2D and 3D figures.
fig.update_traces(marker=dict(size=marker_size, opacity=0.8))
fig.update_layout(width=800, height=600, legend_title_text='Classes')

# Display plot
st.plotly_chart(fig, use_container_width=True)

# Feature contribution to principal components
st.subheader("Feature Contributions to Principal Components")

# One row per original feature, one column per principal component.
feature_importance = pd.DataFrame(
    pca.components_.T, index=X.columns, columns=pca_columns
)

# Display feature importance as a heatmap (transposed so that the
# components run along the y-axis and the features along the x-axis)
heatmap_labels = {
    "x": "Features",
    "y": "Principal Components",
    "color": "Contribution",
}
fig_importance = px.imshow(
    feature_importance.T,
    labels=heatmap_labels,
    color_continuous_scale='RdBu_r',
    aspect="auto",
    title="Feature Contributions to Principal Components",
)
fig_importance.update_layout(width=800, height=400)

st.plotly_chart(fig_importance, use_container_width=True)

# Show top contributing features
st.subheader("Top Contributing Features")
for component in pca_columns:
    st.write(f"**{component}**:")
    # Rank features by the magnitude of their loading on this component.
    ranked = feature_importance[component].abs().sort_values(ascending=False)
    for feature, value in ranked.head(5).items():
        st.write(f"- {feature}: {value:.3f}")
    st.write("")

# Information about PCA
# The expander label's emoji was stored as mojibake ("ℹ️"); restored to the
# intended information-sign emoji. The markdown body is unchanged.
with st.expander("ℹ️ About PCA"):
    st.markdown("""
    **Principal Component Analysis (PCA)** is a dimensionality reduction technique that transforms 
    high-dimensional data into a lower-dimensional space while preserving as much variance as possible.
    
    **Key Concepts:**
    - **Principal Components**: New axes that capture maximum variance in the data
    - **Explained Variance Ratio**: Proportion of total variance explained by each component
    - **Standardization**: Important preprocessing step to ensure all features contribute equally
    
    **Benefits:**
    - Reduces computational complexity
    - Removes multicollinearity
    - Helps with data visualization
    - Can improve model performance by reducing noise
    
    **Applications:**
    - Data visualization
    - Feature extraction
    - Noise reduction
    - Data compression
    """)