Solve the curse of dimensionality by implementing the PCA algorithm on a high-dimensional dataset.

Solve the curse of dimensionality by implementing the PCA algorithm on a high-dimensional dataset.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Step 1: Build a synthetic high-dimensional dataset.
# 500 samples with 100 features each, of which 10 are informative and 20 are
# redundant. The fixed random_state makes the generated data reproducible.
X, y = make_classification(
    n_samples=500,
    n_features=100,
    n_informative=10,
    n_redundant=20,
    random_state=42,
)
# Wrap the feature matrix in a DataFrame so it is easy to inspect and manipulate.
data = pd.DataFrame(data=X)
print("Original Data Shape:", data.shape)

# Step 2: Reduce the dimensionality with PCA.
# Only two principal components are retained so that the projection can be
# drawn on a 2-D scatter plot later on.
pca = PCA(n_components=2)
# Fit the model on the full feature table and project it in a single call.
reduced_data = pca.fit_transform(X=data)
# Report the shape of the projected data.
print("Reduced Data Shape:", reduced_data.shape)

# Step 3: Inspect the explained variance.
# One ratio per retained component; their sum is the share of the variance
# retained by the selected components.
explained_variance = pca.explained_variance_ratio_
print("\nExplained Variance by each principal component:", explained_variance)
print("Total Explained Variance:", explained_variance.sum())

# Step 4: Plot the reduced data.
# Each sample is drawn at its coordinates along the two principal components
# and coloured by its class label from y.
plt.figure(figsize=(8, 6))
pc1, pc2 = reduced_data[:, 0], reduced_data[:, 1]
plt.scatter(pc1, pc2, c=y, cmap='viridis', marker='o', label='Data Points')
# Titles and axis labels.
plt.title("PCA on High-Dimensional Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
# The colour bar maps the scatter colours back to the class labels.
plt.colorbar(label='Class Label')
plt.legend()
plt.show()

Comments