# Importing Required Libraries
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np
# Read the CSV file (adjust the path to your local copy of the dataset)
df = pd.read_csv("C:/Users/srinu/Downloads/3-4_CSM_ML/Dataset/Obesity.csv")
print(df.head())   # preview the first few rows
print(df.shape)    # dataset dimensions (rows, columns)
# Encode the target variable 'Obesity_Level'
label_encoder = LabelEncoder()
df['Obesity_Level'] = label_encoder.fit_transform(df['Obesity_Level'])
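# (Optional) Inspect the label mapping produced by LabelEncoder -- an
# illustrative sanity check, not part of the original workflow.
print(dict(zip(label_encoder.classes_,
               label_encoder.transform(label_encoder.classes_))))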
# Features and target variable
X = df.drop(columns=['Obesity_Level', 'ID', 'Review'])  # drop the target and non-predictive columns
y = df['Obesity_Level']
# Numerical Pipeline
numerical_features = ['Age']
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values
    ('scaler', StandardScaler())                  # Standardize features
])
# Categorical Pipeline
categorical_features = ['Gender', 'Occupation']
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode
])
# Combined Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])
# Full pipeline with preprocessing and logistic regression model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the model
model_pipeline.fit(X_train, y_train)
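# (Optional) After fitting, the expanded feature names produced by the
# preprocessor can be inspected. get_feature_names_out() is available in
# recent scikit-learn versions (>= 1.0); treat this as an illustrative check
# rather than part of the original workflow.
print(model_pipeline.named_steps['preprocessor'].get_feature_names_out())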
# Predict on the test set
y_pred = model_pipeline.predict(X_test)
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Print metrics
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", class_report)
# ROC Curve and AUC (one-vs-rest, per class)
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
# The pipeline is already fitted above; predict class probabilities on the test set
y_prob = model_pipeline.predict_proba(X_test)
# Binarize the test labels (one indicator column per class) for one-vs-rest ROC
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)
# Compute ROC curve and ROC AUC for each class
n_classes = len(lb.classes_)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Plot ROC curve
plt.figure()
colors = plt.cm.tab10(np.linspace(0, 1, n_classes))  # one color per class, works for any class count
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
             label=f'ROC curve (class {lb.classes_[i]}) (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
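# (Optional) A single summary AUC using the roc_auc_score import above; the
# one-vs-rest / macro-average choices are illustrative assumptions, not part
# of the original workflow.
macro_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
print("Macro-averaged one-vs-rest ROC AUC:", macro_auc)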