# Importing Required Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
# Reading CSV File
df = pd.read_csv("C:/Users/srinu/Downloads/3-4_CSM_ML/Dataset/Obesity.csv")
print(df.head())
print(df)
# Encode the target variable 'Obesity_Level'
label_encoder = LabelEncoder()
df['Obesity_Level'] = label_encoder.fit_transform(df['Obesity_Level'])
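To see how the encoder mapped the original obesity categories to integer codes (handy when reading the confusion matrix later), here is a quick sketch using the fitted encoder's classes_ attribute:

# Print the label-to-code mapping learned by the encoder
for code, name in enumerate(label_encoder.classes_):
    print(code, "->", name)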
# Features and target variable
X = df.drop(columns=['Obesity_Level', 'ID', 'Review'])
y = df['Obesity_Level']
# Numerical Pipeline
numerical_features = ['Age']
numerical_pipeline = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')), # Fill missing values
('scaler', StandardScaler()) # Standardize features
])
# Categorical Pipeline
categorical_features = ['Gender', 'Occupation']
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode
])
# Combined Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])
# Full pipeline with preprocessing and logistic regression model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
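The split above is purely random; if the obesity classes are imbalanced, a stratified split keeps each class's share roughly the same in the train and test sets. An optional variation on the same call (not part of the original code), using the stratify parameter:

# Optional alternative: stratify the split on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)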
# Fit the model
model_pipeline.fit(X_train, y_train)
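After fitting, the preprocessor inside the pipeline can report the columns it actually produced (the scaled Age column plus one indicator column per category). A quick check, assuming a scikit-learn version new enough to provide get_feature_names_out (1.0 or later):

# Inspect the transformed feature names from the fitted preprocessor
fitted_preprocessor = model_pipeline.named_steps['preprocessor']
print(fitted_preprocessor.get_feature_names_out())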
# Predict on the test set
y_pred = model_pipeline.predict(X_test)
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Print metrics
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", class_report)
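KFold is imported at the top but never used. A single train/test split can give a noisy performance estimate, so evaluating the same pipeline with k-fold cross-validation is a natural extension. A minimal sketch, assuming cross_val_score (not imported above) and accuracy as the scoring metric:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline; preprocessing is refit inside each fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model_pipeline, X, y, cv=kf, scoring='accuracy')
print("Accuracy per fold:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())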
# ROC Curve and AUC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt

# Predict class probabilities on the test set (the pipeline is already fitted above)
y_prob = model_pipeline.predict_proba(X_test)

# Binarize the encoded test labels (one column per class)
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

# Compute ROC curve and ROC AUC for each class (one-vs-rest)
n_classes = len(lb.classes_)
class_names = label_encoder.inverse_transform(lb.classes_)  # original category names for the legend
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot one ROC curve per class
plt.figure()
colors = plt.cm.tab10(np.linspace(0, 1, n_classes))  # one colour per class, however many there are
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
             label=f'ROC curve (class {class_names[i]}) (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
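The per-class curves can also be summarised in a single number with roc_auc_score (imported above but otherwise unused) in one-vs-rest mode. A short sketch, using the weighted average to match the other metrics reported earlier:

# Weighted one-vs-rest AUC across all classes
weighted_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
print("Weighted OvR ROC AUC:", weighted_auc)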