# Importing Required Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
# Reading CSV File
df = pd.read_csv("C:/Users/srinu/Downloads/3-4_CSM_ML/Dataset/Obesity.csv")
print(df.head())
print(df)
# Encode the target variable 'Obesity_Level'
label_encoder = LabelEncoder()
df['Obesity_Level'] = label_encoder.fit_transform(df['Obesity_Level'])
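To see how the encoder mapped the original obesity categories to integer codes (handy when reading the confusion matrix later), here is a quick sketch using the fitted encoder's classes_ attribute:

# Print the label-to-code mapping learned by the encoder
for code, name in enumerate(label_encoder.classes_):
    print(code, "->", name)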
# Features and target variable
X = df.drop(columns=['Obesity_Level', 'ID', 'Review'])
y = df['Obesity_Level']
# Numerical Pipeline
numerical_features = ['Age']
numerical_pipeline = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')), # Fill missing values
('scaler', StandardScaler()) # Standardize features
])
# Categorical Pipeline
categorical_features = ['Gender', 'Occupation']
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode
])
# Combined Preprocessor
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])
# Full pipeline with preprocessing and logistic regression model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
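The split above is purely random; if the obesity classes are imbalanced, a stratified split keeps each class's share roughly the same in the train and test sets. An optional variation on the same call (not part of the original code), using the stratify parameter:

# Optional alternative: stratify the split on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)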
# Fit the model
model_pipeline.fit(X_train, y_train)
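After fitting, the preprocessor inside the pipeline can report the columns it actually produced (the scaled Age column plus one indicator column per category). A quick check, assuming a scikit-learn version new enough to provide get_feature_names_out (1.0 or later):

# Inspect the transformed feature names from the fitted preprocessor
fitted_preprocessor = model_pipeline.named_steps['preprocessor']
print(fitted_preprocessor.get_feature_names_out())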
# Predict on the test set
y_pred = model_pipeline.predict(X_test)
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# Print metrics
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", class_report)
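KFold is imported at the top but never used. A single train/test split can give a noisy performance estimate, so evaluating the same pipeline with k-fold cross-validation is a natural extension. A minimal sketch, assuming cross_val_score (not imported above) and accuracy as the scoring metric:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the whole pipeline; preprocessing is refit inside each fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model_pipeline, X, y, cv=kf, scoring='accuracy')
print("Accuracy per fold:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())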
# ROC Curve and AUC
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt

# Predict class probabilities on the test set (the pipeline is already fitted above)
y_prob = model_pipeline.predict_proba(X_test)

# Binarize the encoded test labels (one column per class)
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

# Compute ROC curve and ROC AUC for each class (one-vs-rest)
n_classes = len(lb.classes_)
class_names = label_encoder.inverse_transform(lb.classes_)  # original category names for the legend
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot one ROC curve per class
plt.figure()
colors = plt.cm.tab10(np.linspace(0, 1, n_classes))  # one colour per class, however many there are
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], color=colors[i], lw=2,
             label=f'ROC curve (class {class_names[i]}) (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()
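The per-class curves can also be summarised in a single number with roc_auc_score (imported above but otherwise unused) in one-vs-rest mode. A short sketch, using the weighted average to match the other metrics reported earlier:

# Weighted one-vs-rest AUC across all classes
weighted_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='weighted')
print("Weighted OvR ROC AUC:", weighted_auc)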