from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
# Step 1: Load a sample dataset (three topics from 20 Newsgroups).
# subset='all' pulls both the predefined train and test portions; this script
# makes its own split later on.
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)
# Print information about the newsgroups dataset (one time only; the summary
# used to be emitted twice because the print block had been pasted twice).
print("=== Newsgroups Dataset Information ===")
print(f"Number of documents: {len(newsgroups.data)}")
print(f"Number of categories: {len(newsgroups.target_names)}")
print("Categories:", newsgroups.target_names)
print("First document sample:\n", newsgroups.data[0][:500]) # Print first 500 characters of the first document
print("\n")
# Step 2: Split the raw documents BEFORE any feature extraction.
# Fitting TF-IDF on the full corpus would let the held-out documents influence
# the vocabulary and the IDF weights (data leakage), which inflates the
# evaluation. The split uses the same test_size and random_state as before.
y = newsgroups.target
docs_train, docs_test, y_train, y_test = train_test_split(
    newsgroups.data, y, test_size=0.2, random_state=42
)
# Step 3: Preprocess the text data.
# The vectorizer learns its vocabulary/IDF from the training documents only;
# the test documents are merely projected onto that learned vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)
# Print information about the vectorizer
feature_names = vectorizer.get_feature_names_out()  # computed once, reused below
print("=== Vectorizer Information ===")
print(f"Number of features (unique words): {len(feature_names)}")
print("Sample feature names (words):", feature_names[:20]) # Print first 20 feature names
# NOTE: this is the shape of the training matrix (documents x learned features).
print("Shape of the document-term matrix:", X_train.shape)
print("\n")
# Step 4: Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)
# Step 5: Make predictions
y_pred = model.predict(X_test)
# Step 6: Evaluate the model
# Overall accuracy first, then per-class precision/recall/F1 labelled with the
# newsgroup names instead of integer class ids.
# (A stray bare `Comments` token used to follow the report; it was removed
# because evaluating it raised NameError at the very end of the script.)
print("=== Model Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=newsgroups.target_names))