import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


import warnings
warnings.filterwarnings('ignore')

# Load the SMS collection; later steps read its 'label' and 'message' columns.
df = pd.read_csv('SpamCollection.csv')

df.head()

# Encode ham -> 0 and spam -> 1 in one pass, then cast explicitly to int.
# Chaining two replace() calls leaves a mixed object column in between and
# depends on pandas silently downcasting the result; the explicit cast
# guarantees numeric labels and fails loudly if an unexpected label remains.
df['label'] = df['label'].replace({'ham': 0, 'spam': 1}).astype(int)

df.head()

# Hold out 20% of the raw messages for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Bag of words: the vocabulary is learned from the training messages only and
# then reused, unchanged, to encode the test messages.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training word counts.
classifier = MultinomialNB()
classifier.fit(X_train_counts, y_train)

MultinomialNB()

MultinomialNB()

# Classify the held-out messages and report the share predicted correctly.
predicted = classifier.predict(X_test_counts)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy:", accuracy)

Accuracy: 0.9919282511210762

# Per-class precision, recall and F1 for the held-out messages.
report = classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

# A single unseen message, wrapped in a list because the vectoriser expects
# an iterable of documents.
new_message = ["Congratulations! You've won a free vacation."]

# Encode with the vocabulary fitted on the training data (transform, not fit).
new_message_counts = vectorizer.transform(new_message)
prediction = classifier.predict(new_message_counts)

# predict() returns an array with one entry per message; index the entry
# explicitly instead of relying on the truth value of an array comparison,
# which raises as soon as more than one message is passed.
print("Predicted classification for the new message:", 'Not Spam' if prediction[0] == 0 else 'Spam')

Predicted classification for the new message: Spam

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# Fetch the tokenizer models and stop-word list used further down.
# Recent NLTK releases load word_tokenize's models from 'punkt_tab' rather than
# the pickled 'punkt' package, so both are requested to work on old and new
# versions alike.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/jimhardy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jimhardy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True

# One Porter stemmer instance shared by every preprocessing call.
stemmer = PorterStemmer()

# English stop words kept in a set for fast membership tests.
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

def preprocess_text(text):
    """Normalise a message before bag-of-words vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens made
    up entirely of punctuation and tokens found in the module-level
    ``stop_words`` set are dropped, and every remaining token is reduced to
    its stem with the module-level ``stemmer``.

    Args:
        text: The raw message string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    processed_tokens = [
        stemmer.stem(token)
        for token in tokens
        # strip() leaves an empty string only when every character is
        # punctuation, so runs such as '..' or '...' are removed as well. A
        # plain substring test against string.punctuation lets them through.
        if token.strip(string.punctuation) and token not in stop_words
    ]
    return ' '.join(processed_tokens)

# Clean every message once and keep the result next to the original text.
df['processed_message'] = df['message'].map(preprocess_text)

df.head()

# Same 80/20 split and seed as before, this time on the preprocessed text.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Rebuild the vocabulary from the preprocessed training messages, then encode
# the test messages with that same vocabulary.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Train a fresh multinomial Naive Bayes model on the new counts.
classifier = MultinomialNB()
classifier.fit(X_train_counts, y_train)

MultinomialNB()

MultinomialNB()

# Score the model trained on preprocessed text against its held-out messages.
predicted = classifier.predict(X_test_counts)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy:", accuracy)

Accuracy: 0.9874439461883409

# Per-class precision, recall and F1 for the model trained on preprocessed text.
report = classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.93      0.95       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

new_message = "Congratulations! You've won a free vacation."

# Run the message through preprocess_text so it receives exactly the same
# cleaning as the training data, instead of repeating those steps inline.
# The result is wrapped in a list because the vectoriser expects an iterable
# of documents.
processed_text = [preprocess_text(new_message)]

# Encode with the vocabulary fitted on the training data (transform, not fit).
new_message_counts = vectorizer.transform(processed_text)
prediction = classifier.predict(new_message_counts)

# predict() returns an array with one entry per message; index the entry
# explicitly rather than relying on the truth value of an array comparison.
print("Predicted classification for the new message:", 'Not Spam' if prediction[0] == 0 else 'Spam')

Predicted classification for the new message: Spam

	label	message
0	0	Go until jurong point, crazy.. Available only ...
1	0	Ok lar... Joking wif u oni...
2	1	Free entry in 2 a wkly comp to win FA Cup fina...
3	0	U dun say so early hor... U c already then say...
4	0	Nah I don't think he goes to usf, he lives aro...

	label	message	processed_message
0	0	Go until jurong point, crazy.. Available only ...	go jurong point crazi .. avail bugi n great wo...
1	0	Ok lar... Joking wif u oni...	ok lar ... joke wif u oni ...
2	1	Free entry in 2 a wkly comp to win FA Cup fina...	free entri 2 wkli comp win fa cup final tkt 21...
3	0	U dun say so early hor... U c already then say...	u dun say earli hor ... u c alreadi say ...
4	0	Nah I don't think he goes to usf, he lives aro...	nah n't think goe usf live around though

Spam Classification Practice: Using Bag of Words¶

Import Libraries¶

Import File and Convert Labels¶

Train, Test, Split the Data¶

Define and Create BoW Vectoriser¶

Create Multinomial Naive Bayes Classifier¶

Accuracy/Classification Report¶

Predict Classification of New Message¶

Redo Classification Model, using Text PreProcessing First¶

Text PreProcessing First¶

View Dataset (with processed_message column)¶

Train, Test, Split the Data (Using The processed_message Column)¶

	label	message
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...