Модель машинного обучения для классификации спама (неверный прогноз для отдельного письма)
Я написал на Python классификатор спам-писем. Код выглядит корректным, но правильного прогноза я всё равно не получаю. Ответы в y_pred в основном верны, однако при предсказании для одного отдельного письма результат оказывается ошибочным. Я использую модель MultinomialNB из модуля naive_bayes библиотеки sklearn.
Набор данных доступен здесь:
http://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron1.tar.gz
Первый файл (Spam.py)
import os
import pickle as c
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score
def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and report completion.

    The target is opened in binary write mode, so an existing file with the
    same name is overwritten. ``saved`` is printed after the object has been
    dumped.
    """
    with open(name, 'wb') as sink:
        c.dump(clf, sink)
    print("saved")
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic tokens.

    Every file in ``direc`` is read as UTF-8 (undecodable bytes are ignored)
    and split on single spaces. Tokens for which ``str.isalpha()`` is false
    (numbers, punctuation, empty strings, anything containing a newline) are
    discarded before counting.

    A countdown of the files still to be processed is printed as progress
    output.

    Args:
        direc: Directory that holds the e-mail files.
        vocab_size: Maximum number of vocabulary entries to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    names = os.listdir(direc)
    # Named `remaining` rather than `c` so the module-level pickle alias is
    # not shadowed inside this function.
    remaining = len(names)
    counts = Counter()
    for name in names:
        path = os.path.join(direc, name)
        # The context manager closes each file; the corpus has thousands.
        with open(path, encoding="utf8", errors="ignore") as f:
            # Split on a single space on purpose: make_dataset tokenises the
            # same way, so the vocabulary and the features stay consistent.
            counts.update(w for w in f.read().split(" ") if w.isalpha())
        print(remaining)
        remaining -= 1
    return counts.most_common(vocab_size)
def make_dataset(dictionary, direc="emails/"):
    """Turn every labelled e-mail in ``direc`` into a word-count vector.

    Each file is read as UTF-8 (undecodable bytes are ignored) and split on
    single spaces. The feature vector holds, for every entry of
    ``dictionary`` in order, how many tokens of the e-mail equal that
    entry's word.

    The label comes from the file name: ``1`` if it contains ``"spam"``,
    otherwise ``0`` if it contains ``"ham"``. Files whose name contains
    neither are skipped, so the returned lists always have equal length
    and stay aligned row by row.

    A countdown of the files still to be processed and the final list
    sizes are printed as progress output.

    Args:
        dictionary: Sequence of ``(word, count)`` pairs; only the word
            (index 0) of each entry is used.
        direc: Directory that holds the e-mail files.

    Returns:
        A ``(Feature_Set, labels)`` tuple: a list of count vectors and the
        list of matching integer labels.
    """
    names = os.listdir(direc)
    vocabulary = [entry[0] for entry in dictionary]
    Feature_Set = []
    labels = []
    remaining = len(names)
    for name in names:
        # Decide the label from the file name only (not the directory part)
        # and make the two cases exclusive, so exactly one label is stored
        # per feature row.
        if "spam" in name:
            label = 1
        elif "ham" in name:
            label = 0
        else:
            label = None

        if label is None:
            print("skipping file without ham/spam in its name:", name)
        else:
            path = os.path.join(direc, name)
            with open(path, encoding="utf8", errors="ignore") as f:
                # One pass over the tokens instead of one list.count() scan
                # per vocabulary word.
                counts = Counter(f.read().split(' '))
            Feature_Set.append([counts[word] for word in vocabulary])
            labels.append(label)
        print(remaining)
        remaining -= 1
    print(len(Feature_Set), len(labels))
    return Feature_Set, labels
# Build the vocabulary, then vectorise every e-mail against it.
d = make_dict()
features, labels = make_dataset(d)

# Hold out 20% of the e-mails for evaluation.
X_train, X_test, y_train, y_test = tts(features, labels, test_size=0.2)

# fit() returns the estimator itself, so construction and training chain.
Classifier = MultinomialNB().fit(X_train, y_train)

# Report accuracy on the held-out part, then persist the trained model.
y_pred = Classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))
save(Classifier, "text-classifier.mdl")
Второй файл (Detector.py)
import os
from collections import Counter
import pickle as c
def load(Classifier_file):
    """Load and return a pickled classifier from ``Classifier_file``.

    Args:
        Classifier_file: Path of the pickle file to read.

    Returns:
        The object stored in the file.
    """
    # Pickle data is binary, so the file must be opened with "rb"; in text
    # mode pickle.load() fails on Python 3.
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # come from a trusted source.
    with open(Classifier_file, "rb") as fp:
        Classifier = c.load(fp)
    return Classifier
def make_dict(direc="emails/", vocab_size=3000):
    """Build the vocabulary of the most frequent purely alphabetic tokens.

    Every file in ``direc`` is read as UTF-8 (undecodable bytes are ignored)
    and split on single spaces. Tokens for which ``str.isalpha()`` is false
    (numbers, punctuation, empty strings, anything containing a newline) are
    discarded before counting.

    A countdown of the files still to be processed is printed as progress
    output.

    Args:
        direc: Directory that holds the e-mail files.
        vocab_size: Maximum number of vocabulary entries to return.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, as produced by ``Counter.most_common``.
    """
    names = os.listdir(direc)
    # Named `remaining` rather than `c` so the module-level pickle alias is
    # not shadowed inside this function.
    remaining = len(names)
    counts = Counter()
    for name in names:
        path = os.path.join(direc, name)
        # The context manager closes each file; the corpus has thousands.
        with open(path, encoding="utf8", errors="ignore") as f:
            # Files are visited in os.listdir() order and tokens in reading
            # order, so ties in most_common() resolve by first appearance.
            counts.update(w for w in f.read().split(" ") if w.isalpha())
        print(remaining)
        remaining -= 1
    return counts.most_common(vocab_size)
# The training script saves the model as "text-classifier.mdl" (hyphen,
# lower case), so that exact name is loaded here.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# you created yourself.
with open("text-classifier.mdl", "rb") as fp:
    Classifier = c.load(fp)

# Rebuild the vocabulary; its order defines the feature columns.
d = make_dict()

while True:
    inp = input(">")
    if inp == "exit":
        break
    # Count whole space-separated tokens. str.count() on the raw input would
    # count substrings ("a" matches inside every word that contains it) and
    # produce features unlike word counts over a space-split token list.
    counts = Counter(inp.split(" "))
    features = [counts[word[0]] for word in d]
    res = Classifier.predict([features])
    # Label 0 maps to "Not Spam", label 1 to "Spam!".
    print(["Not Spam", "Spam!"][res[0]])