Модель машинного обучения для классификации спама (неверное предсказание для отдельного письма)

Я написал на Python классификатор спама для электронных писем. Код, на мой взгляд, корректен, но правильного прогноза я всё равно не получаю: значения в y_pred в основном верны, однако при предсказании для одного отдельного письма результат оказывается ошибочным. Я использую модель MultinomialNB из модуля naive_bayes библиотеки sklearn.

Набор данных доступен по ссылке:

http://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron1.tar.gz

FirstFile (Spam.py)

import os
import pickle as c
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score

def save(clf, name):
    """Pickle ``clf`` into the file ``name`` and print a confirmation.

    The target is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(name, mode='wb') as model_file:
        # ``c`` is the module-level alias for ``pickle``.
        c.dump(clf, model_file)
        print("saved")

def make_dict(direc="emails/"):
    """Build the vocabulary: the 3000 most frequent alphabetic tokens.

    Every file in ``direc`` is read as UTF-8 (undecodable bytes are ignored)
    and split on single spaces. Tokens that are not purely alphabetic are
    discarded before counting. A countdown of the files left to read is
    printed as progress output.

    Args:
        direc: Directory containing the e-mail files. Defaults to
            ``"emails/"``.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, at most 3000 entries long.
    """
    emails = [os.path.join(direc, name) for name in os.listdir(direc)]

    words = []
    remaining = len(emails)
    for email in emails:
        # The context manager closes each file; the previous version leaked
        # one open handle per e-mail.
        with open(email, encoding="utf8", errors='ignore') as f:
            # NOTE: splitting on " " only (not on newlines) is kept on
            # purpose -- the feature extraction tokenises the same way and
            # both must agree for the vocabulary columns to line up.
            words += f.read().split(" ")
        print(remaining)
        remaining -= 1

    # Counting only the alphabetic tokens is equivalent to blanking the
    # others and deleting the "" key afterwards.
    dictionary = Counter(word for word in words if word.isalpha())
    return dictionary.most_common(3000)

def make_dataset(dictionary, direc="emails/"):
    """Convert every labelled e-mail in ``direc`` into a word-count vector.

    For each file, the text is split on single spaces and the occurrences of
    every vocabulary word are counted, producing one feature row whose
    columns follow the order of ``dictionary``.

    The label comes from the file name: names containing ``"ham"`` get 0,
    names containing ``"spam"`` get 1. Files matching neither are skipped so
    that features and labels always stay the same length.

    Args:
        dictionary: Sequence of ``(word, count)`` tuples; only the word
            (index 0) of each entry is used.
        direc: Directory containing the e-mail files. Defaults to
            ``"emails/"``.

    Returns:
        A ``(feature_set, labels)`` tuple: a list with one count vector per
        labelled e-mail, and the parallel list of 0/1 labels.
    """
    feature_set = []
    labels = []
    files = os.listdir(direc)
    remaining = len(files)

    for name in files:
        print(remaining)
        remaining -= 1

        # Inspect only the file name, so a directory component that happens
        # to contain "ham" or "spam" cannot mislabel every file.
        if "ham" in name:
            label = 0
        elif "spam" in name:
            label = 1
        else:
            # Unlabelled file: appending features without a label would
            # misalign the two lists.
            continue

        with open(os.path.join(direc, name), encoding="utf8",
                  errors='ignore') as f:
            # One pass over the tokens instead of a list.count() call per
            # vocabulary word.
            counts = Counter(f.read().split(' '))

        # Exactly one row per e-mail, appended once the row is complete.
        feature_set.append([counts[entry[0]] for entry in dictionary])
        labels.append(label)

    print(len(feature_set), len(labels))
    # Return only after every e-mail has been processed.
    return feature_set, labels

# Training script: build the vocabulary, vectorise the e-mails, fit a
# multinomial naive Bayes model, report its accuracy and persist it.
d=make_dict()
features,labels=make_dataset(d)
# Hold out 20% of the samples for evaluation; no random_state is passed.
X_train,X_test,y_train,y_test=tts(features,labels,test_size=0.2)
Classifier=MultinomialNB()
Classifier.fit(X_train,y_train)
# Score the fitted model on the held-out samples only.
y_pred=Classifier.predict(X_test)
print(accuracy_score(y_test,y_pred))
# Pickle the fitted model to "text-classifier.mdl" in the working directory.
save(Classifier, "text-classifier.mdl")

Detector.py

import os
from collections import Counter
import pickle as c

def load(Classifier_file):
    """Unpickle and return the object stored in ``Classifier_file``.

    Args:
        Classifier_file: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # Pickle data is binary: the file must be opened with "rb". In the
    # default text mode pickle.load() fails on Python 3.
    with open(Classifier_file, "rb") as fp:
        # NOTE: unpickling can execute arbitrary code -- only load files
        # that come from a trusted source.
        return c.load(fp)

def make_dict(direc="emails/"):
    """Build the vocabulary: the 3000 most frequent alphabetic tokens.

    Every file in ``direc`` is read as UTF-8 (undecodable bytes are ignored)
    and split on single spaces. Tokens that are not purely alphabetic are
    discarded before counting. A countdown of the files left to read is
    printed as progress output.

    Args:
        direc: Directory containing the e-mail files. Defaults to
            ``"emails/"``.

    Returns:
        A list of ``(word, count)`` tuples ordered from most to least
        frequent, at most 3000 entries long.
    """
    emails = [os.path.join(direc, name) for name in os.listdir(direc)]

    words = []
    remaining = len(emails)
    for email in emails:
        # The context manager closes each file; the previous version leaked
        # one open handle per e-mail.
        with open(email, encoding="utf8", errors='ignore') as f:
            # NOTE(review): tokenisation and file order are left unchanged
            # on purpose -- presumably the model was trained on a vocabulary
            # built the same way, and the column order must match it.
            words += f.read().split(" ")
        print(remaining)
        remaining -= 1

    # Counting only the alphabetic tokens is equivalent to blanking the
    # others and deleting the "" key afterwards.
    dictionary = Counter(word for word in words if word.isalpha())
    return dictionary.most_common(3000)


# Interactive detector: load the trained model, rebuild the vocabulary and
# classify every line typed by the user until "exit" is entered.

# The training script saves the model as "text-classifier.mdl"; the name
# used here must match it exactly.
with open("text-classifier.mdl", "rb") as model_file:
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # you created yourself.
    Classifier = c.load(model_file)

# NOTE(review): the vocabulary is rebuilt from "emails/"; this presumably
# relies on the directory being unchanged since training so that the
# feature columns line up with the fitted model -- confirm.
d = make_dict()

while True:
    inp = input(">")
    if inp == "exit":
        break

    # Count whole space-separated tokens, the way the training features were
    # built. str.count() on the raw input counts substrings instead (e.g.
    # "a" inside "spam"), which yields different feature values.
    token_counts = Counter(inp.split(" "))
    features = [token_counts[word[0]] for word in d]

    # Predict once per input line, after the full feature vector is built,
    # rather than once per vocabulary word on a partial vector.
    res = Classifier.predict([features])
    print(["Not Spam", "Spam!"][res[0]])

0 ответов

Другие вопросы по тегам