Поиск по сетке (GridSearchCV) с F1 в качестве метрики оценки выдаёт несколько страниц сообщений об ошибке
Я хочу использовать GridSearchCV, чтобы найти лучшие параметры, и использовать F1 в качестве метрики оценки.
Если я убираю параметр scoring, всё работает хорошо, и я не получаю ошибок.
Вот мой код:
# Exhaustive grid search over nearest-neighbour hyper-parameters, scored with F1.
# NOTE(review): `neigh`, `train_classifier`, `predict_labels` and the X_*/y_* arrays
# are defined outside this snippet; `neigh` is presumably a KNeighborsClassifier — confirm.
from sklearn import grid_search
# Candidate values for each hyper-parameter; the search evaluates every combination.
parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
# scoring="f1" selects the built-in F1 scorer for ranking the parameter combinations.
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
# train_classifier calls clf.fit(X_train, y_train) and prints the training time.
train_classifier(reg, X_train, y_train)
# presumably predict_labels returns the F1 score of `reg` on the given data — verify against its definition.
train_f1_score = predict_labels(reg, X_train, y_train)
print reg.best_params_
print "F1 score for training set: {}".format(train_f1_score)
print "F1 score for test set: {}".format(predict_labels(reg, X_test, y_test))
Когда я запускаю этот код, я получаю целые страницы сообщений об ошибках и не могу понять, в чём дело :(
ValueError Traceback (most recent call last)
<ipython-input-17-3083ff8a20ea> in <module>()
3 parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
4 reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
----> 5 train_classifier(reg, X_train, y_train)
6 train_f1_score = predict_labels(reg, X_train, y_train)
7 print reg.best_params_
<ipython-input-9-b56ce25fd90b> in train_classifier(clf, X_train, y_train)
5 print "Training {}...".format(clf.__class__.__name__)
6 start = time.time()
----> 7 clf.fit(X_train, y_train)
8 end = time.time()
9 print "Done!\nTraining time (secs): {:.3f}".format(end - start)
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
802 self._iterating = True
803
--> 804 while self.dispatch_one_batch(iterator):
805 pass
806
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
660 return False
661 else:
--> 662 self._dispatch(tasks)
663 return True
664
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
568
569 if self._pool is None:
--> 570 job = ImmediateComputeBatch(batch)
571 self._jobs.append(job)
572 self.n_dispatched_batches += 1
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
181 # Don't delay the application, to avoid keeping the input
182 # arguments in memory
--> 183 self.results = batch()
184
185 def get(self):
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1604 score = scorer(estimator, X_test)
1605 else:
-> 1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
//anaconda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, estimator, X, y_true, sample_weight)
88 else:
89 return self._sign * self._score_func(y_true, y_pred,
---> 90 **self._kwargs)
91
92
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
637 return fbeta_score(y_true, y_pred, 1, labels=labels,
638 pos_label=pos_label, average=average,
--> 639 sample_weight=sample_weight)
640
641
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
754 average=average,
755 warn_for=('f-score',),
--> 756 sample_weight=sample_weight)
757 return f
758
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
982 else:
983 raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984 (pos_label, present_labels))
985 labels = [pos_label]
986 if labels is None:
ValueError: pos_label=1 is not a valid label: array(['no', 'yes'],
dtype='|S3')
1 ответ
Решение
Похоже, что ваш массив меток содержит строковые значения 'no' и 'yes'. Их следует преобразовать в двоичное числовое представление (1 и 0): как видно из сообщения об ошибке, функция оценки по умолчанию ищет положительный класс с меткой 1 (pos_label=1) и не может понять, где в вашем массиве меток находятся 0 и 1.
Другой возможный способ решить эту проблему, не изменяя массив меток:
# Build an F1 scorer that treats the string label "yes" as the positive class,
# so the 'no'/'yes' labels do not have to be converted to 0/1.
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
# Extra keyword arguments given to make_scorer are forwarded to f1_score on every call.
f1_scorer = make_scorer(f1_score, pos_label="yes")
# NOTE(review): `grid_search`, `neigh` and `parameters` are not defined in this snippet;
# presumably they come from the code in the question — confirm.
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring=f1_scorer)