Can STW (supervised term weighting) schemes such as tf-chi2 and tf-ig be used for multi-class classification?
The research papers demonstrate these schemes only for binary classification. If STW can be used for multi-class classification, please show an example in Python or a Jupyter notebook.
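Here is the binary implementation I am working from: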
import numpy as np
import scipy.sparse as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import normalize
class SupervisedTermWeightingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, scheme='tfchi2'):
        self.scheme = scheme
    def fit(self, X, y):
        y = np.asarray(y)  # allow plain lists as labels
        n_samples, n_features = X.shape
        # Diagonal masks selecting positive (y == 1) and negative (y == 0) samples
        pos_samples = sp.spdiags(y, 0, n_samples, n_samples)
        neg_samples = sp.spdiags(1 - y, 0, n_samples, n_samples)
        # Zero out the rows belonging to the other class
        X_pos = pos_samples * X
        X_neg = neg_samples * X
        # Per-term contingency counts:
        # tp: number of positive samples that contain the term
        # fp: number of positive samples that do not contain the term
        # fn: number of negative samples that contain the term
        # tn: number of negative samples that do not contain the term
        tp = np.bincount(X_pos.indices, minlength=n_features)
        fp = np.sum(y) - tp
        fn = np.bincount(X_neg.indices, minlength=n_features)
        tn = np.sum(1 - y) - fn
        # Laplace smoothing: add 1 to every cell of the contingency table
        self._tp = tp + 1.0
        self._fp = fp + 1.0
        self._fn = fn + 1.0
        self._tn = tn + 1.0
        self._n_samples = n_samples
        self._n_features = n_features
        return self
    def transform(self, X):
        tp = self._tp
        fp = self._fp
        fn = self._fn
        tn = self._tn
        # Account for the +1 smoothing of the four contingency cells
        n = self._n_samples + 4
        f = self._n_features
        if self.scheme == 'tfchi2':
            # Chi-squared statistic of the term/class contingency table
            k = n * (tp * tn - fp * fn) ** 2
            k /= (tp + fp) * (fn + tn) * (tp + fn) * (fp + tn)
        elif self.scheme == 'tfig':
            # Information gain: class entropy minus entropy conditioned on the term
            k = -((tp + fp) / n) * np.log((tp + fp) / n)
            k -= ((fn + tn) / n) * np.log((fn + tn) / n)
            k += (tp / n) * np.log(tp / (tp + fn))
            k += (fn / n) * np.log(fn / (tp + fn))
            k += (fp / n) * np.log(fp / (fp + tn))
            k += (tn / n) * np.log(tn / (fp + tn))
        elif self.scheme == 'tfgr':
            # Gain ratio: information gain normalized by the class entropy
            k = -((tp + fp) / n) * np.log((tp + fp) / n)
            k -= ((fn + tn) / n) * np.log((fn + tn) / n)
            k += (tp / n) * np.log(tp / (tp + fn))
            k += (fn / n) * np.log(fn / (tp + fn))
            k += (fp / n) * np.log(fp / (fp + tn))
            k += (tn / n) * np.log(tn / (fp + tn))
            d = -((tp + fp) / n) * np.log((tp + fp) / n)
            d -= ((fn + tn) / n) * np.log((fn + tn) / n)
            k /= d
        elif self.scheme == 'tfor':
            # Log odds ratio
            k = np.log((tp * tn) / (fp * fn))
        elif self.scheme == 'tfrf':
            # Relevance frequency
            k = np.log(2 + tp / fn)
        else:
            raise ValueError('unknown scheme: %r' % self.scheme)
        # Scale each column (term) by its weight, then L2-normalize rows
        X = X * sp.spdiags(k, 0, f, f)
        return normalize(X, norm='l2')
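A quick test on a toy corpus with binary labels: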
from sklearn.feature_extraction.text import CountVectorizer

d = ['the red apple', 'the orange', 'the green apple', 'the lemon orange', 'the plum']
y = np.array([1, 0, 1, 0, 0])  # binary labels: apple vs. not apple
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(d)
transformer = SupervisedTermWeightingTransformer(scheme='tfor')
x1 = transformer.fit_transform(x, y)
print(x1.toarray())
This works, but only for binary classification.
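What I have tried so far is a minimal one-vs-rest sketch (my own assumption, not something taken from the papers; multiclass_stw is a hypothetical helper name). It fits one binary transformer per class, treating that class as positive and the rest as negative, and stacks the reweighted feature blocks side by side:

import numpy as np
import scipy.sparse as sp

def multiclass_stw(X, y, scheme='tfchi2'):
    # One-vs-rest decomposition: one binary transformer per class
    classes = np.unique(y)
    blocks = []
    for c in classes:
        y_bin = (y == c).astype(int)  # 1 for the current class, 0 for the rest
        t = SupervisedTermWeightingTransformer(scheme=scheme)
        blocks.append(t.fit_transform(X, y_bin))
    # Concatenate the per-class blocks: n_classes * n_features columns
    return sp.hstack(blocks).tocsr()

For example:

d = ['the red apple', 'the orange', 'the green apple', 'the lemon orange', 'the plum']
y = np.array([0, 1, 0, 1, 2])  # three classes: apple, orange, plum
X = CountVectorizer().fit_transform(d)
print(multiclass_stw(X, y, scheme='tfor').toarray())

Is this one-vs-rest decomposition a sound way to apply these schemes to more than two classes, or is there an accepted multi-class formulation of tf-chi2 / tf-ig?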