Использование SMOTE для балансировки набора данных с пакетом ktrain
У меня есть проект по классификации, в котором я использую ktrain и BERT на несбалансированном наборе данных. Чтобы сбалансировать классы, я применил SMOTE после разделения набора данных на обучающую и проверочную выборки. Проблема в том, что при попытке выполнить передискретизацию (oversampling) обучающей выборки код завершается со следующей ошибкой:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-780eef8e5a1f> in <module>()
7
8 oversample = SMOTE(random_state = 42)
----> 9 x_smote, y_smote = oversample.fit_sample(x_train, y_train)
10 # x_smote,y_smote = oversample.fit_resample(vect_df, y_train["Sentiment"])
11 print("shape x before SMOTE: {}".format(x_train.shape))
5 frames
/usr/local/lib/python3.7/dist-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
639 if not allow_nd and array.ndim >= 3:
640 raise ValueError("Found array with dim %d. %s expected <= 2."
--> 641 % (array.ndim, estimator_name))
642
643 if force_all_finite:
ValueError: Found array with dim 3. Estimator expected <= 2.
Код:
# Notebook script from the question: splits `df` into train/held-out parts,
# runs ktrain's BERT text preprocessing, then tries to oversample the
# training split with SMOTE and inspect the resulting label balance.
# NOTE(review): `df` is not defined in this snippet; it must already exist
# in the session with "Tweet" and "Sentiment" columns (the names used below).
import pandas as pd
import numpy as np
import six
import sys
# Register the standalone `six` module under the name `sklearn.externals.six`.
# Presumably a compatibility shim for a dependency that still imports that
# path -- TODO confirm which package actually needs it.
sys.modules['sklearn.externals.six'] = six
from imblearn.over_sampling import SMOTE  # oversampler for the imbalanced dataset
import ktrain
from ktrain import text
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn import metrics
# Reproducible 85% random sample of the rows becomes the training split ...
data_train = df.sample(frac=0.85, random_state=42)
# ... and every row not selected above (dropped by index) is held out.
data_test = df.drop(data_train.index)
# Bare expression: evaluates the two split sizes (echoed only by an
# interactive/notebook session; has no effect in a plain script).
len(data_train), len(data_test)
# Preprocess the "Tweet" text with "Sentiment" as the label column, using
# preprocess_mode='bert' and maxlen=400. The held-out frame is passed as the
# validation set (val_df), so x_test/y_test here are the validation data.
# The call yields the train pair, the validation pair and a preprocessor.
(x_train,y_train),(x_test,y_test),preprocess =text.texts_from_df(train_df=data_train,
text_column="Tweet",
label_columns = "Sentiment",
val_df = data_test,
maxlen = 400,
preprocess_mode = 'bert')
# Address the class imbalance with SMOTE.
# Note: only the training data is oversampled, not the validation data.
oversample = SMOTE(random_state = 42)
# This is the call that fails in the traceback quoted in the question:
#   ValueError: Found array with dim 3. Estimator expected <= 2.
# NOTE(review): presumably the BERT-preprocessed x_train is not a single 2-D
# feature matrix (ktrain appears to return paired token-id / segment-id
# arrays), which SMOTE's input validation rejects -- confirm against ktrain docs.
# NOTE(review): recent imbalanced-learn releases name this method
# `fit_resample` -- confirm against the installed version.
x_smote, y_smote = oversample.fit_sample(x_train, y_train)
# Compare the feature shapes before and after oversampling.
# NOTE(review): relies on x_train / x_smote exposing `.shape` -- verify.
print("shape x before SMOTE: {}".format(x_train.shape))
print("shape x after SMOTE: {}".format(x_smote.shape))
# Heading for the class-balance percentages computed below.
print("balance of targets feild %")
type(y_smote)  # inspect the type; the author expects a NumPy array here
y_smote = pd.DataFrame(y_smote)  # wrap in a DataFrame so value_counts() is available
type(y_smote)  # check the new type
# Relative frequency of each distinct label row, scaled to percent.
y_smote.value_counts(normalize = True)*100