pandas JupyterLab与朴素贝叶斯

kgqe7b3p  于 2023-02-20  发布在  其他
关注(0)|答案(1)|浏览(115)

我试图在Jupyter Notebook中使用朴素贝叶斯来检测QuillBot的释义(改写)文本。我的数据集有2列:第一列是来自各种来源的样本文本,每条大约250个单词;第二列是类型,如果是经过释义(paraphrased)的文本则为1,如果是原始文本则为0。到目前为止我写了下面这段代码,但运行时报错。这里是我的数据集的链接:https://pastebin.com/ts8SLGHq,下面是我的代码:

from pydataset import data
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Script: train a Gaussian Naive Bayes classifier on bag-of-words counts to
# tell paraphrased text (type 1) from original text (type 0).

# read the CSV file into a DataFrame
# (relative path: resolved against the current working directory)
df = pd.read_csv('Desktop/Dataset.csv', encoding='utf-8')

# specify the column names
# (overwrites whatever header names were read; the file must have exactly two columns)
df.columns = ['text', 'type']

# split the dataset into train and test sets
# (70% train / 30% test; random_state pins the shuffle so runs are repeatable)
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# convert text data into numerical features
# The vocabulary is learned from the training texts only (fit_transform) and
# then re-used for the test texts (transform).
# NOTE: per the traceback below, X_train is a sparse matrix, not a dense array.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])
y_train = train_df['type'].values
y_test = test_df['type'].values

# initialize a Gaussian Naive Bayes model
model = GaussianNB()

# train the model on the train set
# NOTE: this is the call that fails. GaussianNB.fit validates X with sparse
# input disallowed, so passing the sparse X_train raises
# "TypeError: A sparse matrix was passed, but dense data is required."
model.fit(X_train, y_train)

# evaluate the model on the test set
# NOTE(review): X_test is produced the same way as X_train, so presumably it
# would hit the same sparse-input check here — confirm once fit() succeeds.
accuracy = model.score(X_test, y_test)
print("Model accuracy on test set: {:.2f}%".format(accuracy * 100))

这是我得到的错误:

TypeError Traceback (most recent call last)
Cell In[184], line 5
2 model = GaussianNB()
4 # train the model on the train set
----> 5 model.fit(X_train, y_train)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\naive_bayes.py:267, in GaussianNB.fit(self, X, y, sample_weight)
265 self._validate_params()
266 y = self._validate_data(y=y)
--> 267 return self._partial_fit(
268 X, y, np.unique(y), _refit=True, sample_weight=sample_weight
269 )

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\naive_bayes.py:428, in GaussianNB._partial_fit(self, X, y, classes, _refit, sample_weight)
425 self.classes_ = None
427 first_call = _check_partial_fit_first_call(self, classes)
--> 428 X, y = self._validate_data(X, y, reset=first_call)
429 if sample_weight is not None:
430 sample_weight = _check_sample_weight(sample_weight, X)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
563 y = check_array(y, input_name="y", **check_y_params)
564 else:
--> 565 X, y = check_X_y(X, y, **check_params)
566 out = X, y
568 if not no_val_X and check_params.get("ensure_2d", True):

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1101 estimator_name = _check_estimator_name(estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
1110 dtype=dtype,
1111 order=order,
1112 copy=copy,
1113 force_all_finite=force_all_finite,
1114 ensure_2d=ensure_2d,
1115 allow_nd=allow_nd,
1116 ensure_min_samples=ensure_min_samples,
1117 ensure_min_features=ensure_min_features,
1118 estimator=estimator,
1119 input_name="X",
1120 )
1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1124 check_consistent_length(X, y)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:845, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
843 if sp.issparse(array):
844 _ensure_no_complex_data(array)
--> 845 array = _ensure_sparse_format(
846 array,
847 accept_sparse=accept_sparse,
848 dtype=dtype,
849 copy=copy,
850 force_all_finite=force_all_finite,
851 accept_large_sparse=accept_large_sparse,
852 estimator_name=estimator_name,
853 input_name=input_name,
854 )
855 else:
856 # If np.array(..) gives ComplexWarning, then we convert the warning
857 # to an error. This is needed because specifying a non complex
858 # dtype to the function converts complex to real dtype,
859 # thereby passing the test made in the lines following the scope
860 # of warnings context manager.
861 with warnings.catch_warnings():


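File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:522, in _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, force_all_finite, accept_large_sparse, estimator_name, input_name)
519 _check_large_sparse(spmatrix, accept_large_sparse)
521 if accept_sparse is False:
--> 522 raise TypeError(
523 "A sparse matrix was passed, but dense "
524 "data is required. Use X.toarray() to "
525 "convert to a dense numpy array."
526 )
527 elif isinstance(accept_sparse, (list, tuple)):
528 if len(accept_sparse) == 0:

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.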
任何帮助都将不胜感激。
我已经先尝试将文本数据转换为数值特征,但要么是我做错了,要么是存在其他问题。
ecbunoof

ecbunoof1#

GaussianNB 不接受稀疏矩阵,需要先将 CountVectorizer 生成的稀疏矩阵转换为密集的 NumPy 数组:

# GaussianNB only accepts dense input, so densify the CountVectorizer output.
# Use toarray() (plain ndarray) rather than todense(): todense() returns an
# np.matrix, which scikit-learn 1.2+ rejects. The conversion is not in-place,
# so the result must be assigned back. X_test needs the same treatment because
# model.score() validates its input the same way as model.fit().
# NOTE: a dense bag-of-words matrix can be large; for big corpora consider
# MultinomialNB, which accepts the sparse matrix directly.
X_train = X_train.toarray()
X_test = X_test.toarray()

相关问题