pandas JupyterLab与朴素贝叶斯

我试图通过在jupyter notebook中使用朴素贝叶斯来检测quillbot释义。我的数据集有2列，第一列是用来自各种来源的样本文本填充的，大约250个单词。第二列是类型，如果是parahrased，则设置为1，如果是原始文本，则设置为0。到目前为止，我得到了这段代码，但我得到了一些错误。这里是我的数据集的链接：https://pastebin.com/ts8SLGHq，下面是我的代码：

from pydataset import data
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# read the CSV file into a DataFrame
df = pd.read_csv('Desktop/Dataset.csv', encoding='utf-8')

# specify the column names
df.columns = ['text', 'type']

# split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# convert text data into numerical features
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])
y_train = train_df['type'].values
y_test = test_df['type'].values

# initialize a Gaussian Naive Bayes model
model = GaussianNB()

# train the model on the train set
model.fit(X_train, y_train)

# evaluate the model on the test set
accuracy = model.score(X_test, y_test)
print("Model accuracy on test set: {:.2f}%".format(accuracy * 100))

这是我得到的错误：

TypeError Traceback (most recent call last)
Cell In[184], line 5
2 model = GaussianNB()
4 # train the model on the train set
----> 5 model.fit(X_train, y_train)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\naive_bayes.py:267, in GaussianNB.fit(self, X, y, sample_weight)
265 self._validate_params()
266 y = self._validate_data(y=y)
--> 267 return self._partial_fit(
268 X, y, np.unique(y), _refit=True, sample_weight=sample_weight
269 )

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\naive_bayes.py:428, in GaussianNB._partial_fit(self, X, y, classes, refit, sample_weight)
425 self.classes = None
427 first_call = _check_partial_fit_first_call(self, classes)
--> 428 X, y = self._validate_data(X, y, reset=first_call)
429 if sample_weight is not None:
430 sample_weight = _check_sample_weight(sample_weight, X)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py:565, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params)
563 y = check_array(y, input_name="y", **check_y_params)
564 else:
--> 565 X, y = check_X_y(X, y, **check_params)
566 out = X, y
568 if not no_val_X and check_params.get("ensure_2d", True):

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:1106, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
1101 estimator_name = _check_estimator_name(estimator)
1102 raise ValueError(
1103 f"{estimator_name} requires y to be passed, but the target y is None"
1104 )
-> 1106 X = check_array(
1107 X,
1108 accept_sparse=accept_sparse,
1109 accept_large_sparse=accept_large_sparse,
1110 dtype=dtype,
1111 order=order,
1112 copy=copy,
1113 force_all_finite=force_all_finite,
1114 ensure_2d=ensure_2d,
1115 allow_nd=allow_nd,
1116 ensure_min_samples=ensure_min_samples,
1117 ensure_min_features=ensure_min_features,
1118 estimator=estimator,
1119 input_name="X",
1120 )
1122 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
1124 check_consistent_length(X, y)

File ~\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py:845, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
843 if sp.issparse(array):
844 _ensure_no_complex_data(array)
--> 845 array = _ensure_sparse_format(
846 array,
847 accept_sparse=accept_sparse,
848 dtype=dtype,
849 copy=copy,
850 force_all_finite=force_all_finite,
851 accept_large_sparse=accept_large_sparse,
852 estimator_name=estimator_name,
853 input_name=input_name,
854 )
855 else:
856 # If np.array(..) gives ComplexWarning, then we convert the warning
857 # to an error. This is needed because specifying a non complex
858 # dtype to the function converts complex to real dtype,
859 # thereby passing the test made in the lines following the scope
860 # of warnings context manager.
861 with warnings.catch_warnings():


File ~\AppData\Local\Programs\Python\Python 39\lib\site-packages\sklearn\utils\validation.py：522，采用确保稀疏格式（spmatrix，接受稀疏，数据类型，复制，强制所有有限，接受大稀疏，估计器名称，输入名称）519检查大稀疏（spmatrix，接受大稀疏）521如果接受稀疏为假：--〉522引发类型错误（523“传递了稀疏矩阵，但需要密集“524“数据。使用X.toarray（）“525“转换为密集numpy数组。”526）527 elif is instance（accept_sparse，（list，tuple））：528，如果len（接受稀疏）== 0：
TypeError：传递了稀疏矩阵，但需要密集数据。请使用X.toarray（）转换为密集numpy数组。“
任何帮助都将不胜感激。
我首先尝试将数据转换为数值要素，但要么是我做错了，要么是存在其他问题

pandas JupyterLab与朴素贝叶斯

1条答案

相关问题

热门标签

最新问答