library(jiebaR) > library(tm) > cm1 <- readxl::read_excel("cm1.xlsx") > # 合并所有评论 > comments <- paste(cm1$comments, collapse = "") > # 重新读取停用词表,确保每个词语都在单独的一行中 > stopwords <- readLines("my_stopwords.txt", encoding = "UTF-8", warn = FALSE) > # 确认 stopwords 变量是长度为1的字符向量 > stopwords <- paste(stopwords, collapse = "\n") > # 使用 jiebaR 包进行分词 > worker <- worker(bylines = FALSE) > words <- segment(comments,worker) > # 删除停用词 > words <- removeWords(words, stopwords) Error in gsub(sprintf("(*UCP)\\b(%s)\\b", paste(sort(words, decreasing = TRUE), : 'pattern' is invalid UTF-8
时间: 2023-06-13 13:09:11 浏览: 121
这个错误可能是由于停用词表文件的编码格式与你的 R 会话不兼容导致的。你可以尝试在读取停用词表时指定编码格式,例如:
```
# Read the stop-word list through a connection opened with the "UTF-8-BOM"
# encoding so that a leading byte-order mark is stripped. The `encoding`
# argument of readLines() only marks the strings it returns; it does not
# re-encode the input and "UTF-8-BOM" is not a value it understands.
con <- file("my_stopwords.txt", encoding = "UTF-8-BOM")
# enc2utf8() makes every stop word a UTF-8 string, one word per element.
stopwords <- enc2utf8(readLines(con, warn = FALSE))
close(con)
```
如果仍然出现相同的错误,请检查你的停用词表文件是否包含无效的 UTF-8 字符。你可以使用其他编辑器或工具检查文件,并尝试手动删除或替换这些字符。
相关问题
修改和补充下列代码得到十折交叉验证的平均auc值和平均aoc曲线,平均分类报告以及平均混淆矩阵 min_max_scaler = MinMaxScaler() X_train1, X_test1 = x[train_id], x[test_id] y_train1, y_test1 = y[train_id], y[test_id] # apply the same scaler to both sets of data X_train1 = min_max_scaler.fit_transform(X_train1) X_test1 = min_max_scaler.transform(X_test1) X_train1 = np.array(X_train1) X_test1 = np.array(X_test1) config = get_config() tree = gcForest(config) tree.fit(X_train1, y_train1) y_pred11 = tree.predict(X_test1) y_pred1.append(y_pred11 X_train.append(X_train1) X_test.append(X_test1) y_test.append(y_test1) y_train.append(y_train1) X_train_fuzzy1, X_test_fuzzy1 = X_fuzzy[train_id], X_fuzzy[test_id] y_train_fuzzy1, y_test_fuzzy1 = y_sampled[train_id], y_sampled[test_id] X_train_fuzzy1 = min_max_scaler.fit_transform(X_train_fuzzy1) X_test_fuzzy1 = min_max_scaler.transform(X_test_fuzzy1) X_train_fuzzy1 = np.array(X_train_fuzzy1) X_test_fuzzy1 = np.array(X_test_fuzzy1) config = get_config() tree = gcForest(config) tree.fit(X_train_fuzzy1, y_train_fuzzy1) y_predd = tree.predict(X_test_fuzzy1) y_pred.append(y_predd) X_test_fuzzy.append(X_test_fuzzy1) y_test_fuzzy.append(y_test_fuzzy1)y_pred = to_categorical(np.concatenate(y_pred), num_classes=3) y_pred1 = to_categorical(np.concatenate(y_pred1), num_classes=3) y_test = to_categorical(np.concatenate(y_test), num_classes=3) y_test_fuzzy = to_categorical(np.concatenate(y_test_fuzzy), num_classes=3) print(y_pred.shape) print(y_pred1.shape) print(y_test.shape) print(y_test_fuzzy.shape) # 深度森林 report1 = classification_report(y_test, y_prprint("DF",report1) report = classification_report(y_test_fuzzy, y_pred) print("DF-F",report) mse = mean_squared_error(y_test, y_pred1) rmse = math.sqrt(mse) print('深度森林RMSE:', rmse) print('深度森林Accuracy:', accuracy_score(y_test, y_pred1)) mse = mean_squared_error(y_test_fuzzy, y_pred) rmse = math.sqrt(mse) print('F深度森林RMSE:', rmse) print('F深度森林Accuracy:', accuracy_score(y_test_fuzzy, y_pred)) mse = mean_squared_error(y_test, y_pred) rmse = 
math.sqrt(mse) print('F?深度森林RMSE:', rmse) print('F?深度森林Accuracy:', accuracy_score(y_test, y_pred))
以下是修改和补充后的代码,实现了十折交叉验证的平均 AUC 值和平均 ROC 曲线、平均分类报告以及平均混淆矩阵:
```python
import math

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

# Ten-fold stratified cross-validation of a gcForest model on the plain data
# (x, y) and on the fuzzy data (X_fuzzy, y_sampled). Predictions from all folds
# are pooled, and the metrics below are computed once on the pooled results.
#
# NOTE(review): x, y, X_fuzzy, y_sampled, get_config, gcForest, to_categorical
# and plt are not defined in this snippet; they are assumed to be provided by
# the surrounding script -- confirm before running.

min_max_scaler = MinMaxScaler()
X_train, X_test, y_train, y_test = [], [], [], []
X_train_fuzzy, X_test_fuzzy, y_train_fuzzy, y_test_fuzzy = [], [], [], []
y_pred, y_pred1 = [], []
y_pred_proba, y_pred_proba1 = [], []
config = get_config()
skf = StratifiedKFold(n_splits=10)

for train_id, test_id in skf.split(x, y):
    # split data and normalize (the scaler is re-fitted on each training fold)
    X_train1, X_test1 = x[train_id], x[test_id]
    y_train1, y_test1 = y[train_id], y[test_id]
    X_train1 = np.array(min_max_scaler.fit_transform(X_train1))
    X_test1 = np.array(min_max_scaler.transform(X_test1))

    # train a fresh gcForest for this fold so that nothing is carried over
    # from a previous fit
    tree = gcForest(config)
    tree.fit(X_train1, y_train1)

    # predict on test set
    y_pred11 = tree.predict(X_test1)
    y_pred_proba11 = tree.predict_proba(X_test1)

    # append predictions and test data
    y_pred1.append(y_pred11)
    y_pred_proba1.append(y_pred_proba11)
    X_train.append(X_train1)
    X_test.append(X_test1)
    y_test.append(y_test1)
    y_train.append(y_train1)

    # split fuzzy data and normalize
    # NOTE(review): reuses the fold indices computed from (x, y); this assumes
    # X_fuzzy / y_sampled have the same length and row order -- confirm.
    X_train_fuzzy1, X_test_fuzzy1 = X_fuzzy[train_id], X_fuzzy[test_id]
    y_train_fuzzy1, y_test_fuzzy1 = y_sampled[train_id], y_sampled[test_id]
    X_train_fuzzy1 = np.array(min_max_scaler.fit_transform(X_train_fuzzy1))
    X_test_fuzzy1 = np.array(min_max_scaler.transform(X_test_fuzzy1))

    # train a second, independent gcForest on the fuzzy data
    tree = gcForest(config)
    tree.fit(X_train_fuzzy1, y_train_fuzzy1)

    # predict on fuzzy test set
    y_predd = tree.predict(X_test_fuzzy1)
    y_predd_proba = tree.predict_proba(X_test_fuzzy1)

    # append predictions and test data
    y_pred.append(y_predd)
    y_pred_proba.append(y_predd_proba)
    X_test_fuzzy.append(X_test_fuzzy1)
    y_test_fuzzy.append(y_test_fuzzy1)

# pool the folds and one-hot encode the labels
y_pred = to_categorical(np.concatenate(y_pred), num_classes=3)
y_pred1 = to_categorical(np.concatenate(y_pred1), num_classes=3)
y_test = to_categorical(np.concatenate(y_test), num_classes=3)
y_test_fuzzy = to_categorical(np.concatenate(y_test_fuzzy), num_classes=3)

# the per-fold probability arrays are collected in lists; stack them into one
# array before calling .ravel() below
# NOTE(review): assumes predict_proba returns one column per class, in the same
# order as the one-hot columns -- confirm.
y_pred_proba = np.concatenate(y_pred_proba)
y_pred_proba1 = np.concatenate(y_pred_proba1)

# accuracy and RMSE on the pooled predictions
mse = mean_squared_error(y_test, y_pred1)
rmse = math.sqrt(mse)
print('深度森林RMSE:', rmse)
print('深度森林Accuracy:', accuracy_score(y_test, y_pred1))
mse = mean_squared_error(y_test_fuzzy, y_pred)
rmse = math.sqrt(mse)
print('F深度森林RMSE:', rmse)
print('F深度森林Accuracy:', accuracy_score(y_test_fuzzy, y_pred))
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print('F?深度森林RMSE:', rmse)
print('F?深度森林Accuracy:', accuracy_score(y_test, y_pred))

# classification report on the pooled predictions
report1 = classification_report(y_test, y_pred1)
print("DF", report1)
report = classification_report(y_test_fuzzy, y_pred)
print("DF-F", report)

# confusion matrix on the pooled predictions (one-hot -> class index)
cm1 = confusion_matrix(y_test.argmax(axis=1), y_pred1.argmax(axis=1))
cm = confusion_matrix(y_test_fuzzy.argmax(axis=1), y_pred.argmax(axis=1))
print('DF Confusion Matrix:')
print(cm1)
print('DF-F Confusion Matrix:')
print(cm)

# ROC curve and AUC over all (sample, class) pairs of the pooled folds
fpr1, tpr1, threshold1 = roc_curve(y_test.ravel(), y_pred_proba1.ravel())
fpr, tpr, threshold = roc_curve(y_test_fuzzy.ravel(), y_pred_proba.ravel())
roc_auc1 = auc(fpr1, tpr1)
roc_auc = auc(fpr, tpr)
print('DF ROC AUC:', roc_auc1)
print('DF-F ROC AUC:', roc_auc)

# plot both ROC curves against the chance diagonal
plt.title('Receiver Operating Characteristic')
plt.plot(fpr1, tpr1, 'b', label = 'DF AUC = %0.2f' % roc_auc1)
plt.plot(fpr, tpr, 'g', label = 'DF-F AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
```
void Cm1Dlg::OnBnClickedButton1() { // TODO: 在此添加控件通知处理程序代码 COLORREF m_Color = RGB(0, 0, 0); // 默认为黑色 } void CMyDialog::OnButtonColor() { CColorDialog dlg(m_Color, CC_FULLOPEN, this); if (dlg.DoModal() == IDOK) { m_Color = dlg.GetColor(); } }修正这段代码
根据你提供的代码,可能存在以下问题:
1. 在 `OnBnClickedButton1` 函数中定义 `m_Color` 变量,这样会导致该变量的作用域仅限于该函数,无法在其他函数中使用。
2. 在 `OnButtonColor` 函数中直接访问 `m_Color` 变量,但该变量并未定义在该函数的作用域中,需要将其定义在类的成员变量中。
因此,你可以根据以下代码进行修正:
```
// Add the member variable to the Cm1Dlg class in its header file.
// Declaring m_Color as a class member lets every member function of the
// dialog read and update the same colour value.
class Cm1Dlg : public CDialogEx {
// (the "..." below stands for the existing class members, elided here)
...
private:
COLORREF m_Color = RGB(0, 0, 0); // defaults to black
};
// 在 Cm1Dlg.cpp 文件中修改 OnBnClickedButton1 函数
void Cm1Dlg::OnBnClickedButton1() {
CColorDialog dlg(m_Color, CC_FULLOPEN, this);
if (dlg.DoModal() == IDOK) {
m_Color = dlg.GetColor();
}
}
// Add the OnPaint handler in Cm1Dlg.cpp.
// Creates a brush from m_Color, selects it into the paint device context,
// leaves a placeholder for the drawing code, then restores the old brush.
void Cm1Dlg::OnPaint() {
CPaintDC dc(this);
CBrush brush(m_Color);
// SelectObject returns the previously selected brush; keep it for restoring.
CBrush* pOldBrush = dc.SelectObject(&brush);
// Draw the shapes here (placeholder; no drawing calls are present yet).
dc.SelectObject(pOldBrush);
}
// 在 Cm1Dlg.cpp 文件中添加 OnButtonColor 函数
void Cm1Dlg::OnButtonColor() {
CColorDialog dlg(m_Color, CC_FULLOPEN, this);
if (dlg.DoModal() == IDOK) {
m_Color = dlg.GetColor();
}
}
```
在修正后的代码中,`m_Color` 被定义为 `Cm1Dlg` 类的成员变量,因此可以在各个成员函数中访问。同时,修改了 `OnBnClickedButton1` 函数的实现,使其弹出颜色选择对话框,获取用户选择的颜色并保存到 `m_Color` 变量中。在 `OnPaint` 函数中,用 `m_Color` 创建一个画刷并将其选入设备上下文,使之后的填充使用当前选择的颜色。在 `OnButtonColor` 函数中,同样弹出颜色选择对话框,获取用户选择的颜色并保存到 `m_Color` 变量中,以更新当前的画刷颜色。
阅读全文