优化以下代码:
```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

#加载 CSV 文件并忽略错误行
try:
    train_df = pd.read_csv('train.csv', encoding='utf-8')
except:
    pass
try:
    test_df = pd.read_csv('test.csv', encoding='utf-8')
except:
    pass

#删除无效行
train_df.dropna(inplace=True)
test_df.dropna(inplace=True)

#划分训练集和测试集
X_train = train_df['content']
y_train = train_df['category']
X_test = test_df['content']
y_test = test_df

#特征提取
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#训练模型
model = MultinomialNB()
model.fit(X_train, y_train)

#测试模型
score = model.score(X_test, y_test)
print('Accuracy:', score)
```
时间: 2023-12-03 09:47:45 浏览: 36
这段代码看起来已经很优化了,但是还有一些可以改进的地方:
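1. 在加载 CSV 文件时,也可以使用 with open() as f 和 csv.reader(f) 的方式来读取文件,但要注意:向 pd.read_csv 传入文件路径时,pandas 会自行打开并关闭文件,并不存在文件未关闭的问题;而 csv.reader 不会识别表头,首行表头会被当作普通数据行,列名也会变成 0、1 这样的整数。因此更稳妥的做法是继续使用 pd.read_csv,去掉会吞掉所有异常的 except: pass(否则读取失败后 train_df 未定义,后面会报 NameError);如需跳过格式错误的行,可传入 on_bad_lines='skip'(pandas 1.3 及以上)。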
2. 在删除无效行之前,最好先检查一下数据集是否存在无效值。可以使用 train_df.isnull().sum() 来检查每列的缺失值数量,如果发现某列缺失值数量较多,可以选择删除该列或者使用其他方法来填充缺失值。
3. 在测试模型时,y_test 应该是测试集的标签,而不是整个测试集。可以使用 y_test = test_df['category'] 来获取测试集的标签。
改进后的代码如下:
```python
import csv
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Load both CSV files, skipping malformed rows.
# pd.read_csv is used rather than csv.reader + pd.DataFrame(reader): the
# latter keeps the header line as an ordinary data row and names the columns
# 0, 1, ..., so the header text would be fed to the model as a sample.
# on_bad_lines='skip' (pandas >= 1.3) drops rows with too many fields. A
# missing or unreadable file raises right here instead of being swallowed
# and surfacing later as a NameError.
train_df = pd.read_csv('train.csv', encoding='utf-8', on_bad_lines='skip')
test_df = pd.read_csv('test.csv', encoding='utf-8', on_bad_lines='skip')

# Drop rows that lack the text or the label. Only these two columns are used
# below, so a gap in any other column must not discard a usable row.
train_df = train_df.dropna(subset=['content', 'category'])
test_df = test_df.dropna(subset=['content', 'category'])

# Separate the text (features) from the labels.
# y_test must be the label column only, not the whole test DataFrame.
X_train = train_df['content']
y_train = train_df['category']
X_test = test_df['content']
y_test = test_df['category']

# Feature extraction: fit the TF-IDF vectorizer on the training text only,
# then apply the same fitted vectorizer to the test text.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train the classifier.
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate on the test set.
score = model.score(X_test, y_test)
print('Accuracy:', score)
```