# Shade a translucent band of mean +/- std around each score curve:
# red ("r") for the training scores, green ("g") for the test scores.
for band_mean, band_std, band_color in (
    (train_scores_mean, train_scores_std, "r"),
    (test_scores_mean, test_scores_std, "g"),
):
    lower_edge = band_mean - band_std
    upper_edge = band_mean + band_std
    ax.fill_between(train_sizes, lower_edge, upper_edge, alpha=0.1, color=band_color)
时间: 2024-04-09 18:31:17 浏览: 114
这段代码用于在图上绘制两条半透明(alpha=0.1)的误差带。第一条以红色填充训练集得分的"平均值减标准差"到"平均值加标准差"之间的区域;第二条以绿色填充测试集得分的"平均值减标准差"到"平均值加标准差"之间的区域。这样可以直观地显示出在不同训练样本数量下,训练集和测试集得分的波动范围(均值 ± 1 个标准差)。
相关问题
# Plot the learning curve of a random forest classifier: the score on the
# training subsets and on the cross-validation folds as the training size grows.
# NOTE(review): RandomForestClassifier, learning_curve, np, plt, x_train and
# y_train are not defined in this snippet; they must already be in scope.

# The original call read `max_depth=)`, which is a syntax error. None is the
# documented scikit-learn default (no depth limit); replace it with an integer
# to cap the depth of each tree.
rf = RandomForestClassifier(n_estimators=90, max_depth=None)
train_sizes, train_scores, cv_scores = learning_curve(
    rf, x_train, y_train, cv=5, train_sizes=np.linspace(0.01, 1, 100)
)

# Reduce along axis=1 to get one mean and one std per training size
# (per the scikit-learn docs, axis 1 indexes the CV folds).
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
cv_scores_mean = np.mean(cv_scores, axis=1)
cv_scores_std = np.std(cv_scores, axis=1)

fig = plt.figure(figsize=(8, 6), dpi=200)
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

# Mean score curves.
ax.plot(train_sizes, train_scores_mean, color='dodgerblue', alpha=0.8)
ax.plot(train_sizes, cv_scores_mean, color='g', alpha=0.8)

# Translucent mean +/- std bands around each curve.
ax.fill_between(
    train_sizes,
    train_scores_mean - train_scores_std,
    train_scores_mean + train_scores_std,
    alpha=0.1,
    color="dodgerblue",
)
ax.fill_between(
    train_sizes,
    cv_scores_mean - cv_scores_std,
    cv_scores_mean + cv_scores_std,
    alpha=0.1,
    color="g",
)

ax.legend(labels=['train_set_scores', 'cross_val_scores'], loc='best')
ax.set_title('Learning curve of the random forests')
ax.grid(True)
ax.set_xlabel('The number of training samples')
ax.set_ylabel('Model score')

# Save before show(): the figure may be released once the window is closed.
plt.savefig('Learning curve of the random forests.jpg')
plt.show()
这段代码使用了随机森林分类器(RandomForestClassifier)进行训练,并使用学习曲线(learning_curve)来评估模型的性能。具体来说,它通过在训练集上逐渐增加样本数量,并在交叉验证集上计算模型得分,来绘制训练集和交叉验证集得分随训练集样本数量变化的曲线。这可以帮助我们判断模型是否存在过拟合或欠拟合的问题。
其中,n_estimators是指随机森林中树的个数,max_depth是指每棵树的最大深度。learning_curve函数中的train_sizes参数指定了用于训练的样本量;传入浮点数时表示占可用训练样本的比例,np.linspace(0.01,1,100)表示从0.01到1等间隔取100个值,即依次使用1%到100%的可用训练样本。cv参数指定了交叉验证的折数,这里设置为5。最后,绘制了训练集得分和交叉验证集得分随训练样本数量变化的曲线(阴影带表示均值±标准差),并将图片保存为"Learning curve of the random forests.jpg"。
# Flat script: scrape the Hupu NBA player stats table and draw a bar chart
# with one bar per player.
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd

# The chart labels below are Chinese, so select the SimHei font for sans-serif
# text; axes.unicode_minus=False makes matplotlib draw an ASCII hyphen for
# minus signs instead of the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

url = "https://nba.hupu.com/stats/players"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

players = []
scores = []
table = soup.find("table", class_="players_table")
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("no <table class='players_table'> found at " + url)
rows = table.find_all("tr")

# rows[0] is skipped (presumably the header row); at most 25 rows are read.
for row in rows[1:26]:
    cols = row.find_all("td")
    player = cols[1].text.strip()
    # The fifth cell is split on "-" into two numbers; their midpoint,
    # truncated to an int, is used as the player's score.
    score_range = cols[4].text.strip()
    score_parts = score_range.split("-")
    min_score = float(score_parts[0])
    max_score = float(score_parts[1])
    score = int((min_score + max_score) / 2)
    players.append(player)
    scores.append(score)

data = {"Player": players, "Score": scores}
df = pd.DataFrame(data)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(players, scores, color='green', alpha=0.6)
ax.set_xlabel('球员', fontsize=12)
ax.set_ylabel('得分', fontsize=12)
ax.set_title('NBA球员得分', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=8)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Write each bar's value just above the bar.
for i, score in enumerate(scores):
    ax.text(i, score+0.5, str(score), ha='center', va='bottom')

fig.tight_layout()
plt.show()
# User request: make some small optimizations to this code.
这段代码本身已经比较简洁,可以考虑做一些小的优化,包括:
1. 为了避免在代码中多次使用硬编码的数字和字符串,可以将它们定义为常量或变量,以提高代码的可读性和可维护性。比如,将“players_table”定义为一个常量,将球员数量限制为一个变量等等。
2. 可以考虑将整个代码块封装成一个函数,以提高代码的复用性和可维护性。
3. 如果只需要显示前25个球员的得分,可以将这个限制作为一个参数传入函数中,以便在需要时更改。
下面是一个根据上述优化建议进行修改的示例代码:
```python
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
# Default number of player rows read from the stats table.
PLAYERS_LIMIT = 25
# Default CSS class used to locate the stats <table> element in the page.
TABLE_CLASS_NAME = "players_table"
def get_top_players_scores(limit=PLAYERS_LIMIT, table_class_name=TABLE_CLASS_NAME):
    """Scrape player names and a score per player from the Hupu NBA stats page.

    Args:
        limit: Maximum number of data rows to read; the first table row is
            always skipped.
        table_class_name: CSS class of the ``<table>`` element to parse.

    Returns:
        A ``(players, scores)`` tuple of two parallel lists: the stripped text
        of each row's second cell, and an int score taken from the fifth cell.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If no matching table is found, or a score cell does not
            contain numbers.
        IndexError: If a row has fewer than five cells or the score cell does
            not contain a "-" separated pair.
    """
    url = "https://nba.hupu.com/stats/players"
    # A timeout keeps the call from hanging forever on a stalled connection,
    # and raise_for_status() stops an HTTP error page from being parsed as data.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", class_=table_class_name)
    if table is None:
        # soup.find returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on the next call.
        raise ValueError(
            "no <table class={!r}> found at {}".format(table_class_name, url)
        )

    players = []
    scores = []
    # Skip the first row (presumably the header) and read at most `limit` rows.
    for row in table.find_all("tr")[1:limit + 1]:
        cols = row.find_all("td")
        player = cols[1].text.strip()
        # The fifth cell is split on "-" into two numbers; their midpoint,
        # truncated to an int, is used as the player's score.
        score_parts = cols[4].text.strip().split("-")
        min_score = float(score_parts[0])
        max_score = float(score_parts[1])
        players.append(player)
        scores.append(int((min_score + max_score) / 2))
    return players, scores
def plot_top_players_scores(players, scores):
    """Draw a bar chart of player scores and show it.

    Args:
        players: Player names, used as the x-axis categories.
        scores: Numeric scores, one per entry in ``players``.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # The original also built a pandas DataFrame here that was never used;
    # it has been removed.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(players, scores, color='green', alpha=0.6)
    ax.set_xlabel('球员', fontsize=12)
    ax.set_ylabel('得分', fontsize=12)
    ax.set_title('NBA球员得分', fontsize=14)
    # Rotate the names so long labels do not overlap.
    plt.xticks(rotation=45, ha='right', fontsize=8)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Write each bar's value just above the bar.
    for i, score in enumerate(scores):
        ax.text(i, score+0.5, str(score), ha='center', va='bottom')
    fig.tight_layout()
    plt.show()
if __name__ == "__main__":
    # The chart labels are Chinese. Restore the font settings that the flat
    # script applied before it was split into functions: SimHei for sans-serif
    # text, and an ASCII hyphen instead of the Unicode minus glyph.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    players, scores = get_top_players_scores()
    plot_top_players_scores(players, scores)
```
阅读全文