data['FamilySize'] = data['sibsp'] + data['parch'] + 1 survived = data[data['survived'] == 1]['FamilySize'] not_survived = data[data['survived'] == 0]['FamilySize'] survived_ratio = survived.value_counts() / len(survived) not_survived_ratio = not_survived.value_counts() / len(not_survived) # 使用ANOVA分析验证多个样本之间的差异 f_stat, p_val = stats.f_oneway(survived, not_survived) # 输出结果 print('Survived ratio by family size:') print(survived_ratio) print('Not survived ratio by family size:') print(not_survived_ratio) print('f-statistic:', f_stat) print('p-value:', p_val) Survived ratio by family size: 1 0.476608 2 0.260234 3 0.172515 4 0.061404 7 0.011696 6 0.008772 5 0.008772 Name: FamilySize, dtype: float64 Not survived ratio by family size: 1 0.681239 2 0.131148 3 0.078324 6 0.034608 5 0.021858 7 0.014572 4 0.014572 9 0.012750 8 0.010929 Name: FamilySize, dtype: float64 f-statistic: 0.5837375690419451 p-value: 0.4450537592077023什么意思
时间: 2024-03-04 13:53:34 浏览: 22
这段代码是用来分析 Titanic 数据集中不同家庭大小(通过计算 sibsp 和 parch 的和再加 1 得到)在幸存和未幸存乘客之间的比例差异,并使用 ANOVA 分析方法验证多个样本之间的差异是否显著。输出结果包括幸存和未幸存乘客在不同家庭大小下的比例,以及 ANOVA 的 f-statistic 和 p-value。其中,f-statistic 是用来评估组间差异相对于组内差异的大小,p-value 表示在零假设成立时观察到这种差异的概率。如果 p-value 较小(通常以 0.05 为阈值),则说明组间差异在统计学上显著;注意 p-value 本身并不衡量差异的大小,只衡量差异是否可能由随机波动造成。在这个例子中,p-value 约为 0.445,大于 0.05,因此无法拒绝零假设,即没有足够证据表明家庭大小在幸存与未幸存乘客之间存在显著差异。
相关问题
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# --- Load the train/test splits ---
train_data = pd.read_csv(r'C:\ADULT\Titanic\train.csv')
test_data = pd.read_csv(r'C:\ADULT\Titanic\test.csv')

# Report missing-value counts per column
print(train_data.isnull().sum())
print(test_data.isnull().sum())

# --- Impute Age, Fare and Embarked with each column's mode ---
most_lists = ['Age', 'Fare', 'Embarked']
for col in most_lists:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# --- Split X / y and one-hot encode the categorical features ---
y_train_data = train_data['Survived']
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']
X_train_data = pd.get_dummies(train_data[features])
X_test_data = pd.get_dummies(test_data[features])

# Combined frame (label + model matrix) used for the correlation heatmap below
train_data_selected = pd.concat([y_train_data, X_train_data], axis=1)
print(train_data_selected)

# Binned categorical views of the continuous columns.
# BUG FIX: Age_category was originally cut from train_data.Fare; bin Age instead.
cate_features = ['Pclass', 'SibSp', 'Parch', 'Sex', 'Embarked',
                 'Age_category', 'Fare_category']
train_data['Age_category'] = pd.cut(train_data.Age,
                                    bins=range(0, 100, 10)).astype(str)
train_data['Fare_category'] = pd.cut(train_data.Fare,
                                     bins=list(range(-20, 110, 20)) + [800]).astype(str)
print(train_data)

# --- Distribution and survival-rate plots for each categorical feature ---
plt.figure(figsize=(18, 16))
plt.subplots_adjust(hspace=0.3, wspace=0.3)
for i, cate_feature in enumerate(cate_features):
    plt.subplot(7, 2, 2 * i + 1)
    sns.histplot(x=cate_feature, data=train_data, stat="density")
    plt.xlabel(cate_feature)
    plt.ylabel('Density')
    plt.subplot(7, 2, 2 * i + 2)
    sns.lineplot(x=cate_feature, y='Survived', data=train_data)
    plt.xlabel(cate_feature)
    plt.ylabel('Survived')
plt.show()

# Correlation heatmap of the (all-numeric) model matrix
plt.figure(figsize=(12, 8))
sns.heatmap(train_data_selected.corr(), vmin=-1, vmax=1, annot=True)
plt.show()

# --- Hold-out split and random-forest training ---
train_X, test_X, train_y, test_y = train_test_split(
    X_train_data, y_train_data, train_size=0.8, random_state=42)

# BUG FIX: the original called .predict() on the DataFrame `train_data`
# (DataFrames have no predict method) and never fit any model.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(train_X, train_y)

print("随机森林分类结果")
y_pred_train1 = rfc.predict(train_X)
y_pred_test1 = rfc.predict(test_X)
accuracy_train1 = accuracy_score(train_y, y_pred_train1)
accuracy_test1 = accuracy_score(test_y, y_pred_test1)
print("训练集——随机森林分类器准确率为:", accuracy_train1)
# BUG FIX: the original printed accuracy_train1 twice.
print("测试集——随机森林分类器准确率为:", accuracy_test1)

# --- Build the submission from predictions on the real test set ---
# BUG FIX: `predictions` was referenced before ever being defined, and the
# submission was built before any model existed. Align the test columns with
# the training matrix in case get_dummies produced a different column set.
X_test_aligned = X_test_data.reindex(columns=X_train_data.columns, fill_value=0)
predictions = rfc.predict(X_test_aligned)
sourceRow = 891
output = pd.DataFrame({'PassengerId': test_data.PassengerId,
                       'Survived': predictions})
output.head()
output.to_csv('gender_submission.csv', index=False)
print(output)
在你的代码中,你正在尝试从 `train_data` 对象上调用一个名为 "predict" 的方法,而 `train_data` 实际上是一个 DataFrame 对象,该对象并没有 "predict" 方法。你应该使用你之前定义的随机森林分类器对象 `RandomForestClassifier` 来进行预测。请将以下代码:
```
y_pred_train1 = train_data.predict(train_X)
y_pred_test1 = train_data.predict(test_X)
```
更改为:
```
# Define the random-forest classifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
# Fit the classifier on the training data
rfc.fit(train_X, train_y)
# Predict on the training and test splits
y_pred_train1 = rfc.predict(train_X)
y_pred_test1 = rfc.predict(test_X)
```
这将使用你之前定义的随机森林分类器对象 `rfc` 来进行预测。同时,你需要使用 `rfc.fit(train_X, train_y)` 在训练数据上拟合分类器,然后使用 `rfc.predict(train_X)` 和 `rfc.predict(test_X)` 在训练集和测试集上进行预测。
用R语言复现https://www.kaggle.com/lelexy/titanic-data-science-solutions/edit的数据处理,可视化与建模过程
好的,我会尽力为您解答。首先,您需要先下载 Titanic 数据集,可以在 Kaggle 上下载。然后,我们按照以下步骤进行数据处理,可视化与建模:
1. 导入所需的 R 包:
```
library(tidyverse)   # dplyr / ggplot2 / tidyr etc.: wrangling and plotting
library(survival)    # NOTE(review): loaded but never used below — confirm before removing
library(rpart)       # decision trees
library(rpart.plot)  # plotting rpart trees
library(ranger)      # fast random forests
```
2. 读取数据集,并将训练集和测试集合并:
```
# Read the labelled training set and the unlabelled test set.
# Spell out FALSE: `F` is an ordinary (reassignable) binding, not a keyword.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)
# Stack them so feature engineering is applied identically to both halves
# (test has no Survived column, so bind_rows fills it with NA).
full <- bind_rows(train, test)
```
3. 数据预处理:
```
# --- Impute missing values and engineer features on the combined data ---

# Age: replace NAs with the mean of the observed ages
full$Age[is.na(full$Age)] <- mean(full$Age, na.rm = TRUE)

# Embarked: base R's mode() returns the *storage* mode ("character"), not the
# statistical mode, and it takes no na.rm argument — the original line errors.
# Compute the most frequent port explicitly. In the raw Kaggle CSV the missing
# ports are empty strings rather than NA, so treat both as missing.
embarked_missing <- is.na(full$Embarked) | full$Embarked == ""
full$Embarked[embarked_missing] <-
  names(which.max(table(full$Embarked[!embarked_missing])))

# Fare: a single missing value in the test half; use the median (robust to skew)
full$Fare[is.na(full$Fare)] <- median(full$Fare, na.rm = TRUE)

# Cabin: mostly missing (empty strings in the raw CSV); bucket as "Unknown"
full$Cabin[is.na(full$Cabin) | full$Cabin == ""] <- "Unknown"

# Family size = siblings/spouses + parents/children + the passenger themself
full$FamilySize <- full$SibSp + full$Parch + 1
full$Alone <- ifelse(full$FamilySize == 1, "Alone", "Not Alone")

# Extract the honorific from "Surname, Title. Given names" and collapse rare ones
full$Title <- gsub('(.*, )|(\\..*)', '', full$Name)
full$Title[full$Title %in% c('Mlle', 'Ms')] <- 'Miss'
full$Title[full$Title == 'Mme'] <- 'Mrs'
full$Title[full$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
full$Title[full$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
```
4. 可视化:
```
# 柱状图:Survived 和 Sex
# --- Exploratory plots ---
# ggplot pipelines are wrapped in print() so the plots also render when this
# file is run with source() (only top-level auto-printing shows them otherwise).

# Bar chart: survival counts by sex
print(
  full %>%
    filter(!is.na(Survived)) %>%
    ggplot(aes(x = factor(Survived), fill = Sex)) +
    geom_bar(position = "dodge") +
    labs(x = "Survived", y = "Count", fill = "Sex")
)

# Box plot: age distribution by survival
print(
  full %>%
    filter(!is.na(Survived)) %>%
    ggplot(aes(x = factor(Survived), y = Age, fill = factor(Survived))) +
    geom_boxplot() +
    labs(x = "Survived", y = "Age", fill = "Survived")
)

# Jittered scatter: fare by survival
print(
  full %>%
    filter(!is.na(Survived)) %>%
    ggplot(aes(x = factor(Survived), y = Fare, color = factor(Survived))) +
    geom_jitter(alpha = 0.4) +
    labs(x = "Survived", y = "Fare", color = "Survived")
)

# Correlation heatmap. cor() only accepts numeric input; the original passed
# character columns (Name, Sex, Ticket, Cabin, ...) and errored, so restrict
# to numeric columns first. pairwise.complete.obs tolerates the NA Survived
# rows of the test half.
corr <- full %>%
  select(where(is.numeric)) %>%
  select(-PassengerId, -Survived) %>%
  cor(use = "pairwise.complete.obs")
corrplot::corrplot(corr, method = "color", type = "upper", order = "hclust")
```
5. 建模:
```
# 划分数据集
# --- Modelling ---

# The first 891 rows of `full` are the labelled training set
train <- full[1:891, ]
test <- full[892:nrow(full), ]

# Survived must be a factor so both models perform classification; with a
# numeric response ranger silently fits a *regression* forest and predict()
# returns continuous values instead of 0/1 labels.
train$Survived <- factor(train$Survived)

# Decision tree
tree <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
              data = train, method = "class")
rpart.plot::rpart.plot(tree)

# Random forest. Variable importance must be requested at fit time —
# ranger's default is importance = "none", after which importance(rf) errors.
rf <- ranger(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title,
             data = train, num.trees = 1000, importance = "impurity")

# varImpPlot() belongs to the randomForest package (not loaded here);
# plot ranger's named importance vector directly instead.
imp <- sort(ranger::importance(rf), decreasing = FALSE)
dotchart(imp, main = "Variable importance (impurity)")

# Predict on the held-out rows and write the submission file.
# (The original assigned write.csv's NULL return value to `submission`.)
test$Survived <- predict(rf, data = test)$predictions
submission <- test %>% select(PassengerId, Survived)
write.csv(submission, "submission.csv", row.names = FALSE)
```
以上就是复现 Titanic 数据处理、可视化与建模过程的 R 代码,希望能对您有所帮助。
相关推荐
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![zip](https://img-home.csdnimg.cn/images/20210720083736.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)
![](https://csdnimg.cn/download_wenku/file_type_ask_c1.png)