I. Basic Setup
library(tidyverse); library(tidymodels)
tidymodels_prefer() # resolve function-name conflicts in favor of tidymodels
data(ames, package = "modeldata"); dim(ames)
II. Sampling
# Random split
split <- initial_split(data, prop = 0.8)
# Stratified split; strata sets the variable to stratify on
split <- initial_split(data, prop = 0.8, strata = x1)
# For time series data, the most recent observations usually form the test set
time_split <- initial_time_split(data, prop = 0.8)
train_data <- training(split)
test_data <- testing(split)
# K-fold cross-validation
folds <- vfold_cv(train_data, v = 10) # 10-fold CV
# Bootstrap
boots <- bootstraps(data, times = 1000) # 1000 bootstrap resamples
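A minimal end-to-end sketch of this step on the ames data loaded above (the seed, proportion, and strata choice are illustrative):
set.seed(123)
ames_split <- initial_split(ames, prop = 0.8, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_folds <- vfold_cv(ames_train, v = 10, strata = Sale_Price) # stratified folds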
III. Model Specification
# Launch an addin to browse and generate parsnip model code
parsnip_addin()
# Show how a model spec maps to the engine's fitting call
translate(tree_model)
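Relatedly, show_engines() lists the engines and modes available for a given model type:
show_engines("linear_reg") # available engines for a model spec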
# Classical regression
model <- linear_reg() %>% set_mode("regression") %>%
set_engine("lm") # linear regression
# Other engines for linear_reg(): "glmnet" (regularized) and "stan" (Bayesian)
# Regularized regression: mixture = 0 gives ridge, mixture = 1 gives lasso, values in between give the elastic net
model <- linear_reg(mixture = 0, penalty = 0) %>% set_mode("regression") %>%
set_engine("glmnet") # ridge regression
model <- logistic_reg() %>% set_mode("classification") %>%
set_engine("glmnet") # logistic regression
library(poissonreg)
model <- poisson_reg() %>% set_mode("regression") %>%
set_engine("glm") # Poisson regression
library(discrim)
model <- naive_Bayes() %>% set_mode("classification") %>%
set_engine("klaR") %>% # naive Bayes
set_args(usekernel = FALSE) # assume the predictors follow Gaussian distributions
model <- nearest_neighbor(neighbors = 3) %>% set_mode("classification") %>%
set_engine("kknn") # K-nearest neighbors
model <- discrim_linear() %>% set_mode("classification") %>%
set_engine("MASS") # linear discriminant analysis (LDA)
model <- discrim_quad() %>% set_mode("classification") %>%
set_engine("MASS") # quadratic discriminant analysis (QDA)
# Tree-based methods
model <- decision_tree() %>% set_mode("classification") %>%
set_engine("rpart") # classification tree
model <- decision_tree() %>% set_mode("regression") %>%
set_engine("rpart") # regression tree
model <- rand_forest(mtry = .cols()) %>% set_mode("regression") %>%
set_engine("randomForest", importance = TRUE) # bagging (mtry = all predictors)
model <- rand_forest(mtry = 6) %>% set_mode("regression") %>%
set_engine("randomForest", importance = TRUE) # random forest
model <- boost_tree(trees = 5000, tree_depth = 4) %>% set_mode("regression") %>%
set_engine("xgboost") # boosting
# Support vector machines
model <- svm_poly(degree = 1) %>% set_mode("classification") %>%
set_engine("kernlab", scaled = FALSE) # support vector classifier (linear kernel)
model <- svm_rbf() %>% set_mode("classification") %>%
set_engine("kernlab") # support vector machine (RBF kernel)
# Fit via the model object
fit <-
model %>%
# set_args(...) %>% # optionally override arguments before fitting
fit(y ~ x1 + x2, data = train_data); fit
xy_fit <-
model %>%
fit_xy(
x = train_data %>% select(x1, x2),
y = train_data %>% pull(y)
); xy_fit
IV. Feature Engineering
# Transform variables
rec_spec <- recipe(y ~ x1 + x2 + x3, data = data) %>%
# Basic transformations
step_interact(~ x1:x2 + x2:x3) %>% # create interaction terms
step_mutate(x2_2 = x2 ^ 2) %>% # square x2
step_log(y) %>% # log-transform y
# Polynomials and basis functions
step_poly(x1, degree = 2, # polynomial expansion, orthogonal by default
options = list(raw = TRUE)) %>% # use raw rather than orthogonal polynomials
step_discretize(x1, num_breaks = 4) %>% # bin a continuous variable into a 4-level factor
step_cut(y, breaks = c(30, 50, 70)) %>% # set the break points manually
step_bs(x1, options = list( # B-spline basis (splines)
knots = c(25, 40, 60))) %>%
# Other transformations
step_novel(all_nominal_predictors()) %>% # handle factor levels not seen during training
step_dummy(all_nominal_predictors()) %>% # turn factor variables into dummy variables
step_zv(all_predictors()) %>% # drop zero-variance (single-valued) columns
step_normalize(all_predictors()) %>% # center and scale numeric predictors
# Dimension reduction; if used, PCA should be the last step
step_pca(all_predictors(), threshold = tune())
# Partial least squares is a supervised alternative to PCA:
# step_pls(all_predictors(), num_comp = tune(), outcome = "y")
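To see what a recipe actually produces, it can be estimated and applied outside a workflow; a short sketch, assuming a recipe with no unresolved tune() placeholders (prep() estimates the steps on the training data, bake() applies them):
rec_prepped <- prep(rec_spec, training = train_data)
bake(rec_prepped, new_data = NULL) # the processed training data
bake(rec_prepped, new_data = test_data) # the same steps applied to new data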
# Create a workflow
wf <- workflow() %>%
add_recipe(rec_spec) %>%
add_model(model)
# Fit via the workflow object
fit <- wf %>% fit(data = train_data)
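Components can be pulled back out of a fitted workflow with the workflows extractors:
extract_fit_parsnip(fit) # the fitted parsnip model
extract_recipe(fit) # the estimated (prepped) recipe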
V. Hyperparameter Tuning
# Tag hyperparameters with tune()
# Tune the polynomial degree
recipe_tuned <- recipe(y ~ x1, data = train_data) %>%
step_poly(x1, degree = tune())
degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
# Tune the penalty in ridge regression (penalty() ranges are on the log10 scale)
model_tuned <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_mode("regression") %>%
set_engine("glmnet")
penalty_grid <- grid_regular(penalty(range = c(-5, 5)), levels = 50)
# Tune PCA and PLS
threshold_grid <- grid_regular(threshold(), levels = 10) # pca
num_comp_grid <- grid_regular(num_comp(c(1, 20)), levels = 10) # pls
# Tune the complexity parameter of a decision tree
model_tuned <- decision_tree() %>% set_engine("rpart") %>%
set_mode("classification") %>%
set_args(cost_complexity = tune())
param_grid <- grid_regular(cost_complexity(range = c(-3, -1)), levels = 10)
# Tune the cost parameter of a support vector machine
model_tuned <-
svm_poly(degree = 1) %>% set_mode("classification") %>%
set_engine("kernlab", scaled = FALSE) %>%
set_args(cost = tune())
param_grid <- grid_regular(cost(), levels = 10)
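Several tune() tags can be searched jointly by giving grid_regular() more than one dials parameter; a sketch, assuming a recipe degree and a model penalty are both tagged with tune():
param_grid <- grid_regular(
degree(range = c(1, 3)),
penalty(range = c(-5, 0)),
levels = c(3, 20) # number of grid levels per parameter
)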
# Add to a workflow
wf_tuned <- workflow() %>%
add_recipe(recipe_tuned) %>%
add_model(model_tuned)
# Run the grid search
res_tuned <- tune_grid(
object = wf_tuned,
resamples = folds,
grid = param_grid,
control = control_grid(verbose = TRUE) # show progress
)
# Inspect the results; tuned_var stands for the tuned hyperparameter, and "rmse" can be swapped for "rsq"
autoplot(res_tuned) # plot performance across the grid
collect_metrics(res_tuned) # the data behind the plot
show_best(res_tuned, metric = "rmse") # the top candidates
select_by_one_std_err(res_tuned, tuned_var, metric = "rmse") # select by the one-standard-error rule
select_by_pct_loss(res_tuned, tuned_var, metric = "rmse") # select by acceptable percent loss
# Pick the best parameter values
best_var <- select_by_one_std_err(res_tuned, tuned_var, metric = "rmse")
# Finalize the workflow
final_wf <- finalize_workflow(wf_tuned, best_var)
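The finalized workflow is then fit like any other; last_fit() refits on the training data and scores the test set of the original split in one step (a sketch, assuming the split object from section II):
final_fit <- fit(final_wf, data = train_data)
final_res <- last_fit(final_wf, split = split)
collect_metrics(final_res) # test-set performance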
VI. Inspecting Results
1. Regression
# Predictions
predict(fit, new_data = test_data)
fit %>%
extract_fit_engine() %>%
vcov() # covariance matrix of the coefficients
fit %>%
extract_fit_engine() %>%
summary() # engine-level fit summary
tidy(fit) # coefficients, p-values, etc.
glance(fit) # model-level statistics
# Compare fitted values with the observed outcome
data %>%
select(y) %>%
bind_cols(predict(fit, data)) %>%
# Add 95% prediction intervals to the results:
bind_cols(predict(fit, data, type = "pred_int"))
# Plot the fitted curve
regression_lines <- bind_cols(
augment(fit, new_data = train_data),
predict(fit, new_data = train_data, type = "conf_int")
)
data %>%
ggplot(aes(x, y)) +
geom_point(alpha = 0.2) +
geom_line(aes(y = .pred), color = "darkgreen",
data = regression_lines) +
geom_line(aes(y = .pred_lower), data = regression_lines,
linetype = "dashed", color = "blue") +
geom_line(aes(y = .pred_upper), data = regression_lines,
linetype = "dashed", color = "blue")
# Compute the RMSE
augment(fit, new_data = test_data) %>%
rmse(truth = y, estimate = .pred)
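Several regression metrics can be computed in one pass with a yardstick metric set:
reg_metrics <- metric_set(rmse, rsq, mae)
augment(fit, new_data = test_data) %>%
reg_metrics(truth = y, estimate = .pred)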
# Regularized regression
# Predict with different penalty values (lambda)
predict(fit, new_data = test_data, penalty = 500)
tidy(fit, penalty = 1)
# Plot the coefficient paths across lambda
fit %>%
extract_fit_engine() %>%
plot(xvar = "lambda") # increasing the penalty shrinks the coefficients toward 0
2. Classification
# Predictions
predict(fit, new_data = test_data, type = "prob") # type = "prob" gives class probabilities; omit it for hard class predictions
# Confusion matrix
augment(fit, new_data = test_data) %>%
conf_mat(truth = y, estimate = .pred_class) %>% # the matrix
autoplot(type = "heatmap") # as a heatmap
# Accuracy
augment(lr_fit, new_data = test_data) %>%
accuracy(truth = y, estimate = .pred_class)
3. Comparing Models
models <- list("logistic regression" = lr_fit,
"LDA" = lda_fit,
"QDA" = qda_fit,
"KNN" = knn_fit)
preds <- imap_dfr(models, augment,
new_data = test_data, .id = "model")
preds %>%
select(model, y, .pred_class, .pred_Down, .pred_Up) # .pred_* columns are named after the outcome's factor levels (here "Down" and "Up")
multi_metric <- metric_set(accuracy, sensitivity, specificity)
preds %>%
group_by(model) %>%
multi_metric(truth = y, estimate = .pred_class)
preds %>%
group_by(model) %>%
roc_curve(y, .pred_Down) %>%
autoplot()
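To rank the models numerically as well, the same grouped predictions give one AUC per model:
preds %>%
group_by(model) %>%
roc_auc(y, .pred_Down)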
4. Decision Trees
# Draw the tree
library(rpart.plot)
tree_fit %>%
extract_fit_engine() %>%
rpart.plot()
# Classification tree
# Accuracy
augment(class_tree_fit, new_data = train_data) %>%
accuracy(truth = y, estimate = .pred_class)
# Confusion matrix
augment(class_tree_fit, new_data = train_data) %>%
conf_mat(truth = y, estimate = .pred_class)
# Regression tree
# Root mean squared error (RMSE)
augment(reg_tree_fit, new_data = train_data) %>%
rmse(truth = medv, estimate = .pred)
# Plot predictions against observed values
augment(bagging_fit, new_data = test_data) %>%
ggplot(aes(medv, .pred)) +
geom_abline() +
geom_point(alpha = 0.5)
# Variable importance
library(vip)
vip(bagging_fit)
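vip() draws the plot; vi() from the same package returns the scores as a tibble, here applied to the underlying engine fit:
bagging_fit %>%
extract_fit_engine() %>%
vi() # variable importance as a table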
5. Support Vector Machines
# Support vector classifier
# Plot the fitted classifier (kernlab's plot method)
library(kernlab)
svm_linear_fit %>%
extract_fit_engine() %>%
plot()
# Support vector machine
# Confusion matrix
augment(svm_rbf_fit, new_data = test_data) %>%
conf_mat(truth = y, estimate = .pred_class)
# Build and plot the ROC curve
augment(svm_rbf_fit, new_data = test_data) %>%
roc_curve(truth = y, .pred_1) %>% # ROC data; .pred_1 is the event-class probability column
autoplot() # draw the curve
# Area under the ROC curve
augment(svm_rbf_fit, new_data = test_data) %>%
roc_auc(truth = y, .pred_1)