I. Basic Setup
library(tidyverse); library(tidymodels)
tidymodels_prefer() # resolve function-name conflicts in favor of tidymodels
data(ames, package = "modeldata"); dim(ames)
II. Sampling
# Random split
split <- initial_split(data, prop = 0.8)
# Stratified split; strata sets the variable to stratify on
split <- initial_split(data, prop = 0.8, strata = x1)
# For time series data, the most recent observations usually form the test set
time_split <- initial_time_split(data, prop = 0.8)
train_data <- training(split)
test_data <- testing(split)
# K-fold cross-validation
folds <- vfold_cv(train_data, v = 10) # 10-fold CV
# Bootstrap
boots <- bootstraps(data, times = 1000) # 1000 bootstrap resamples
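A minimal end-to-end sketch of this step on the ames data loaded above (the seed, proportion, and strata choice are illustrative):
set.seed(123)
ames_split <- initial_split(ames, prop = 0.8, strata = Sale_Price)
ames_train <- training(ames_split)
ames_test <- testing(ames_split)
ames_folds <- vfold_cv(ames_train, v = 10, strata = Sale_Price) # stratified folds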
III. Model Specification
# Launch an addin to browse and generate parsnip model code
parsnip_addin()
# Show how a model spec maps to the engine's fitting call
translate(tree_model)
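Relatedly, show_engines() lists the engines and modes available for a given model type:
show_engines("linear_reg") # available engines for a model spec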
# Classical regression
model <- linear_reg() %>% set_mode("regression") %>%
set_engine("lm") # linear regression
# Other engines for linear_reg(): "glmnet" (regularized) and "stan" (Bayesian)
# Regularized regression: mixture = 0 gives ridge, mixture = 1 gives lasso, values in between give the elastic net
model <- linear_reg(mixture = 0, penalty = 0) %>% set_mode("regression") %>%
set_engine("glmnet") # ridge regression
model <- logistic_reg() %>% set_mode("classification") %>%
set_engine("glmnet") # logistic regression
library(poissonreg)
model <- poisson_reg() %>% set_mode("regression") %>%
set_engine("glm") # Poisson regression
library(discrim)
model <- naive_Bayes() %>% set_mode("classification") %>%
set_engine("klaR") %>% # naive Bayes
set_args(usekernel = FALSE) # assume the predictors follow Gaussian distributions
model <- nearest_neighbor(neighbors = 3) %>% set_mode("classification") %>%
set_engine("kknn") # K-nearest neighbors
model <- discrim_linear() %>% set_mode("classification") %>%
set_engine("MASS") # linear discriminant analysis (LDA)
model <- discrim_quad() %>% set_mode("classification") %>%
set_engine("MASS") # quadratic discriminant analysis (QDA)
# Tree-based methods
model <- decision_tree() %>% set_mode("classification") %>%
set_engine("rpart") # classification tree
model <- decision_tree() %>% set_mode("regression") %>%
set_engine("rpart") # regression tree
model <- rand_forest(mtry = .cols()) %>% set_mode("regression") %>%
set_engine("randomForest", importance = TRUE) # bagging (mtry = all predictors)
model <- rand_forest(mtry = 6) %>% set_mode("regression") %>%
set_engine("randomForest", importance = TRUE) # random forest
model <- boost_tree(trees = 5000, tree_depth = 4) %>% set_mode("regression") %>%
set_engine("xgboost") # boosting
# Support vector machines
model <- svm_poly(degree = 1) %>% set_mode("classification") %>%
set_engine("kernlab", scaled = FALSE) # support vector classifier (linear kernel)
model <- svm_rbf() %>% set_mode("classification") %>%
set_engine("kernlab") # support vector machine (RBF kernel)
# Fit via the model object
fit <-
model %>%
# set_args(...) %>% # optionally override arguments before fitting
fit(y ~ x1 + x2, data = train_data); fit
xy_fit <-
model %>%
fit_xy(
x = train_data %>% select(x1, x2),
y = train_data %>% pull(y)
); xy_fit
IV. Feature Engineering
# Transform variables
rec_spec <- recipe(y ~ x1 + x2 + x3, data = data) %>%
# Basic transformations
step_interact(~ x1:x2 + x2:x3) %>% # create interaction terms
step_mutate(x2_2 = x2 ^ 2) %>% # square x2
step_log(y) %>% # log-transform y
# Polynomials and basis functions
step_poly(x1, degree = 2, # polynomial expansion, orthogonal by default
options = list(raw = TRUE)) %>% # use raw rather than orthogonal polynomials
step_discretize(x1, num_breaks = 4) %>% # bin a continuous variable into a 4-level factor
step_cut(y, breaks = c(30, 50, 70)) %>% # set the break points manually
step_bs(x1, options = list( # B-spline basis (splines)
knots = c(25, 40, 60))) %>%
# Other transformations
step_novel(all_nominal_predictors()) %>% # handle factor levels not seen during training
step_dummy(all_nominal_predictors()) %>% # turn factor variables into dummy variables
step_zv(all_predictors()) %>% # drop zero-variance (single-valued) columns
step_normalize(all_predictors()) %>% # center and scale numeric predictors
# Dimension reduction; if used, PCA should be the last step
step_pca(all_predictors(), threshold = tune())
# Partial least squares is a supervised alternative to PCA:
# step_pls(all_predictors(), num_comp = tune(), outcome = "y")
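To see what a recipe actually produces, it can be estimated and applied outside a workflow; a short sketch, assuming a recipe with no unresolved tune() placeholders (prep() estimates the steps on the training data, bake() applies them):
rec_prepped <- prep(rec_spec, training = train_data)
bake(rec_prepped, new_data = NULL) # the processed training data
bake(rec_prepped, new_data = test_data) # the same steps applied to new data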
# Create a workflow
wf <- workflow() %>%
add_recipe(rec_spec) %>%
add_model(model)
# Fit via the workflow object
fit <- wf %>% fit(data = train_data)
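Components can be pulled back out of a fitted workflow with the workflows extractors:
extract_fit_parsnip(fit) # the fitted parsnip model
extract_recipe(fit) # the estimated (prepped) recipe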
V. Hyperparameter Tuning
# Tag hyperparameters with tune()
# Tune the polynomial degree
recipe_tuned <- recipe(y ~ x1, data = train_data) %>%
step_poly(x1, degree = tune())
degree_grid <- grid_regular(degree(range = c(1, 10)), levels = 10)
# Tune the penalty in ridge regression (penalty() ranges are on the log10 scale)
model_tuned <-
linear_reg(penalty = tune(), mixture = 0) %>%
set_mode("regression") %>%
set_engine("glmnet")
penalty_grid <- grid_regular(penalty(range = c(-5, 5)), levels = 50)
# Tune PCA and PLS
threshold_grid <- grid_regular(threshold(), levels = 10) # pca
num_comp_grid <- grid_regular(num_comp(c(1, 20)), levels = 10) # pls
# Tune the complexity parameter of a decision tree
model_tuned <- decision_tree() %>% set_engine("rpart") %>%
set_mode("classification") %>%
set_args(cost_complexity = tune())
param_grid <- grid_regular(cost_complexity(range = c(-3, -1)), levels = 10)
# Tune the cost parameter of a support vector machine
model_tuned <-
svm_poly(degree = 1) %>% set_mode("classification") %>%
set_engine("kernlab", scaled = FALSE) %>%
set_args(cost = tune())
param_grid <- grid_regular(cost(), levels = 10)
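Several tune() tags can be searched jointly by giving grid_regular() more than one dials parameter; a sketch, assuming a recipe degree and a model penalty are both tagged with tune():
param_grid <- grid_regular(
degree(range = c(1, 3)),
penalty(range = c(-5, 0)),
levels = c(3, 20) # number of grid levels per parameter
)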
# Add to a workflow
wf_tuned <- workflow() %>%
add_recipe(recipe_tuned) %>%
add_model(model_tuned)
# Run the grid search
res_tuned <- tune_grid(
object = wf_tuned,
resamples = folds,
grid = param_grid,
control = control_grid(verbose = TRUE) # show progress
)
# Inspect the results; tuned_var stands for the tuned hyperparameter, and "rmse" can be swapped for "rsq"
autoplot(res_tuned) # plot performance across the grid
collect_metrics(res_tuned) # the data behind the plot
show_best(res_tuned, metric = "rmse") # the top candidates
select_by_one_std_err(res_tuned, tuned_var, metric = "rmse") # select by the one-standard-error rule
select_by_pct_loss(res_tuned, tuned_var, metric = "rmse") # select by acceptable percent loss
# Pick the best parameter values
best_var <- select_by_one_std_err(res_tuned, tuned_var, metric = "rmse")
# Finalize the workflow
final_wf <- finalize_workflow(wf_tuned, best_var)
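The finalized workflow is then fit like any other; last_fit() refits on the training data and scores the test set of the original split in one step (a sketch, assuming the split object from section II):
final_fit <- fit(final_wf, data = train_data)
final_res <- last_fit(final_wf, split = split)
collect_metrics(final_res) # test-set performance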
VI. Inspecting Results
1. Regression
# Predictions
predict(fit, new_data = test_data)
fit %>%
extract_fit_engine() %>%
vcov() # covariance matrix of the coefficients
fit %>%
extract_fit_engine() %>%
summary() # engine-level fit summary
tidy(fit) # coefficients, p-values, etc.
glance(fit) # model-level statistics
# Compare fitted values with the observed outcome
data %>%
select(y) %>%
bind_cols(predict(fit, data)) %>%
# Add 95% prediction intervals to the results:
bind_cols(predict(fit, data, type = "pred_int"))
# Plot the fitted curve
regression_lines <- bind_cols(
augment(fit, new_data = train_data),
predict(fit, new_data = train_data, type = "conf_int")
)
data %>%
ggplot(aes(x, y)) +
geom_point(alpha = 0.2) +
geom_line(aes(y = .pred), color = "darkgreen",
data = regression_lines) +
geom_line(aes(y = .pred_lower), data = regression_lines,
linetype = "dashed", color = "blue") +
geom_line(aes(y = .pred_upper), data = regression_lines,
linetype = "dashed", color = "blue")
# Compute the RMSE
augment(fit, new_data = test_data) %>%
rmse(truth = y, estimate = .pred)
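Several regression metrics can be computed in one pass with a yardstick metric set:
reg_metrics <- metric_set(rmse, rsq, mae)
augment(fit, new_data = test_data) %>%
reg_metrics(truth = y, estimate = .pred)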
# Regularized regression
# Predict with different penalty values (lambda)
predict(fit, new_data = test_data, penalty = 500)
tidy(fit, penalty = 1)
# Plot the coefficient paths across lambda
fit %>%
extract_fit_engine() %>%
plot(xvar = "lambda") # increasing the penalty shrinks the coefficients toward 0
2. Classification
# Predictions
predict(fit, new_data = test_data, type = "prob") # type = "prob" gives class probabilities; omit it for hard class predictions
# Confusion matrix
augment(fit, new_data = test_data) %>%
conf_mat(truth = y, estimate = .pred_class) %>% # the matrix
autoplot(type = "heatmap") # as a heatmap
# Accuracy
augment(lr_fit, new_data = test_data) %>%
accuracy(truth = y, estimate = .pred_class)
3. Comparing Models
models <- list("logistic regression" = lr_fit,
"LDA" = lda_fit,
"QDA" = qda_fit,
"KNN" = knn_fit)
preds <- imap_dfr(models, augment,
new_data = test_data, .id = "model")
preds %>%
select(model, y, .pred_class, .pred_Down, .pred_Up) # .pred_* columns are named after the outcome's factor levels (here "Down" and "Up")
multi_metric <- metric_set(accuracy, sensitivity, specificity)
preds %>%
group_by(model) %>%
multi_metric(truth = y, estimate = .pred_class)
preds %>%
group_by(model) %>%
roc_curve(y, .pred_Down) %>%
autoplot()
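To rank the models numerically as well, the same grouped predictions give one AUC per model:
preds %>%
group_by(model) %>%
roc_auc(y, .pred_Down)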
4. Decision Trees
# Draw the tree
library(rpart.plot)
tree_fit %>%
extract_fit_engine() %>%
rpart.plot()
# Classification tree
# Accuracy
augment(class_tree_fit, new_data = train_data) %>%
accuracy(truth = y, estimate = .pred_class)
# Confusion matrix
augment(class_tree_fit, new_data = train_data) %>%
conf_mat(truth = y, estimate = .pred_class)
# Regression tree
# Root mean squared error (RMSE)
augment(reg_tree_fit, new_data = train_data) %>%
rmse(truth = medv, estimate = .pred)
# Plot predictions against observed values
augment(bagging_fit, new_data = test_data) %>%
ggplot(aes(medv, .pred)) +
geom_abline() +
geom_point(alpha = 0.5)
# Variable importance
library(vip)
vip(bagging_fit)
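vip() draws the plot; vi() from the same package returns the scores as a tibble, here applied to the underlying engine fit:
bagging_fit %>%
extract_fit_engine() %>%
vi() # variable importance as a table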
5. Support Vector Machines
# Support vector classifier
# Plot the fitted classifier (kernlab's plot method)
library(kernlab)
svm_linear_fit %>%
extract_fit_engine() %>%
plot()
# Support vector machine
# Confusion matrix
augment(svm_rbf_fit, new_data = test_data) %>%
conf_mat(truth = y, estimate = .pred_class)
# Build and plot the ROC curve
augment(svm_rbf_fit, new_data = test_data) %>%
roc_curve(truth = y, .pred_1) %>% # ROC data; .pred_1 is the event-class probability column
autoplot() # draw the curve
# Area under the ROC curve
augment(svm_rbf_fit, new_data = test_data) %>%
roc_auc(truth = y, .pred_1)