# Simple linear regression of weight on height.
# Both variables are columns of the built-in `women` data set, so it is
# passed explicitly; without `data =` the call only works if objects named
# `weight` and `height` happen to exist in the calling environment.
fit <- lm(weight ~ height, data = women)







# Linear regression of weight on height through the origin (the `- 1`
# removes the intercept), fitted on the built-in `women` data set.
lm.model <- lm(weight ~ height - 1, data = women)

summary(lm.model)  # statistical summary of the fit

coefficients(lm.model)  # parameter estimates

# Confidence interval for the slope. "height" is the model's only
# coefficient; leaving `parm` out reports every coefficient.
confint(lm.model, parm = "height", level = 0.95)

fitted(lm.model)  # fitted values

anova(lm.model)  # analysis-of-variance table

vcov(lm.model)  # covariance matrix of the parameter estimates

residuals(lm.model)  # residuals

AIC(lm.model)  # Akaike information criterion

# Draw the diagnostic plots on a 2 x 2 grid, then put the previous layout
# back so later plots in the script are not squeezed into that grid.
old.par <- par(mfrow = c(2, 2))

plot(lm.model)  # diagnostic plots for the fitted model

par(old.par)





# Load the Affairs data set from the AER package.
data(Affairs, package = "AER")

# `affairs` is a count; collapse it to a two-level factor (No = 0,
# Yes = more than 0) so it can be the response of a logistic regression.
Affairs$ynaffair <- factor(Affairs$affairs > 0, levels = c(FALSE, TRUE),
                           labels = c("No", "Yes"))

# Fit the logistic regression model.
model.L <- glm(ynaffair ~ age + yearsmarried + religiousness + rating,
               data = Affairs, family = binomial(link = logit))

summary(model.L)  # detailed results of the fit

predictdata <- Affairs[, c("age", "yearsmarried", "religiousness", "rating")]

# The model gives one probability per observation; label it "Yes" when the
# probability exceeds the 0.4 cut-off and "No" otherwise.
prob <- predict(model.L, predictdata, type = "response")
predictdata$y <- ifelse(prob > 0.4, "Yes", "No")

# Confusion matrix. The predictions get the same two levels as the actual
# classes, so the table is always 2 x 2 with matching row/column order even
# if one class is never predicted; diag() below depends on that.
confusion <- table(actual = Affairs$ynaffair,
                   predictedclass = factor(predictdata$y,
                                           levels = c("No", "Yes")))

# Misclassification rate: share of observations off the diagonal.
(sum(confusion) - sum(diag(confusion))) / sum(confusion)





library(car)  # provides outlierTest(), which is not attached anywhere else

fit <- lm(weight ~ height, data = women)  # fit the linear model

outlierTest(fit)  # Bonferroni outlier test on the original data

# Overwrite observation 10 with height = 70, weight = 200.
women[10, ] <- c(70, 200)

fit <- lm(weight ~ height, data = women)  # refit on the modified data

outlierTest(fit)  # Bonferroni outlier test on the modified data






Data <- freeny

# Full linear model: y regressed on every other column of `freeny`.
# NOTE: the object shares its name with the function `lm`; calls such as
# `lm(...)` still reach the function because R skips non-function objects
# when it looks up the name of a call.
lm <- lm(y ~ ., data = Data)

# Stepwise selection: terms may be added or dropped at every step.
lm.step <- step(lm, direction = "both")

# Forward selection. It must start from the intercept-only model and be
# given the full model as the upper scope; started from the full model
# there is nothing left to add and step() returns it unchanged.
lm.null <- lm(y ~ 1, data = Data)
lm.step <- step(lm.null, scope = formula(lm), direction = "forward")

# Backward elimination, starting from the full model.
lm.step <- step(lm, direction = "backward")




# Load the customer data (GB2312-encoded CSV).
# NOTE(review): this section opens "Telephone.csv" while later sections open
# "telephone.csv" - confirm the real file name on case-sensitive systems.
Data <- read.csv("Telephone.csv", fileEncoding = "GB2312")

# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# Decision tree predicting customer churn.
library(party)

# ctree() fits a conditional inference tree on all remaining columns.
ctree.model <- ctree(流失 ~ ., data = traindata)
plot(ctree.model, type = "simple")  # draw the fitted tree

# Predicted classes: no `newdata` for the training rows, `testdata` for
# the held-out rows.
train_predict <- predict(ctree.model)
test_predict <- predict(ctree.model, newdata = testdata)

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion



# Load the customer data (GB2312-encoded CSV).
Data <- read.csv("telephone.csv", fileEncoding = "GB2312")

# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# Decision tree predicting customer churn, using the `tree` package.
library(tree)

tree.model <- tree(流失 ~ ., data = traindata)

# Draw the tree with uniform branch lengths.
# NOTE(review): no text(tree.model) call follows, so the splits are
# presumably left unlabelled - confirm that is intended.
plot(tree.model, type = "uniform")

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(tree.model, type = "class")
test_predict <- predict(tree.model, newdata = testdata, type = "class")

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion



# Load the customer data (GB2312-encoded CSV).
Data <- read.csv("telephone.csv", fileEncoding = "GB2312")

# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# C5.0 decision tree predicting customer churn.
library(C50)

c50.model <- C5.0(流失 ~ ., data = traindata)
plot(c50.model)  # draw the fitted tree

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(c50.model, newdata = traindata, type = "class")
test_predict <- predict(c50.model, newdata = testdata, type = "class")

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion



# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# Neural network model, using the `nnet` package.
library(nnet)

# Hyper-parameters.
size <- 10     # number of hidden-layer units
decay <- 0.05  # weight-decay parameter

nnet.model <- nnet(流失 ~ ., data = traindata, size = size, decay = decay)
summary(nnet.model)  # overview of the fitted network

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(nnet.model, newdata = traindata, type = "class")
test_predict <- predict(nnet.model, newdata = testdata, type = "class")

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion



# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# k-nearest-neighbour classification with kknn().
library(kknn)

# kknn() takes the training and the query set in one call, so the model is
# built twice with k = 5: once queried on the training rows, once on the
# held-out rows.
kknn.model <- kknn(流失 ~ ., train = traindata, test = traindata, k = 5)
kknn.model2 <- kknn(流失 ~ ., train = traindata, test = testdata, k = 5)

summary(kknn.model)  # overview of the model queried on the training rows

# Predicted classes taken from each of the two fits.
train_predict <- predict(kknn.model)
test_predict <- predict(kknn.model2)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion

# k-nearest-neighbour classification with knn() from the `class` package.
library(class)

# Use the predictor columns only. Passing the whole data frame would hand
# the target column to knn() as a feature, so the labels being predicted
# would leak into the distance computation.
# NOTE(review): knn() works on numeric matrices - confirm every remaining
# column is numeric.
feature_cols <- setdiff(names(traindata), "流失")

# With no `k` given, knn() uses its default number of neighbours.
knn.model <- knn(train = traindata[, feature_cols, drop = FALSE],
                 test = testdata[, feature_cols, drop = FALSE],
                 cl = traindata[["流失"]])

# Confusion matrix on the test data (knn() returns the test predictions).
(test_confusion <- table(actual = testdata$流失, predictedclass = knn.model))

# k-nearest-neighbour classification with train() from the `caret` package.
library(caret)

# Use the predictor columns only. Passing the whole data frame as `x` would
# include the target among the predictors, letting the model read the
# answer it is asked to predict.
feature_cols <- setdiff(names(traindata), "流失")

train.model <- train(x = traindata[, feature_cols, drop = FALSE],
                     y = traindata[["流失"]], method = "knn")

# Predicted classes, computed from the same predictor columns.
train_predict <- predict(train.model,
                         newdata = traindata[, feature_cols, drop = FALSE])
test_predict <- predict(train.model,
                        newdata = testdata[, feature_cols, drop = FALSE])

# Confusion matrix on the training data.
(train_confusion <- table(actual = traindata$流失,
                          predictedclass = train_predict))

# Confusion matrix on the test data.
(test_confusion <- table(actual = testdata$流失,
                         predictedclass = test_predict))




# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# Naive Bayes classifier with naiveBayes() from the `e1071` package.
library(e1071)

naiveBayes.model <- naiveBayes(流失 ~ ., data = traindata)

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(naiveBayes.model, newdata = traindata)
test_predict <- predict(naiveBayes.model, newdata = testdata)

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion

# Naive Bayes classifier with NaiveBayes() from the `klaR` package.
library(klaR)

NaiveBayes.model <- NaiveBayes(流失 ~ ., data = traindata)

# predict() is called without `newdata` for the training rows and with
# `testdata` for the held-out rows; the predicted labels are read from the
# `class` element of each result.
train_predict <- predict(NaiveBayes.model)
test_predict <- predict(NaiveBayes.model, newdata = testdata)

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata,
                                predictedclass = train_predict[["class"]],
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict[["class"]])
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata,
                               predictedclass = test_predict[["class"]],
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict[["class"]])
test_confusion



# The churn column is the class label, so store it as a factor.
Data[, "流失"] <- as.factor(Data[, "流失"])

set.seed(1234)  # fix the RNG seed so the split is reproducible

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))

traindata <- Data[ind == 1, ]

testdata <- Data[ind == 2, ]

# Linear discriminant analysis.
# lda() lives in MASS; attach it here so this section does not depend on
# another package having attached MASS as a side effect.
library(MASS)

lda.model <- lda(流失 ~ ., data = traindata)

# predict() on the lda model returns a list; its `class` element holds the
# predicted labels.
train_predict <- predict(lda.model, newdata = traindata)  # training rows

test_predict <- predict(lda.model, newdata = testdata)  # held-out rows

# Training data with the predicted class appended as a column.
train_predictdata <- cbind(traindata, predictedclass = train_predict$class)

# Confusion matrix on the training data.
(train_confusion <- table(actual = traindata$流失,
                          predictedclass = train_predict$class))

# Test data with the predicted class appended as a column.
test_predictdata <- cbind(testdata, predictedclass = test_predict$class)

# Confusion matrix on the test data.
(test_confusion <- table(actual = testdata$流失,
                         predictedclass = test_predict$class))



# The churn column is the class label, so store it as a factor.
Data[, "流失"] <- as.factor(Data[, "流失"])

set.seed(1234)  # fix the RNG seed so the split is reproducible

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))

traindata <- Data[ind == 1, ]

testdata <- Data[ind == 2, ]

# Classification tree with rpart.
# Neither package is attached anywhere else in the script, so the calls
# below would fail with "could not find function" without these lines.
library(rpart)       # rpart()
library(rpart.plot)  # rpart.plot()

# cp is the complexity parameter controlling how far the tree is grown.
rpart.model <- rpart(流失 ~ ., data = traindata, method = "class", cp = 0.03)

# Draw the fitted tree.
rpart.plot(rpart.model, branch = 1, branch.type = 2, type = 1, extra = 102,
           border.col = "blue", split.col = "red",
           split.cex = 1, main = "客户流失决策树")

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(rpart.model, newdata = traindata, type = "class")

test_predict <- predict(rpart.model, newdata = testdata, type = "class")

# Training data with the predicted class appended as a column.
train_predictdata <- cbind(traindata, predictedclass = train_predict)

# Confusion matrix on the training data.
(train_confusion <- table(actual = traindata$流失,
                          predictedclass = train_predict))

# Test data with the predicted class appended as a column.
test_predictdata <- cbind(testdata, predictedclass = test_predict)

# Confusion matrix on the test data.
(test_confusion <- table(actual = testdata$流失,
                         predictedclass = test_predict))



# The churn column is the class label, so store it as a factor.
Data[, "流失"] <- as.factor(Data[, "流失"])

set.seed(1234)  # fix the RNG seed so the split is reproducible

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))

traindata <- Data[ind == 1, ]

testdata <- Data[ind == 2, ]

# Bagging classifier.
# The predictions below are read from a `$class` element, which matches the
# list returned by adabag's predict method, so that package is attached
# here; it is not attached anywhere else in the script.
library(adabag)

bagging.model <- bagging(流失 ~ ., data = traindata)

# Predictions for the training rows and for the held-out rows.
train_predict <- predict(bagging.model, newdata = traindata)

test_predict <- predict(bagging.model, newdata = testdata)

# Training data with the predicted class appended as a column.
train_predictdata <- cbind(traindata, predictedclass = train_predict$class)

# Confusion matrix on the training data.
(train_confusion <- table(actual = traindata$流失,
                          predictedclass = train_predict$class))

# Test data with the predicted class appended as a column.
test_predictdata <- cbind(testdata, predictedclass = test_predict$class)

# Confusion matrix on the test data.
(test_confusion <- table(actual = testdata$流失,
                         predictedclass = test_predict$class))



# The churn column is the class label, so store it as a factor.
Data[, "流失"] <- as.factor(Data[, "流失"])

set.seed(1234)  # fix the RNG seed so the split is reproducible

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))

traindata <- Data[ind == 1, ]

testdata <- Data[ind == 2, ]

# Random forest classifier.
# The package is not attached anywhere else in the script, so the call
# below would fail with "could not find function" without this line.
library(randomForest)

randomForest.model <- randomForest(流失 ~ ., data = traindata)

# Predicted classes for the held-out rows.
test_predict <- predict(randomForest.model, newdata = testdata)

# Confusion matrix stored in the fitted model. For randomForest this is the
# out-of-bag confusion matrix of the training data, not the result of
# re-predicting the training rows.
(train_confusion <- randomForest.model$confusion)

# Confusion matrix on the test data.
(test_confusion <- table(actual = testdata$流失,
                         predictedclass = test_predict))



# The churn column is the class label, so store it as a factor.
Data[["流失"]] <- as.factor(Data[, "流失"])

# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)

# Draw a partition id per row: 1 (training, p = 0.7) or 2 (test, p = 0.3).
ind <- sample(2, nrow(Data), replace = TRUE, prob = c(0.7, 0.3))
traindata <- Data[which(ind == 1), ]
testdata <- Data[which(ind == 2), ]

# Support vector machine classifier on all remaining columns.
svm.model <- svm(流失 ~ ., data = traindata)

# Predicted classes for the training rows and for the held-out rows.
train_predict <- predict(svm.model, newdata = traindata)
test_predict <- predict(svm.model, newdata = testdata)

# Training data with the predicted class appended as a column.
train_predictdata <- data.frame(traindata, predictedclass = train_predict,
                                check.names = FALSE)

# Confusion matrix on the training data (stored, then printed).
train_confusion <- table(actual = traindata[["流失"]],
                         predictedclass = train_predict)
train_confusion

# Test data with the predicted class appended as a column.
test_predictdata <- data.frame(testdata, predictedclass = test_predict,
                               check.names = FALSE)

# Confusion matrix on the test data (stored, then printed).
test_confusion <- table(actual = testdata[["流失"]],
                        predictedclass = test_predict)
test_confusion






# ROC and precision-recall curves for the LDA model.
# prediction() and performance() come from ROCR, which is not attached
# anywhere else in the script.
library(ROCR)

# Predictions from the LDA model; the `posterior` element holds the class
# probabilities and its second column is used as the score below.
train_predict <- predict(lda.model, newdata = traindata)  # training rows

test_predict <- predict(lda.model, newdata = testdata)  # held-out rows

# Two panels side by side (ROC, then PR). The previous layout is restored
# at the end of the section so later plots are not affected.
old.par <- par(mfrow = c(1, 2))

# ---- ROC curves ----

# Training set.
predi <- prediction(train_predict$posterior[, 2], traindata$流失)

perfor <- performance(predi, "tpr", "fpr")

plot(perfor, col = "red", type = "l", main = "ROC曲线", lty = 1)

# Test set, overlaid on the same axes. `add = TRUE` draws onto the existing
# plot instead of starting a second plot on top with par(new = TRUE), which
# would redraw the axes and could use a different scale.
predi2 <- prediction(test_predict$posterior[, 2], testdata$流失)

perfor2 <- performance(predi2, "tpr", "fpr")

plot(perfor2, add = TRUE, col = "blue", type = "l", pch = 2, lty = 2)

abline(0, 1)  # diagonal reference line

legend("bottomright", legend = c("训练集", "测试集"), bty = "n",
       lty = c(1, 2), col = c("red", "blue"))

# ---- Precision-recall curves ----

# Training set.
perfor <- performance(predi, "prec", "rec")

plot(perfor, col = "red", type = "l", main = "PR曲线", xlim = c(0, 1),
     ylim = c(0, 1), lty = 1)

# Test set, overlaid on the same axes.
perfor2 <- performance(predi2, "prec", "rec")

plot(perfor2, add = TRUE, col = "blue", type = "l", pch = 2, lty = 2)

abline(1, -1)  # anti-diagonal reference line

legend("bottomleft", legend = c("训练集", "测试集"), bty = "n",
       lty = c(1, 2), col = c("red", "blue"))

par(old.par)





# ARIMA order selection for the sales series.
# armasubsets() comes from TSA, which is not attached anywhere else in the
# script.
library(TSA)

# The second column of the CSV is used as the series.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
Data <- read.csv("arima_data.csv", header = TRUE,
                 fileEncoding = "GB2312")[, 2]

sales <- ts(Data)

plot.ts(sales, xlab = "时间", ylab = "销量 / 元")

# First-order difference of the series.
difsales <- diff(sales)

# BIC-based subset selection over AR and MA orders up to 5.
res <- armasubsets(y = difsales, nar = 5, nma = 5, y.name = "test",
                   ar.method = "ols")

# Draw the BIC plot. NOTE(review): the original section announced a BIC
# plot but never plotted `res`; confirm this call is what was intended.
plot(res)




# Load rows 2 to 701 of the bank-loan data (GB2312-encoded CSV).
# NOTE(review): the first data row is skipped by the `2:701` index -
# confirm that is intended.
Data <- read.csv("bankloan.csv", fileEncoding = "GB2312")[2:701, ]

# Rename the columns: eight predictors and the response.
colnames(Data) <- c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "y")

# Full logistic regression model.
# NOTE: the object shares its name with the function `glm`; calls such as
# `glm(...)` still reach the function because R skips non-function objects
# when it looks up the name of a call.
glm <- glm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8,
           family = binomial(link = logit), data = Data)

# Stepwise selection: terms may be added or dropped at every step.
logit.step <- step(glm, direction = "both")

# Forward selection. It must start from the intercept-only model and be
# given the full model as the upper scope; started from the full model
# there is nothing left to add and step() returns it unchanged.
glm.null <- glm(y ~ 1, family = binomial(link = logit), data = Data)
logit.step <- step(glm.null, scope = formula(glm), direction = "forward")

# Backward elimination, starting from the full model.
logit.step <- step(glm, direction = "backward")




# Load the sales data (GB2312-encoded CSV) and keep columns 2 to 5.
data <- read.csv("sales_data.csv", fileEncoding = "GB2312")
data <- data[, 2:5]

# Rename the columns: three attributes and the class label.
names(data) <- c("x1", "x2", "x3", "result")

# Shannon entropy (in bits) of the values in one column.
#
# data: a vector, factor or one-column data frame of categorical values.
# Returns a single number: -sum(p * log2(p)) over the categories that
# actually occur.
calculateEntropy <- function(data) {
  counts <- table(data)
  # Drop categories with no observations; they would contribute
  # 0 * log2(0), which is NaN.
  counts <- counts[counts != 0]
  p <- counts / sum(counts)
  -sum(p * log2(p))
}

# Conditional entropy of the second column given the first.
#
# data: a two-column data frame; the entropy of column 2 is computed within
#       each distinct value of column 1.
# Returns the per-value entropies weighted by the relative frequency of
# each value of column 1, summed into a single number.
calculateEntropy2 <- function(data) {
  groups <- table(data[[1]])
  weights <- groups / sum(groups)
  # One entropy per distinct value of column 1, in the same order as
  # `weights`.
  entropies <- vapply(names(groups), function(value) {
    calculateEntropy(data[[2]][which(data[[1]] == value)])
  }, numeric(1))
  sum(entropies * weights)
}

# Recursively build and print a decision tree that, at every node, splits
# on the attribute with the smallest conditional entropy of the class.
#
# data: a data frame whose last column is named `result` (the class label);
#       every other column is treated as a categorical attribute.
# The tree is written to the console as `attribute[value:subtree;...]`,
# with a leaf shown as its class label. Returns NULL invisibly.
#
# NOTE(review): the bodies of the two stopping rules, the output statements
# and the closing braces were missing from this definition. They are
# reconstructed here from the surrounding logic - confirm the output format.
buildTree <- function(data) {
  # Stopping rule 1: every remaining row has the same class.
  if (length(unique(data$result)) == 1) {
    cat(as.character(data$result[1]))
    return(invisible(NULL))
  }

  # Stopping rule 2: only the class column is left; report the most
  # frequent class (nothing is printed when there are no rows).
  if (length(names(data)) == 1) {
    cat(names(which.max(table(data$result))))
    return(invisible(NULL))
  }

  labels <- names(data)

  # Conditional entropy of the class given each attribute. The last column
  # is the class, so only the columns before it are candidates.
  subentropy <- vapply(seq_len(length(labels) - 1), function(i) {
    calculateEntropy2(data[c(i, length(labels))])
  }, numeric(1))

  # Split on the first attribute that attains the minimum.
  label <- labels[which.min(subentropy)]

  cat(label, "[", sep = "")

  # The chosen attribute is removed before recursing into each branch.
  nextLabels <- labels[labels != label]

  for (value in unique(as.character(data[[label]]))) {
    cat(value, ":", sep = "")
    buildTree(data[which(data[[label]] == value), nextLabels, drop = FALSE])
    cat(";")
  }

  cat("]")
  invisible(NULL)
}

# Build the classification tree for the sales data.
buildTree(data)




# Load the sales data (GB2312-encoded CSV) and keep columns 2 to 5.
Data <- read.csv("sales_data.csv", fileEncoding = "GB2312")
Data <- Data[, 2:5]

# Rename the columns: three predictors and the response.
names(Data) <- c("x1", "x2", "x3", "y")

# The response is the class label, so store it as a factor.
Data[["y"]] <- as.factor(Data$y)

# Final model: a neural network fitted on all three predictors.
model1 <- nnet(y ~ ., data = Data, size = 6, decay = 5e-4, maxit = 1000)

# Predicted class for every row of the data the model was fitted on.
pred <- predict(model1, Data[, 1:3], type = "class")

# Share of rows whose predicted class equals the actual class.
P <- sum(pred == Data$y) / nrow(Data)
P

# Confusion matrix (actual classes in rows, predictions in columns).
table(Data$y, pred)

# The same matrix as row proportions.
prop.table(table(Data$y, pred), margin = 1)



