library(emo)
想看預告片可以點我
前言: 如何應用有限資料創造大數據的力量 以小雜貨店為例,帶入情境介紹預測性模型的基本觀念,以及如何應用。並以雜貨店老闆的提問,來帶入我們要預測的目標為何。 之後我們降利用此資料來預測顧客下一期行為 Y: 預測顧客會不會來買以及會買多少錢 購買金額 (amount) 基本資料檢視、資料視覺化,可以幫助我們快速了解這筆資料。 購買與否 (Buy) X: 如何重新彙整顧客資料以及產品銷售資訊
Chapter 1: 資料彙整流程
Z
rm(list=ls(all=T))
# Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
do.call-rbind-lapply
ComboZ = do.call(rbind, lapply(
dir('data/TaFengDataSet','.*csv$',full.names=T),
read.csv, header=F)
) %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z)
## [1] 817741
Z$date = as.Date(as.character(Z$date))
summary(Z)
## date cust age area
## Min. :2000-11-01 Min. : 1069 D :181213 E :312501
## 1st Qu.:2000-11-28 1st Qu.: 969222 E :151023 F :245213
## Median :2001-01-01 Median : 1587722 C :140805 G : 72092
## Mean :2000-12-30 Mean : 1406620 F : 99719 C : 71640
## 3rd Qu.:2001-01-30 3rd Qu.: 1854930 B : 66432 H : 40666
## Max. :2001-02-28 Max. :20002000 G : 53719 D : 38674
## (Other):124830 (Other): 36955
## cat prod qty
## Min. :100101 Min. :2.001e+07 Min. : 1.000
## 1st Qu.:110106 1st Qu.:4.710e+12 1st Qu.: 1.000
## Median :130106 Median :4.710e+12 Median : 1.000
## Mean :284951 Mean :4.462e+12 Mean : 1.382
## 3rd Qu.:520314 3rd Qu.:4.713e+12 3rd Qu.: 1.000
## Max. :780510 Max. :9.790e+12 Max. :1200.000
##
## cost price
## Min. : 0.0 Min. : 1.0
## 1st Qu.: 35.0 1st Qu.: 42.0
## Median : 62.0 Median : 76.0
## Mean : 112.1 Mean : 131.9
## 3rd Qu.: 112.0 3rd Qu.: 132.0
## Max. :432000.0 Max. :444000.0
##
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
## qty cost price
## 99% 6 858.0 1014.00
## 99.9% 14 2722.0 3135.82
## 99.95% 24 3799.3 3999.00
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z)
## [1] 817182
Z$tid = group_indices(Z, date, cust)
sapply(Z[,c("cust","cat","prod","tid")], n_distinct)
## cust cat prod tid
## 32256 2007 23789 119422
summary(Z)
## date cust age area
## Min. :2000-11-01 Min. : 1069 D :181089 E :312358
## 1st Qu.:2000-11-28 1st Qu.: 968775 E :150947 F :245079
## Median :2001-01-01 Median : 1587685 C :140721 G : 71905
## Mean :2000-12-30 Mean : 1406500 F : 99641 C : 71600
## 3rd Qu.:2001-01-30 3rd Qu.: 1854701 B : 66353 H : 40647
## Max. :2001-02-28 Max. :20002000 G : 53689 D : 38654
## (Other):124742 (Other): 36939
## cat prod qty cost
## Min. :100101 Min. :2.001e+07 Min. : 1.000 Min. : 0.0
## 1st Qu.:110106 1st Qu.:4.710e+12 1st Qu.: 1.000 1st Qu.: 35.0
## Median :130106 Median :4.710e+12 Median : 1.000 Median : 62.0
## Mean :284784 Mean :4.462e+12 Mean : 1.358 Mean : 106.2
## 3rd Qu.:520311 3rd Qu.:4.713e+12 3rd Qu.: 1.000 3rd Qu.: 112.0
## Max. :780510 Max. :9.790e+12 Max. :24.000 Max. :3798.0
##
## price tid
## Min. : 1.0 Min. : 1
## 1st Qu.: 42.0 1st Qu.: 28783
## Median : 76.0 Median : 59391
## Mean : 125.5 Mean : 58845
## 3rd Qu.: 132.0 3rd Qu.: 87391
## Max. :4000.0 Max. :119422
##
X
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 119422
summary(X)
## tid date cust age
## Min. : 1 Min. :2000-11-01 Min. : 1069 D :23775
## 1st Qu.: 29856 1st Qu.:2000-11-29 1st Qu.: 927093 C :19661
## Median : 59712 Median :2001-01-01 Median : 1615661 E :19596
## Mean : 59712 Mean :2000-12-31 Mean : 1402548 F :13992
## 3rd Qu.: 89567 3rd Qu.:2001-02-02 3rd Qu.: 1894493 B :10515
## Max. :119422 Max. :2001-02-28 Max. :20002000 G : 8493
## (Other):23390
## area items pieces total
## E :50532 Min. : 1.000 Min. : 1.000 Min. : 5
## F :33826 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 227
## G : 9498 Median : 5.000 Median : 6.000 Median : 510
## C : 8527 Mean : 6.843 Mean : 9.294 Mean : 859
## H : 7502 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1082
## D : 5108 Max. :112.000 Max. :339.000 Max. :30171
## (Other): 4429
## gross
## Min. :-1645.0
## 1st Qu.: 21.0
## Median : 68.0
## Mean : 132.3
## 3rd Qu.: 169.0
## Max. : 8069.0
##
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 54 81.0000 9009.579 1824.737
## 99.95% 62 94.2895 10611.579 2179.817
## 99.99% 82 133.0000 16044.401 3226.548
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
par(cex=0.8)
hist(X$date, "weeks", freq=T, border='lightgray', col='darkcyan',
las=2, main="No. Transaction per Week")
## Warning in axis(2, ...): "border" 不是一個繪圖參數
## Warning in axis(side, at = z, labels = labels, ...): "border" 不是一個繪圖
## 參數
A
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 33241
summary(A)
## cust r s f
## Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000
## 1st Qu.: 1088519 1st Qu.: 9.00 1st Qu.: 56.00 1st Qu.: 1.000
## Median : 1663402 Median : 26.00 Median : 92.00 Median : 2.000
## Mean : 1473585 Mean : 37.45 Mean : 80.78 Mean : 3.701
## 3rd Qu.: 1958089 3rd Qu.: 60.00 3rd Qu.:110.00 3rd Qu.: 4.000
## Max. :20002000 Max. :120.00 Max. :120.00 Max. :85.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -784.0 D :6580
## 1st Qu.: 365.0 1st Qu.: 707 1st Qu.: 75.0 C :5915
## Median : 705.7 Median : 1750 Median : 241.0 E :5080
## Mean : 993.1 Mean : 3152 Mean : 484.6 F :3719
## 3rd Qu.: 1291.0 3rd Qu.: 3968 3rd Qu.: 612.0 B :3193
## Max. :12636.0 Max. :127686 Max. :20273.0 G :2183
## (Other):5571
## area
## E :10800
## F : 8539
## G : 3695
## C : 3683
## D : 2166
## H : 1453
## (Other): 1905
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")
range(X$date)
## [1] "2000-11-01" "2001-02-28"
使用一月底(含2001-01-31)以前的資料,建立模型來預測每一位顧客:
Chapter 2: 資料準備流程
rm(list=ls(all=TRUE))
load("data/tf0.rdata")
Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
summary(X)
## tid date cust age
## Min. : 1 Min. :2000-11-01 Min. : 1069 D :17541
## 1st Qu.:22098 1st Qu.:2000-11-23 1st Qu.: 923910 C :14624
## Median :44194 Median :2000-12-12 Median : 1607000 E :14578
## Mean :44194 Mean :2000-12-15 Mean : 1395768 F :10354
## 3rd Qu.:66291 3rd Qu.:2001-01-12 3rd Qu.: 1888874 B : 7817
## Max. :88387 Max. :2001-01-31 Max. :20002000 G : 6308
## (Other):17165
## area items pieces total
## E :37496 Min. : 1.000 Min. : 1.000 Min. : 5.0
## F :25412 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
## G : 6787 Median : 5.000 Median : 6.000 Median : 522.0
## C : 6329 Mean : 6.994 Mean : 9.453 Mean : 888.7
## H : 5524 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
## D : 3655 Max. :112.000 Max. :339.000 Max. :30171.0
## (Other): 3184
## gross
## Min. :-1645.0
## 1st Qu.: 23.0
## Median : 72.0
## Mean : 138.3
## 3rd Qu.: 174.0
## Max. : 8069.0
##
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
A: 整理並在最後用來跑預測性模型之資料
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16899
A$amount
Simply a Left Joint
A = merge(A, feb, by="cust", all.x=T)
A$buy
A$buy = !is.na(A$amount)
summary(A)
## cust r s f
## Min. : 1069 Min. : 1.00 Min. : 1.00 Min. : 1.000
## 1st Qu.: 1060898 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Median : 1654100 Median :21.00 Median :68.00 Median : 2.000
## Mean : 1461070 Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.: 1945003 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :20002000 Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 D :5832
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 C :5238
## Median : 709.5 Median : 1566 Median : 218.0 E :4514
## Mean : 1012.4 Mean : 2711 Mean : 420.8 F :3308
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0 B :2802
## Max. :10634.0 Max. :99597 Max. :15565.0 G :1940
## (Other):4950
## area amount buy
## E :9907 Min. : 8 Mode :logical
## F :7798 1st Qu.: 454 FALSE:15342
## C :3169 Median : 993 TRUE :13242
## G :3052 Mean : 1499
## D :1778 3rd Qu.: 1955
## H :1295 Max. :28089
## (Other):1585 NA's :15342
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')
tapply(A$buy, A$area, mean) %>% barplot
abline(h = mean(A$buy), col='red')
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008 8576
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
set.seed(2018); spl2 = sample.split(A2$amount, SplitRatio=0.7)
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242 9601 3641
save(Z, X, A, spl, spl2, file="data/tf2.rdata")
Chapter3: 迴歸模型
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
TR = subset(A, spl)
TS = subset(A, !spl)
glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
summary(glm1)
##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7931 -0.8733 -0.6991 1.0384 1.8735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
## r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
## s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
## f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
## m -3.028e-05 2.777e-05 -1.090 0.27559
## rev 4.086e-05 1.940e-05 2.106 0.03521 *
## raw -2.306e-04 8.561e-05 -2.693 0.00708 **
## ageB -4.194e-02 8.666e-02 -0.484 0.62838
## ageC 1.772e-02 7.992e-02 0.222 0.82456
## ageD 7.705e-02 7.921e-02 0.973 0.33074
## ageE 8.699e-02 8.132e-02 1.070 0.28476
## ageF 1.928e-02 8.457e-02 0.228 0.81962
## ageG 1.745e-02 9.323e-02 0.187 0.85155
## ageH 1.752e-01 1.094e-01 1.602 0.10926
## ageI 6.177e-02 1.175e-01 0.526 0.59904
## ageJ 2.652e-01 1.047e-01 2.533 0.01131 *
## ageK -1.419e-01 1.498e-01 -0.947 0.34347
## areaB -4.105e-02 1.321e-01 -0.311 0.75603
## areaC -2.075e-01 1.045e-01 -1.986 0.04703 *
## areaD 3.801e-02 1.111e-01 0.342 0.73214
## areaE 2.599e-01 9.682e-02 2.684 0.00727 **
## areaF 1.817e-01 9.753e-02 1.863 0.06243 .
## areaG -4.677e-02 1.045e-01 -0.448 0.65435
## areaH -1.695e-01 1.232e-01 -1.375 0.16912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23295 on 19984 degrees of freedom
## AIC: 23343
##
## Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.69998
## [1] 0.6999767
colAUC(pred, TS$buy) # 0.7556
## [,1]
## FALSE vs. TRUE 0.7556038
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.04733 -0.23360 0.04677 0.28484 1.67880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.164e+00 4.891e-02 23.797 < 2e-16 ***
## r 4.027e-04 3.094e-04 1.301 0.193127
## s -3.277e-05 3.132e-04 -0.105 0.916668
## f 2.630e-02 1.704e-03 15.441 < 2e-16 ***
## m 5.136e-01 3.669e-02 14.000 < 2e-16 ***
## rev 5.205e-02 3.552e-02 1.465 0.142934
## ageB 2.510e-02 2.480e-02 1.012 0.311352
## ageC 8.571e-02 2.288e-02 3.746 0.000181 ***
## ageD 1.015e-01 2.256e-02 4.498 6.93e-06 ***
## ageE 9.441e-02 2.303e-02 4.100 4.16e-05 ***
## ageF 6.508e-02 2.399e-02 2.713 0.006684 **
## ageG 4.403e-02 2.626e-02 1.677 0.093550 .
## ageH 4.840e-02 3.089e-02 1.567 0.117186
## ageI 1.610e-02 3.252e-02 0.495 0.620547
## ageJ -3.803e-02 2.819e-02 -1.349 0.177398
## ageK 7.642e-02 3.955e-02 1.932 0.053373 .
## areaB 4.065e-02 4.130e-02 0.984 0.324960
## areaC -8.146e-03 3.348e-02 -0.243 0.807754
## areaD -2.134e-02 3.537e-02 -0.603 0.546389
## areaE -2.820e-02 3.056e-02 -0.923 0.356161
## areaF -6.067e-03 3.078e-02 -0.197 0.843755
## areaG -1.427e-03 3.318e-02 -0.043 0.965695
## areaH -2.296e-02 3.761e-02 -0.610 0.541547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4282 on 9578 degrees of freedom
## Multiple R-squared: 0.2972, Adjusted R-squared: 0.2956
## F-statistic: 184.1 on 22 and 9578 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
## [1] 0.2972137 0.2337665
Chapter4: 決策樹
Sys.setlocale("LC_ALL","C")
## [1] "C"
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
## [1] 20008 9601
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
##
## 0 1 2 3 4 5 6
## 18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
## [1] 28584 7
mx[1:5,]
## wday
## cust 0 1 2 3 4 5 6
## 1069 1 1 0 0 0 0 0
## 1113 2 1 0 0 0 0 1
## 1359 0 1 0 0 0 0 0
## 1823 0 1 0 1 1 0 0
## 2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
## wday
## cust 0 1 2 3 4 5
## 1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000
## 1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000
## 1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## 1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000
## 2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000
## wday
## cust 6
## 1069 0.0000000
## 1113 0.2500000
## 1359 0.0000000
## 1823 0.0000000
## 2189 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
head(A)
## cust W1 W2 W3 W4 W5 W6 W7 r s f m rev
## 1 1069 0.5 0.5000000 0.0 0.0000000 0.0000000 0.0 0.00 11 80 2 579.0 1158
## 2 1113 0.5 0.2500000 0.0 0.0000000 0.0000000 0.0 0.25 26 81 4 557.5 2230
## 3 1359 0.0 1.0000000 0.0 0.0000000 0.0000000 0.0 0.00 59 59 1 364.0 364
## 4 1823 0.0 0.3333333 0.0 0.3333333 0.3333333 0.0 0.00 8 91 3 869.0 2607
## 5 2189 0.0 0.0000000 0.0 0.5000000 0.0000000 0.0 0.50 29 61 2 7028.0 14056
## 6 3667 0.0 0.0000000 0.5 0.0000000 0.0000000 0.5 0.00 37 55 2 2379.5 4759
## raw age area amount buy
## 1 129 K E 786 TRUE
## 2 241 K F NA FALSE
## 3 104 K G NA FALSE
## 4 498 K D NA FALSE
## 5 3299 K B NA FALSE
## 6 351 K G 1570 TRUE
TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred = predict(rpart1, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70662
## [1] 0.7066231
colAUC(pred, TS$buy) # 0.6984
## [,1]
## FALSE vs. TRUE 0.6983998
rpart.plot(rpart1,cex=0.6)
rpart2 = rpart(buy ~ ., TR[,c(2:16,18)], method="class",cp=0.001)
pred = predict(rpart2, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
## predict
## actual FALSE TRUE
## FALSE 3878 725
## TRUE 1812 2161
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70417
## [1] 0.7041744
colAUC(pred, TS$buy) # 0.7169
## [,1]
## FALSE vs. TRUE 0.7168957
rpart.plot(rpart2,cex=0.6)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart3, TS2) - TS2$amount)^2)
1 - (SSE/SST)
## [1] 0.2172772
Chapter5: 交叉驗證與參數調校
點我看影片
Sys.setlocale("LC_ALL","C")
## [1] "C"
library(caret)
library(doParallel)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$buy = factor(ifelse(A$buy, "yes", "no")) # comply to the rule of caret
TR = A[spl, c(2:9,11)]
TS = A[!spl, c(2:9,11)]
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
## [1] 4
ctrl = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final", classProbs=TRUE,
summaryFunction=twoClassSummary)
rpart()
, Classification Treectrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
buy ~ ., data=TR, method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
## Time difference of 22.62487 secs
plot(cv.rpart)
cv.rpart$results
## cp ROC Sens Spec ROCSD SensSD SpecSD
## 1 2e-04 0.7084066 0.7598480 0.5791341 0.01036383 0.01447771 0.01901226
## 2 3e-04 0.7155533 0.7800992 0.5742276 0.01040769 0.02913860 0.02550630
## 3 4e-04 0.7178635 0.7970962 0.5671579 0.01550732 0.02893758 0.02621973
## 4 5e-04 0.7182132 0.8049660 0.5621939 0.01717161 0.02798866 0.02456524
## 5 6e-04 0.7169737 0.8104121 0.5605775 0.01699893 0.02177565 0.02136082
## 6 7e-04 0.7138179 0.8131120 0.5611177 0.01761331 0.01663849 0.01652607
## 7 8e-04 0.7087418 0.8162777 0.5594460 0.01303339 0.01317749 0.01386506
## 8 9e-04 0.7077703 0.8177675 0.5592302 0.01047743 0.01467971 0.01499422
## 9 1e-03 0.7075750 0.8181864 0.5591763 0.01057787 0.01443186 0.01550779
rpart1 = rpart(buy ~ ., TR, method="class", cp=0.0005)
predict(rpart1, TS, type="prob")[,2] %>%
colAUC(TS$buy)
## [,1]
## no vs. yes 0.7401771
glm()
, General Linear Model(邏輯式回歸)ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
buy ~ ., data=TR, method="glm",
trControl=ctrl, metric="ROC")
Sys.time() - t0
## Time difference of 5.473367 secs
cv.glm$results
## parameter ROC Sens Spec ROCSD SensSD SpecSD
## 1 none 0.7487897 0.8057559 0.568562 0.01017789 0.01321789 0.01461063
glm()
, Final Modelglm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)
## [,1]
## no vs. yes 0.7556038
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = A2[ spl2, c(2:10)]
TS2 = A2[!spl2, c(2:10)]
ctrl2 = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final")
rpart()
Regression Treectrl$repeats = 2
set.seed(2)
cv.rpart2 = train(
amount ~ ., data=TR2, method="rpart",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid(cp = seq(0.0008,0.0024,0.0001) ) )
plot(cv.rpart2)
cv.rpart2$results
## cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 0.0008 0.4305827 0.2905017 0.3303821 0.007464477 0.03219394 0.005085291
## 2 0.0009 0.4293302 0.2939861 0.3295331 0.008253932 0.03375719 0.004812277
## 3 0.0010 0.4292746 0.2940469 0.3295785 0.008023515 0.03436858 0.004455055
## 4 0.0011 0.4291072 0.2944978 0.3293753 0.007711396 0.03432309 0.004471973
## 5 0.0012 0.4292391 0.2941081 0.3296183 0.007848534 0.03501906 0.004701831
## 6 0.0013 0.4294395 0.2933438 0.3300743 0.007774047 0.03384411 0.004634104
## 7 0.0014 0.4298924 0.2919842 0.3303230 0.007834039 0.03382697 0.004660953
## 8 0.0015 0.4301352 0.2911416 0.3305803 0.008048291 0.03440911 0.004937880
## 9 0.0016 0.4305736 0.2897460 0.3309383 0.007600250 0.03343885 0.004733278
## 10 0.0017 0.4301419 0.2908523 0.3308101 0.008022654 0.03374871 0.004775529
## 11 0.0018 0.4304798 0.2896502 0.3310972 0.008164540 0.03515746 0.005038499
## 12 0.0019 0.4304533 0.2896931 0.3310614 0.008157967 0.03511973 0.005058048
## 13 0.0020 0.4309692 0.2879635 0.3315247 0.008736241 0.03667922 0.005503196
## 14 0.0021 0.4313581 0.2867167 0.3317925 0.008921059 0.03738517 0.005477327
## 15 0.0022 0.4316905 0.2856461 0.3320903 0.009050195 0.03710472 0.005613428
## 16 0.0023 0.4317147 0.2855091 0.3325167 0.009162067 0.03742653 0.005620368
## 17 0.0024 0.4321351 0.2840164 0.3328951 0.008312290 0.03530018 0.005297591
rpart()
, Regression Tree Final Modelrpart2 = rpart(amount ~ ., data=TR2, cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) - TS2$amount)^2)
(r2.ts.rpart2 = 1 - (SSE/SST))
## [1] 0.2174555
lm()
, Linear Modelctrl$repeats = 2
set.seed(2)
cv.lm2 = train(
amount ~ ., data=TR2, method="lm",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid( intercept = seq(0,5,0.5) )
)
plot(cv.lm2)
cv.lm2$results
## intercept RMSE Rsquared MAE RMSESD RsquaredSD
## 1 0.0 0.4410099 0.2778923 0.3360522 0.011482983 0.03697934
## 2 0.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 3 1.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 4 1.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 5 2.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 6 2.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 7 3.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 8 3.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 9 4.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 10 4.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 11 5.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## MAESD
## 1 0.006465768
## 2 0.005825454
## 3 0.005825454
## 4 0.005825454
## 5 0.005825454
## 6 0.005825454
## 7 0.005825454
## 8 0.005825454
## 9 0.005825454
## 10 0.005825454
## 11 0.005825454
lm()
Final Modellm2 = lm(amount ~ ., TR2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm2, TS2) - TS2$amount)^2)
(r2.ts.lm2 = 1 - (SSE/SST))
## [1] 0.2381007
stopCluster(clust)