library(emo)

👩‍🏫

想看預告片可以點我

前言: 如何應用有限資料創造大數據的力量 以小雜貨店為例,帶入情境介紹預測性模型的基本觀念,以及如何應用。並以雜貨店老闆的提問,來帶入我們要預測的目標為何。 之後我們降利用此資料來預測顧客下一期行為 Y: 預測顧客會不會來買以及會買多少錢 購買金額 (amount) 基本資料檢視、資料視覺化,可以幫助我們快速了解這筆資料。 購買與否 (Buy) X: 如何重新彙整顧客資料以及產品銷售資訊

Chapter 1: 資料彙整流程


1. 交易項目計錄:Z

rm(list=ls(all=T))
# Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
1.1 The do.call-rbind-lapply Combo
Z = do.call(rbind, lapply(
    dir('data/TaFengDataSet','.*csv$',full.names=T),
    read.csv, header=F) 
  ) %>% 
  setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z)
## [1] 817741
Data Convresion
Z$date = as.Date(as.character(Z$date))
summary(Z)
##       date                 cust               age              area       
##  Min.   :2000-11-01   Min.   :    1069   D      :181213   E      :312501  
##  1st Qu.:2000-11-28   1st Qu.:  969222   E      :151023   F      :245213  
##  Median :2001-01-01   Median : 1587722   C      :140805   G      : 72092  
##  Mean   :2000-12-30   Mean   : 1406620   F      : 99719   C      : 71640  
##  3rd Qu.:2001-01-30   3rd Qu.: 1854930   B      : 66432   H      : 40666  
##  Max.   :2001-02-28   Max.   :20002000   G      : 53719   D      : 38674  
##                                          (Other):124830   (Other): 36955  
##       cat              prod                qty          
##  Min.   :100101   Min.   :2.001e+07   Min.   :   1.000  
##  1st Qu.:110106   1st Qu.:4.710e+12   1st Qu.:   1.000  
##  Median :130106   Median :4.710e+12   Median :   1.000  
##  Mean   :284951   Mean   :4.462e+12   Mean   :   1.382  
##  3rd Qu.:520314   3rd Qu.:4.713e+12   3rd Qu.:   1.000  
##  Max.   :780510   Max.   :9.790e+12   Max.   :1200.000  
##                                                         
##       cost              price         
##  Min.   :     0.0   Min.   :     1.0  
##  1st Qu.:    35.0   1st Qu.:    42.0  
##  Median :    62.0   Median :    76.0  
##  Mean   :   112.1   Mean   :   131.9  
##  3rd Qu.:   112.0   3rd Qu.:   132.0  
##  Max.   :432000.0   Max.   :444000.0  
## 
  • 將date變成文字型態
  • 利用summary查看原始資料之敘述統計量
Quantile of Variables
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
##        qty   cost   price
## 99%      6  858.0 1014.00
## 99.9%   14 2722.0 3135.82
## 99.95%  24 3799.3 3999.00
Get rid of Outliers
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000) 
nrow(Z)  
## [1] 817182
  • 就算有一大筆資料,只要有一筆離群值,就可能造成估計上的偏差
  • 找出並過濾掉離群值
Assign Transaction ID
Z$tid = group_indices(Z, date, cust)
No. Customers, Categories, Product Items & Transactions
sapply(Z[,c("cust","cat","prod","tid")], n_distinct)
##   cust    cat   prod    tid 
##  32256   2007  23789 119422
  • 總共有32256位不同的顧客、2007種不同產品…等
Summary of Item Records
summary(Z)
##       date                 cust               age              area       
##  Min.   :2000-11-01   Min.   :    1069   D      :181089   E      :312358  
##  1st Qu.:2000-11-28   1st Qu.:  968775   E      :150947   F      :245079  
##  Median :2001-01-01   Median : 1587685   C      :140721   G      : 71905  
##  Mean   :2000-12-30   Mean   : 1406500   F      : 99641   C      : 71600  
##  3rd Qu.:2001-01-30   3rd Qu.: 1854701   B      : 66353   H      : 40647  
##  Max.   :2001-02-28   Max.   :20002000   G      : 53689   D      : 38654  
##                                          (Other):124742   (Other): 36939  
##       cat              prod                qty              cost       
##  Min.   :100101   Min.   :2.001e+07   Min.   : 1.000   Min.   :   0.0  
##  1st Qu.:110106   1st Qu.:4.710e+12   1st Qu.: 1.000   1st Qu.:  35.0  
##  Median :130106   Median :4.710e+12   Median : 1.000   Median :  62.0  
##  Mean   :284784   Mean   :4.462e+12   Mean   : 1.358   Mean   : 106.2  
##  3rd Qu.:520311   3rd Qu.:4.713e+12   3rd Qu.: 1.000   3rd Qu.: 112.0  
##  Max.   :780510   Max.   :9.790e+12   Max.   :24.000   Max.   :3798.0  
##                                                                        
##      price             tid        
##  Min.   :   1.0   Min.   :     1  
##  1st Qu.:  42.0   1st Qu.: 28783  
##  Median :  76.0   Median : 59391  
##  Mean   : 125.5   Mean   : 58845  
##  3rd Qu.: 132.0   3rd Qu.: 87391  
##  Max.   :4000.0   Max.   :119422  
## 
  • 再看一次去掉離群值後的敘述統計


2. 交易計錄:X

交易資料彙整
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 119422
  • 將交易資料依據交易ID排序??
交易摘要
summary(X)    
##       tid              date                 cust               age       
##  Min.   :     1   Min.   :2000-11-01   Min.   :    1069   D      :23775  
##  1st Qu.: 29856   1st Qu.:2000-11-29   1st Qu.:  927093   C      :19661  
##  Median : 59712   Median :2001-01-01   Median : 1615661   E      :19596  
##  Mean   : 59712   Mean   :2000-12-31   Mean   : 1402548   F      :13992  
##  3rd Qu.: 89567   3rd Qu.:2001-02-02   3rd Qu.: 1894493   B      :10515  
##  Max.   :119422   Max.   :2001-02-28   Max.   :20002000   G      : 8493  
##                                                           (Other):23390  
##       area           items             pieces            total      
##  E      :50532   Min.   :  1.000   Min.   :  1.000   Min.   :    5  
##  F      :33826   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  227  
##  G      : 9498   Median :  5.000   Median :  6.000   Median :  510  
##  C      : 8527   Mean   :  6.843   Mean   :  9.294   Mean   :  859  
##  H      : 7502   3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1082  
##  D      : 5108   Max.   :112.000   Max.   :339.000   Max.   :30171  
##  (Other): 4429                                                      
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   21.0  
##  Median :   68.0  
##  Mean   :  132.3  
##  3rd Qu.:  169.0  
##  Max.   : 8069.0  
## 
  • X與Z之summary結果為何不同?
Check Quantile & Remove Outliers
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##        items   pieces     total    gross
## 99.9%     54  81.0000  9009.579 1824.737
## 99.95%    62  94.2895 10611.579 2179.817
## 99.99%    82 133.0000 16044.401 3226.548
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
  • 去除離群值
Weekly Transactions
par(cex=0.8)
hist(X$date, "weeks", freq=T, border='lightgray', col='darkcyan', 
     las=2, main="No. Transaction per Week")
## Warning in axis(2, ...): "border" 不是一個繪圖參數
## Warning in axis(side, at = z, labels = labels, ...): "border" 不是一個繪圖
## 參數

  • 由直方圖看每周交易筆數差異
  • 可看見聖誕節當周交易量特別低,同學可以想想其背後商業意涵唷


3. 顧客資料:A

顧客資料彙整
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
  r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
  s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
  f = n(),            # frquency
  m = mean(total),    # monetary
  rev = sum(total),   # total revenue contribution
  raw = sum(gross),   # total gross profit contribution
  age = first(age),   # age group
  area = first(area), # area code
  ) %>% data.frame    # 33241
  • 由顧客資料依照rfm分析製作新變數,rfm分析介紹請看:
  • rfm分析: 從交易記錄到顧客產品矩陣
  • r: 距今最近一次購買
  • s: 顧客第一次購買
  • f: 顧客購買頻率
  • m: 平均交易金額
顧客摘要
summary(A) 
##       cust                r                s                f         
##  Min.   :    1069   Min.   :  1.00   Min.   :  1.00   Min.   : 1.000  
##  1st Qu.: 1088519   1st Qu.:  9.00   1st Qu.: 56.00   1st Qu.: 1.000  
##  Median : 1663402   Median : 26.00   Median : 92.00   Median : 2.000  
##  Mean   : 1473585   Mean   : 37.45   Mean   : 80.78   Mean   : 3.701  
##  3rd Qu.: 1958089   3rd Qu.: 60.00   3rd Qu.:110.00   3rd Qu.: 4.000  
##  Max.   :20002000   Max.   :120.00   Max.   :120.00   Max.   :85.000  
##                                                                       
##        m                rev              raw               age      
##  Min.   :    8.0   Min.   :     8   Min.   : -784.0   D      :6580  
##  1st Qu.:  365.0   1st Qu.:   707   1st Qu.:   75.0   C      :5915  
##  Median :  705.7   Median :  1750   Median :  241.0   E      :5080  
##  Mean   :  993.1   Mean   :  3152   Mean   :  484.6   F      :3719  
##  3rd Qu.: 1291.0   3rd Qu.:  3968   3rd Qu.:  612.0   B      :3193  
##  Max.   :12636.0   Max.   :127686   Max.   :20273.0   G      :2183  
##                                                       (Other):5571  
##       area      
##  E      :10800  
##  F      : 8539  
##  G      : 3695  
##  C      : 3683  
##  D      : 2166  
##  H      : 1453  
##  (Other): 1905
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m')) 
  hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)

  • 藉由直方圖,將rfm等變數視覺化,看圖說故事
Dupliate & Save
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")


4. Objective of the Contest

range(X$date)
## [1] "2000-11-01" "2001-02-28"

使用一月底(含2001-01-31)以前的資料,建立模型來預測每一位顧客:

  1. 她在2月份(2001-02-01 ~ 2001-02-28)會不會來買?
  2. 如果她來買的話,會買多少錢?


Chapter 2: 資料準備流程

  • 本章節中,我們要將資料分成預測變數與目標變數
  • 由上圖可知,我們將2月作為分界點
  • 2月份之前之購買行為做為預測變數,將這些預測變數用來預測2月份之購買行為

Preparing The Predictors (X)

rm(list=ls(all=TRUE))
load("data/tf0.rdata")
The Demarcation Date

Remove data after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
  • 僅留下2月份以前資料作為預測變數
Aggregate for the Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
  • tid: 同一天、同一位顧客的交易會有相同tid
  • X以日期排序
  • 前4項以first函數編排是本身就固定的變數,以first擷取出來而已
  • items: 此交易紀錄中總共購買幾種商品,兩個高麗菜一瓶牛奶記為2
  • pieces:此交易紀錄總共購買幾件商品,兩個高麗菜一瓶牛奶記為3
summary(X)
##       tid             date                 cust               age       
##  Min.   :    1   Min.   :2000-11-01   Min.   :    1069   D      :17541  
##  1st Qu.:22098   1st Qu.:2000-11-23   1st Qu.:  923910   C      :14624  
##  Median :44194   Median :2000-12-12   Median : 1607000   E      :14578  
##  Mean   :44194   Mean   :2000-12-15   Mean   : 1395768   F      :10354  
##  3rd Qu.:66291   3rd Qu.:2001-01-12   3rd Qu.: 1888874   B      : 7817  
##  Max.   :88387   Max.   :2001-01-31   Max.   :20002000   G      : 6308  
##                                                          (Other):17165  
##       area           items             pieces            total        
##  E      :37496   Min.   :  1.000   Min.   :  1.000   Min.   :    5.0  
##  F      :25412   1st Qu.:  2.000   1st Qu.:  3.000   1st Qu.:  230.0  
##  G      : 6787   Median :  5.000   Median :  6.000   Median :  522.0  
##  C      : 6329   Mean   :  6.994   Mean   :  9.453   Mean   :  888.7  
##  H      : 5524   3rd Qu.:  9.000   3rd Qu.: 12.000   3rd Qu.: 1120.0  
##  D      : 3655   Max.   :112.000   Max.   :339.000   Max.   :30171.0  
##  (Other): 3184                                                        
##      gross        
##  Min.   :-1645.0  
##  1st Qu.:   23.0  
##  Median :   72.0  
##  Mean   :  138.3  
##  3rd Qu.:  174.0  
##  Max.   : 8069.0  
## 
Check Quantile and Remove Outlier
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
##          items   pieces     total    gross
## 99.9%  56.0000  84.0000  9378.684 1883.228
## 99.95% 64.0000  98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records

A: 整理並在最後用來跑預測性模型之資料

d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
  r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
  s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
  f = n(),            # frquency
  m = mean(total),    # monetary
  rev = sum(total),   # total revenue contribution
  raw = sum(gross),   # total gross profit contribution
  age = first(age),   # age group
  area = first(area), # area code
  ) %>% data.frame    # 28584



Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16899
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
  • 將顧客2月之購買行為,也就是我們要預測的Y合併進入A資料框
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
Summary of the Dataset
summary(A)
##       cust                r               s               f         
##  Min.   :    1069   Min.   : 1.00   Min.   : 1.00   Min.   : 1.000  
##  1st Qu.: 1060898   1st Qu.:11.00   1st Qu.:47.00   1st Qu.: 1.000  
##  Median : 1654100   Median :21.00   Median :68.00   Median : 2.000  
##  Mean   : 1461070   Mean   :32.12   Mean   :61.27   Mean   : 3.089  
##  3rd Qu.: 1945003   3rd Qu.:53.00   3rd Qu.:83.00   3rd Qu.: 4.000  
##  Max.   :20002000   Max.   :92.00   Max.   :92.00   Max.   :60.000  
##                                                                     
##        m                rev             raw               age      
##  Min.   :    8.0   Min.   :    8   Min.   : -742.0   D      :5832  
##  1st Qu.:  359.4   1st Qu.:  638   1st Qu.:   70.0   C      :5238  
##  Median :  709.5   Median : 1566   Median :  218.0   E      :4514  
##  Mean   : 1012.4   Mean   : 2711   Mean   :  420.8   F      :3308  
##  3rd Qu.: 1315.0   3rd Qu.: 3426   3rd Qu.:  535.0   B      :2802  
##  Max.   :10634.0   Max.   :99597   Max.   :15565.0   G      :1940  
##                                                      (Other):4950  
##       area          amount         buy         
##  E      :9907   Min.   :    8   Mode :logical  
##  F      :7798   1st Qu.:  454   FALSE:15342    
##  C      :3169   Median :  993   TRUE :13242    
##  G      :3052   Mean   : 1499                  
##  D      :1778   3rd Qu.: 1955                  
##  H      :1295   Max.   :28089                  
##  (Other):1585   NA's   :15342
The Association of Categorial Predictors
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')

tapply(A$buy, A$area, mean) %>% barplot
abline(h = mean(A$buy), col='red')

Contest Dataset
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008  8576
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
set.seed(2018); spl2 = sample.split(A2$amount, SplitRatio=0.7)
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242  9601  3641
save(Z, X, A, spl, spl2, file="data/tf2.rdata")
  • 將X與Z資料框結合進A資料框中
  • 並將A資料框以7:3之比例分為訓練與測試資料
  • set.seed為避免大家的訓練與測試資料都不同,只要seed的數字一樣,就會有一樣的切割方式
  • 將預測會不會來買以及買多少之資料框分開為A和A2,其中將A2中m,rev,amount等級距較大之欄位取log避免差異過大

Chapter3: 迴歸模型

Fig-1: The First Model

Fig-1: The First Model


Loading & Preparing Data

Loading Data
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
Spliting for Classification
TR = subset(A, spl)
TS = subset(A, !spl)


  • 將顧客資料分成訓練資料及測試資料
  • 利用訓練資料來製作模型,並且預測測試資料看此模型準不準

Classification Model

glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial()) 
summary(glm1)
## 
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9, 
##     11)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7931  -0.8733  -0.6991   1.0384   1.8735  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.259e+00  1.261e-01  -9.985  < 2e-16 ***
## r           -1.227e-02  8.951e-04 -13.708  < 2e-16 ***
## s            9.566e-03  9.101e-04  10.511  < 2e-16 ***
## f            2.905e-01  1.593e-02  18.233  < 2e-16 ***
## m           -3.028e-05  2.777e-05  -1.090  0.27559    
## rev          4.086e-05  1.940e-05   2.106  0.03521 *  
## raw         -2.306e-04  8.561e-05  -2.693  0.00708 ** 
## ageB        -4.194e-02  8.666e-02  -0.484  0.62838    
## ageC         1.772e-02  7.992e-02   0.222  0.82456    
## ageD         7.705e-02  7.921e-02   0.973  0.33074    
## ageE         8.699e-02  8.132e-02   1.070  0.28476    
## ageF         1.928e-02  8.457e-02   0.228  0.81962    
## ageG         1.745e-02  9.323e-02   0.187  0.85155    
## ageH         1.752e-01  1.094e-01   1.602  0.10926    
## ageI         6.177e-02  1.175e-01   0.526  0.59904    
## ageJ         2.652e-01  1.047e-01   2.533  0.01131 *  
## ageK        -1.419e-01  1.498e-01  -0.947  0.34347    
## areaB       -4.105e-02  1.321e-01  -0.311  0.75603    
## areaC       -2.075e-01  1.045e-01  -1.986  0.04703 *  
## areaD        3.801e-02  1.111e-01   0.342  0.73214    
## areaE        2.599e-01  9.682e-02   2.684  0.00727 ** 
## areaF        1.817e-01  9.753e-02   1.863  0.06243 .  
## areaG       -4.677e-02  1.045e-01  -0.448  0.65435    
## areaH       -1.695e-01  1.232e-01  -1.375  0.16912    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27629  on 20007  degrees of freedom
## Residual deviance: 23295  on 19984  degrees of freedom
## AIC: 23343
## 
## Number of Fisher Scoring iterations: 5
pred =  predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts          # 0.69998
## [1] 0.6999767
colAUC(pred, TS$buy)                                   # 0.7556
##                     [,1]
## FALSE vs. TRUE 0.7556038
  • 檢視此模型,我們可以查看各個X對於Y的顯著程度
  • AIC 越小越好
  • 檢視acc , AUC


Regression Model

A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)
## 
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.04733 -0.23360  0.04677  0.28484  1.67880 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.164e+00  4.891e-02  23.797  < 2e-16 ***
## r            4.027e-04  3.094e-04   1.301 0.193127    
## s           -3.277e-05  3.132e-04  -0.105 0.916668    
## f            2.630e-02  1.704e-03  15.441  < 2e-16 ***
## m            5.136e-01  3.669e-02  14.000  < 2e-16 ***
## rev          5.205e-02  3.552e-02   1.465 0.142934    
## ageB         2.510e-02  2.480e-02   1.012 0.311352    
## ageC         8.571e-02  2.288e-02   3.746 0.000181 ***
## ageD         1.015e-01  2.256e-02   4.498 6.93e-06 ***
## ageE         9.441e-02  2.303e-02   4.100 4.16e-05 ***
## ageF         6.508e-02  2.399e-02   2.713 0.006684 ** 
## ageG         4.403e-02  2.626e-02   1.677 0.093550 .  
## ageH         4.840e-02  3.089e-02   1.567 0.117186    
## ageI         1.610e-02  3.252e-02   0.495 0.620547    
## ageJ        -3.803e-02  2.819e-02  -1.349 0.177398    
## ageK         7.642e-02  3.955e-02   1.932 0.053373 .  
## areaB        4.065e-02  4.130e-02   0.984 0.324960    
## areaC       -8.146e-03  3.348e-02  -0.243 0.807754    
## areaD       -2.134e-02  3.537e-02  -0.603 0.546389    
## areaE       -2.820e-02  3.056e-02  -0.923 0.356161    
## areaF       -6.067e-03  3.078e-02  -0.197 0.843755    
## areaG       -1.427e-03  3.318e-02  -0.043 0.965695    
## areaH       -2.296e-02  3.761e-02  -0.610 0.541547    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4282 on 9578 degrees of freedom
## Multiple R-squared:  0.2972, Adjusted R-squared:  0.2956 
## F-statistic: 184.1 on 22 and 9578 DF,  p-value: < 2.2e-16
  • 檢視此預測模型
  • 斜率的+/-表示正/負相關,大小表示對應變數影響程度
  • R2表示此模型能夠解釋的變異程度
  • 星號代表顯著的自變數
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) -  TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(r2.tr, r2.ts)
## [1] 0.2972137 0.2337665
  • 即總變異(SST)=已解釋變異(SSR)+ 未解釋變異(SSE)







Chapter4: 決策樹

  • 除了迴歸模型外,決策樹也是個用來預測的好工具
Fig-1: Feature Engineering

Fig-1: Feature Engineering

Fig-2: Feature Engr. & Data Spliting Process

Fig-2: Feature Engr. & Data Spliting Process



Loading & Preparing Data

Sys.setlocale("LC_ALL","C")
## [1] "C"
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
## [1] 20008  9601


Weekday Percentage: W1 ~ W7

X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
## 
##     0     1     2     3     4     5     6 
## 18011 12615 11288  9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
## [1] 28584     7
mx[1:5,]
##       wday
## cust   0 1 2 3 4 5 6
##   1069 1 1 0 0 0 0 0
##   1113 2 1 0 0 0 0 1
##   1359 0 1 0 0 0 0 0
##   1823 0 1 0 1 1 0 0
##   2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
##       wday
## cust           0         1         2         3         4         5
##   1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000
##   1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000
##   1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##   1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000
##   2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000
##       wday
## cust           6
##   1069 0.0000000
##   1113 0.2500000
##   1359 0.0000000
##   1823 0.0000000
##   2189 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>% 
  right_join(A, by='cust')
head(A)
##   cust  W1        W2  W3        W4        W5  W6   W7  r  s f      m   rev
## 1 1069 0.5 0.5000000 0.0 0.0000000 0.0000000 0.0 0.00 11 80 2  579.0  1158
## 2 1113 0.5 0.2500000 0.0 0.0000000 0.0000000 0.0 0.25 26 81 4  557.5  2230
## 3 1359 0.0 1.0000000 0.0 0.0000000 0.0000000 0.0 0.00 59 59 1  364.0   364
## 4 1823 0.0 0.3333333 0.0 0.3333333 0.3333333 0.0 0.00  8 91 3  869.0  2607
## 5 2189 0.0 0.0000000 0.0 0.5000000 0.0000000 0.0 0.50 29 61 2 7028.0 14056
## 6 3667 0.0 0.0000000 0.5 0.0000000 0.0000000 0.5 0.00 37 55 2 2379.5  4759
##    raw age area amount   buy
## 1  129   K    E    786  TRUE
## 2  241   K    F     NA FALSE
## 3  104   K    G     NA FALSE
## 4  498   K    D     NA FALSE
## 5 3299   K    B     NA FALSE
## 6  351   K    G   1570  TRUE


Classification (Buy) Model

TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred =  predict(rpart1, TS)[,2]  # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3730  873
##   TRUE   1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.70662          
## [1] 0.7066231
colAUC(pred, TS$buy)                            # 0.6984
##                     [,1]
## FALSE vs. TRUE 0.6983998
  • 利用CART – Classification & Regression Tree建立預測模型
  • 使用CART 預測 類別
  • 檢視測試資料的準確度
  • 檢視AUC
rpart.plot(rpart1,cex=0.6)

rpart2 = rpart(buy ~ ., TR[,c(2:16,18)], method="class",cp=0.001)
pred =  predict(rpart2, TS)[,2]  # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
##        predict
## actual  FALSE TRUE
##   FALSE  3878  725
##   TRUE   1812 2161
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.70417          
## [1] 0.7041744
colAUC(pred, TS$buy)                            # 0.7169         
##                     [,1]
## FALSE vs. TRUE 0.7168957
rpart.plot(rpart2,cex=0.6)

Regression (Amount) Model

A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
  • 由於是預測數量,將資料取Log10可以避免單位不同所造成的數字差異
rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart3, TS2) -  TS2$amount)^2)
1 - (SSE/SST)
## [1] 0.2172772
  • 即總變異(SST)=已解釋變異(SSR)+ 未解釋變異(SSE)

Chapter5: 交叉驗證與參數調校

  • 何謂交叉驗證(cross validation)?
  • 將交叉驗證之結果應用在我們的預測性模型上

點我看影片

交叉驗證與參數調校流程

Fig-1: Supervised Learning Process

Fig-1: Supervised Learning Process

Fig-2: CV, Model Sel. & Parameter Tuning

Fig-2: CV, Model Sel. & Parameter Tuning



Libraries
Sys.setlocale("LC_ALL","C")
## [1] "C"
library(caret)
library(doParallel)
Loading and Spliting
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR = A[spl, c(2:9,11)]
TS = A[!spl, c(2:9,11)]
Turn on Parallel Processing
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
## [1] 4
  • 開啟平行運算,將電腦的每一個CPU都叫出來工作,以免執行交叉驗證的等待時間過長
  • 可以看到自己的電腦有幾顆CPU

決策樹之交叉驗證

CV Control for Classification
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)
  • 設定交叉驗證要將原本的資料切成幾塊(執行幾次)
CV: rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR, method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
## Time difference of 22.62487 secs
plot(cv.rpart)

cv.rpart$results 
##      cp       ROC      Sens      Spec      ROCSD     SensSD     SpecSD
## 1 2e-04 0.7084066 0.7598480 0.5791341 0.01036383 0.01447771 0.01901226
## 2 3e-04 0.7155533 0.7800992 0.5742276 0.01040769 0.02913860 0.02550630
## 3 4e-04 0.7178635 0.7970962 0.5671579 0.01550732 0.02893758 0.02621973
## 4 5e-04 0.7182132 0.8049660 0.5621939 0.01717161 0.02798866 0.02456524
## 5 6e-04 0.7169737 0.8104121 0.5605775 0.01699893 0.02177565 0.02136082
## 6 7e-04 0.7138179 0.8131120 0.5611177 0.01761331 0.01663849 0.01652607
## 7 8e-04 0.7087418 0.8162777 0.5594460 0.01303339 0.01317749 0.01386506
## 8 9e-04 0.7077703 0.8177675 0.5592302 0.01047743 0.01467971 0.01499422
## 9 1e-03 0.7075750 0.8181864 0.5591763 0.01057787 0.01443186 0.01550779
  • 如同影片所說,複雜度越高不一定越“準”,因此透過參數調校,找出最適合的複雜度和參數組合
Classification Tree, Final Model
rpart1 = rpart(buy ~ ., TR, method="class", cp=0.0005)
predict(rpart1, TS, type="prob")[,2] %>% 
  colAUC(TS$buy)
##                 [,1]
## no vs. yes 0.7401771


CV: glm(), General Linear Model(邏輯式回歸)
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
  buy ~ ., data=TR, method="glm", 
  trControl=ctrl, metric="ROC")
Sys.time() - t0
## Time difference of 5.473367 secs
cv.glm$results
##   parameter       ROC      Sens     Spec      ROCSD     SensSD     SpecSD
## 1      none 0.7487897 0.8057559 0.568562 0.01017789 0.01321789 0.01461063
glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)
##                 [,1]
## no vs. yes 0.7556038
  • 執行完CV後,決策樹之AUC上升至0.7556038


線性迴歸之交叉驗證

Spliting Data
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = A2[ spl2, c(2:10)]
TS2 = A2[!spl2, c(2:10)]
CV Control for Regression
ctrl2 = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final")
  • 同樣將資料切為10等分
CV: rpart() Regression Tree
ctrl$repeats = 2
set.seed(2)
cv.rpart2 = train(
  amount ~ ., data=TR2, method="rpart", 
  trControl=ctrl2, metric="Rsquared",
  tuneGrid = expand.grid(cp = seq(0.0008,0.0024,0.0001) ) )
plot(cv.rpart2)

  • 透過參數調校找出最佳複雜度
cv.rpart2$results
##        cp      RMSE  Rsquared       MAE      RMSESD RsquaredSD       MAESD
## 1  0.0008 0.4305827 0.2905017 0.3303821 0.007464477 0.03219394 0.005085291
## 2  0.0009 0.4293302 0.2939861 0.3295331 0.008253932 0.03375719 0.004812277
## 3  0.0010 0.4292746 0.2940469 0.3295785 0.008023515 0.03436858 0.004455055
## 4  0.0011 0.4291072 0.2944978 0.3293753 0.007711396 0.03432309 0.004471973
## 5  0.0012 0.4292391 0.2941081 0.3296183 0.007848534 0.03501906 0.004701831
## 6  0.0013 0.4294395 0.2933438 0.3300743 0.007774047 0.03384411 0.004634104
## 7  0.0014 0.4298924 0.2919842 0.3303230 0.007834039 0.03382697 0.004660953
## 8  0.0015 0.4301352 0.2911416 0.3305803 0.008048291 0.03440911 0.004937880
## 9  0.0016 0.4305736 0.2897460 0.3309383 0.007600250 0.03343885 0.004733278
## 10 0.0017 0.4301419 0.2908523 0.3308101 0.008022654 0.03374871 0.004775529
## 11 0.0018 0.4304798 0.2896502 0.3310972 0.008164540 0.03515746 0.005038499
## 12 0.0019 0.4304533 0.2896931 0.3310614 0.008157967 0.03511973 0.005058048
## 13 0.0020 0.4309692 0.2879635 0.3315247 0.008736241 0.03667922 0.005503196
## 14 0.0021 0.4313581 0.2867167 0.3317925 0.008921059 0.03738517 0.005477327
## 15 0.0022 0.4316905 0.2856461 0.3320903 0.009050195 0.03710472 0.005613428
## 16 0.0023 0.4317147 0.2855091 0.3325167 0.009162067 0.03742653 0.005620368
## 17 0.0024 0.4321351 0.2840164 0.3328951 0.008312290 0.03530018 0.005297591
rpart(), Regression Tree Final Model
rpart2 = rpart(amount ~ ., data=TR2, cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) -  TS2$amount)^2)
(r2.ts.rpart2 = 1 - (SSE/SST))
## [1] 0.2174555
CV: lm(), Linear Model
ctrl$repeats = 2
set.seed(2)
cv.lm2 = train(
  amount ~ ., data=TR2, method="lm", 
  trControl=ctrl2, metric="Rsquared",
    tuneGrid = expand.grid( intercept = seq(0,5,0.5) ) 
  )
plot(cv.lm2)

cv.lm2$results
##    intercept      RMSE  Rsquared       MAE      RMSESD RsquaredSD
## 1        0.0 0.4410099 0.2778923 0.3360522 0.011482983 0.03697934
## 2        0.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 3        1.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 4        1.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 5        2.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 6        2.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 7        3.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 8        3.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 9        4.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 10       4.5 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
## 11       5.0 0.4270667 0.3006707 0.3276134 0.008599816 0.03440496
##          MAESD
## 1  0.006465768
## 2  0.005825454
## 3  0.005825454
## 4  0.005825454
## 5  0.005825454
## 6  0.005825454
## 7  0.005825454
## 8  0.005825454
## 9  0.005825454
## 10 0.005825454
## 11 0.005825454
lm() Final Model
lm2 = lm(amount ~ ., TR2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm2, TS2) -  TS2$amount)^2)
(r2.ts.lm2 = 1 - (SSE/SST))
## [1] 0.2381007
  • 線性迴歸做完交叉驗證後之R^2為0.2381007
要記得關閉平行運算功能喔!
stopCluster(clust)