KDD2009vtreat
================
John Mount

Przykład wykorzystania pakietu `vtreat` na zestawie danych KDD2009.

``` r
date()
```

    ## [1] "Tue Oct 20 14:13:20 2020"

``` r
# wczytuje wymagane biblioteki
library('vtreat')
```

    ## Loading required package: wrapr

``` r
library('WVPlots') 
library('sigr')
library('parallel')
library('xgboost')
```

    ## Warning: package 'xgboost' was built under R version 4.0.3

``` r
# Listingi/r08_Zaawansowane_przygotowywanie_danych/00327_przyklad_8.1_z_punktu_8.2.1.R 
# przykład 8.1 z punktu 8.2.1 
# (przykład 8.1 z punktu 8.2.1)  : Zaawansowane przygotowywanie danych : Konkurs KDD i zestaw danych KDD Cup 2009 : Pierwsze kroki z danymi KDD Cup 2009 
# Tytuł: Przygotowanie danych KDD do analizy 

d <- read.table('orange_small_train.data.gz',   # Uwaga 1. 
   header = TRUE,
   sep = '\t',
   na.strings = c('NA', ''))    # Uwaga 2. 
                                                
churn <- read.table('orange_small_train_churn.labels.txt',
   header = FALSE, sep = '\t')  # Uwaga 3. 
d$churn <- churn$V1     # Uwaga 4. 

set.seed(729375)    # Uwaga 5. 
rgroup <- base::sample(c('train', 'test'),  # Uwaga 6.
   nrow(d), 
   prob = c(0.9, 0.1),
   replace = TRUE)
dTrain <- d[rgroup=='train', , drop = FALSE]
dTest <- d[rgroup == 'test', , drop = FALSE]
                                                
outcome <- 'churn' 
vars <- setdiff(colnames(dTrain), outcome)

                                                
rm(list=c('d', 'churn', 'rgroup'))  # Uwaga 7.

# Uwaga 1. 
#   Wczytuje plik zawierający zmienne niezależne. Wszystkie dane są dostępne na stronie https://github.com/WinVector/PDSwR2/tree/master/KDD2009.  

# Uwaga 2. 
#   Traktuje wszystkie wartości NA i puste łańcuchy znaków jako brakujące dane. 

# Uwaga 3. 
#   Odczytuje znane wyniki odpływu. 

# Uwaga 4. 
#   Dodaje odpływ jako nową kolumnę. 

# Uwaga 5. 
#   Dzięki wyznaczeniu ziarna generatora liczb pseudolosowych sprawiamy, że nasz model będzie odtwarzalny: inna osoba realizująca kod będzie otrzymywała dokładnie te same wyniki. 

# Uwaga 6. 
#   Dzieli dane na zbiory uczący, kalibracyjny i testowy. Jawnie wyznacza funkcję base::sample() po to, aby uniknąć konfliktu z funkcją dplyr::sample() w przypadku, gdyby pakiet dplyr był wczytany. 

# Uwaga 7. 
#   Usuwa niepotrzebne obiekty z przestrzeni roboczej. 

set.seed(239525)

ncore <- parallel::detectCores()
(cl = parallel::makeCluster(ncore))
```

    ## socket cluster with 6 nodes on host 'localhost'

``` r
yName <- "churn"
yTarget <- 1

date()
```

    ## [1] "Tue Oct 20 14:13:25 2020"

``` r
date()
```

    ## [1] "Tue Oct 20 14:13:25 2020"

``` r
var_values <- vtreat::value_variables_C(dTrain,
    vars,yName,yTarget,
    smFactor=2.0, 
    parallelCluster=cl
    )


summary(var_values$sig < 1/nrow(var_values))
```

    ##    Mode   FALSE    TRUE 
    ## logical      59     153

``` r
length(vars)
```

    ## [1] 230

``` r
vars <- var_values$var[var_values$sig < 1/nrow(var_values)]
length(vars)
```

    ## [1] 153

``` r
date()
```

    ## [1] "Tue Oct 20 14:15:04 2020"

``` r
date()
```

    ## [1] "Tue Oct 20 14:15:04 2020"

``` r
# Przetwarza pozostałe modele (przy kodowaniu/uczeniu rozdzielonym we właściwy sposób).
#
# Uzyskuje wartość AUC rzędu 0,74.

customCoders = list('c.PiecewiseV.num' = vtreat::solve_piecewise,
                    'n.PiecewiseV.num' = vtreat::solve_piecewise,
                    'c.knearest.num' = vtreat::square_window,
                    'n.knearest.num' = vtreat::square_window)
cfe = mkCrossFrameCExperiment(dTrain,
                              vars,yName,yTarget,
                              customCoders=customCoders,
                              smFactor=2.0, 
                              parallelCluster=cl)
```

    ## [1] "vtreat 1.6.1 start initial treatment design Tue Oct 20 14:15:04 2020"
    ## [1] " start cross frame work Tue Oct 20 14:16:54 2020"
    ## [1] " vtreat::mkCrossFrameCExperiment done Tue Oct 20 14:17:46 2020"

``` r
treatmentsC = cfe$treatments
scoreFrame = treatmentsC$scoreFrame
table(scoreFrame$code)
```

    ## 
    ##       catB       catP      clean      isBAD   knearest        lev PiecewiseV 
    ##         28         28        122        120          2        121        118

``` r
selvars <- scoreFrame$varName
treatedTrainM <- cfe$crossFrame[,c(yName,selvars),drop=FALSE]
treatedTrainM[[yName]] = treatedTrainM[[yName]]==yTarget

treatedTest = prepare(treatmentsC,
                      dTest,
                      pruneSig=NULL, 
                      varRestriction = selvars,
                      parallelCluster=cl)
treatedTest[[yName]] = treatedTest[[yName]]==yTarget

# przygotowuje ramki wykresów
treatedTrainP = treatedTrainM[, yName, drop=FALSE]
treatedTestP = treatedTest[, yName, drop=FALSE]
date()
```

    ## [1] "Tue Oct 20 14:17:47 2020"

``` r
date()
```

    ## [1] "Tue Oct 20 14:17:47 2020"

``` r
mname = 'xgbPred'
print(paste(mname,length(selvars)))
```

    ## [1] "xgbPred 539"

``` r
params <- list(max_depth = 5, 
              objective = "binary:logistic",
              nthread = ncore)
model <- xgb.cv(data = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
                label = treatedTrainM[[yName]],
                nrounds = 400,
                params = params,
                nfold = 5,
                early_stopping_rounds = 10,
                eval_metric = "logloss")
```

    ## [1]  train-logloss:0.503126+0.000588 test-logloss:0.504113+0.001188 
    ## Multiple eval metrics are present. Will use test_logloss for early stopping.
    ## Will train until test_logloss hasn't improved in 10 rounds.
    ## 
    ## [2]  train-logloss:0.400365+0.001088 test-logloss:0.402315+0.002020 
    ## [3]  train-logloss:0.338357+0.001314 test-logloss:0.341262+0.003089 
    ## [4]  train-logloss:0.299300+0.001547 test-logloss:0.303083+0.003967 
    ## [5]  train-logloss:0.274160+0.001651 test-logloss:0.279142+0.004478 
    ## [6]  train-logloss:0.257497+0.001689 test-logloss:0.263882+0.004994 
    ## [7]  train-logloss:0.246378+0.001771 test-logloss:0.253806+0.005573 
    ## [8]  train-logloss:0.238690+0.001723 test-logloss:0.247646+0.006122 
    ## [9]  train-logloss:0.233024+0.001892 test-logloss:0.243349+0.006373 
    ## [10] train-logloss:0.228929+0.001845 test-logloss:0.240968+0.006782 
    ## [11] train-logloss:0.225695+0.001971 test-logloss:0.239253+0.007092 
    ## [12] train-logloss:0.222703+0.001933 test-logloss:0.238106+0.007320 
    ## [13] train-logloss:0.220350+0.001977 test-logloss:0.237529+0.007695 
    ## [14] train-logloss:0.218418+0.001876 test-logloss:0.237213+0.007898 
    ## [15] train-logloss:0.216387+0.001828 test-logloss:0.236944+0.007811 
    ## [16] train-logloss:0.214801+0.002029 test-logloss:0.236712+0.007931 
    ## [17] train-logloss:0.212810+0.001932 test-logloss:0.236665+0.007925 
    ## [18] train-logloss:0.211246+0.001823 test-logloss:0.236770+0.007912 
    ## [19] train-logloss:0.209646+0.001714 test-logloss:0.236836+0.008023 
    ## [20] train-logloss:0.208235+0.002049 test-logloss:0.236916+0.008012 
    ## [21] train-logloss:0.206745+0.002042 test-logloss:0.237074+0.008065 
    ## [22] train-logloss:0.205579+0.001893 test-logloss:0.237088+0.008032 
    ## [23] train-logloss:0.204166+0.001764 test-logloss:0.237182+0.008018 
    ## [24] train-logloss:0.203213+0.001937 test-logloss:0.237281+0.008162 
    ## [25] train-logloss:0.201942+0.001774 test-logloss:0.237209+0.008211 
    ## [26] train-logloss:0.200722+0.002147 test-logloss:0.237525+0.008299 
    ## [27] train-logloss:0.199823+0.001900 test-logloss:0.237645+0.008082 
    ## Stopping. Best iteration:
    ## [17] train-logloss:0.212810+0.001932 test-logloss:0.236665+0.007925

``` r
nrounds <- model$best_iteration
print(paste("nrounds", nrounds))
```

    ## [1] "nrounds 17"

``` r
model <- xgboost(data = as.matrix(treatedTrainM[, selvars, drop = FALSE]),
                 label = treatedTrainM[[yName]],
                 nrounds = nrounds,
                 params = params)
```

    ## [1]  train-error:0.071400 
    ## [2]  train-error:0.071555 
    ## [3]  train-error:0.071844 
    ## [4]  train-error:0.071778 
    ## [5]  train-error:0.071800 
    ## [6]  train-error:0.071578 
    ## [7]  train-error:0.071711 
    ## [8]  train-error:0.071844 
    ## [9]  train-error:0.071800 
    ## [10] train-error:0.071911 
    ## [11] train-error:0.071667 
    ## [12] train-error:0.071134 
    ## [13] train-error:0.071000 
    ## [14] train-error:0.070623 
    ## [15] train-error:0.070556 
    ## [16] train-error:0.070156 
    ## [17] train-error:0.069956

``` r
treatedTrainP[[mname]] = predict(
  model, 
  newdata = as.matrix(treatedTrainM[, selvars, drop = FALSE]), 
  n.trees = nTrees,
  type = 'response')
treatedTestP[[mname]] = predict(
  model,
  newdata = as.matrix(treatedTest[, selvars, drop = FALSE]), 
  n.trees = nTrees,
  type = "response")
date()
```

    ## [1] "Tue Oct 20 14:18:07 2020"

``` r
calcAUC(treatedTestP[[mname]], treatedTestP[[yName]]==yTarget)
```

    ## [1] 0.7387616

``` r
permTestAUC(treatedTestP, mname, yName, yTarget = yTarget)
```

    ## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7388, s.d.=0.01439, p<1e-05)."

``` r
wrapChiSqTest(treatedTestP, mname, yName, yTarget = yTarget)
```

    ## [1] "Chi-Square Test summary: pseudo-R2=0.1124 (X2(1,N=4972)=286.4, p<1e-05)."

``` r
date()
```

    ## [1] "Tue Oct 20 14:18:07 2020"

``` r
t1 = paste(mname,'trainingM data')
print(DoubleDensityPlot(treatedTrainP, mname, yName, 
                        title=t1))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-1.png)<!-- -->

``` r
print(ROCPlot(treatedTrainP, mname, yName, yTarget,
              title=t1))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-2.png)<!-- -->

``` r
print(WVPlots::PRPlot(treatedTrainP, mname, yName, yTarget,
              title=t1))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-3.png)<!-- -->

``` r
t2 = paste(mname,'test data')
print(DoubleDensityPlot(treatedTestP, mname, yName, 
                        title=t2))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-4.png)<!-- -->

``` r
print(ROCPlot(treatedTestP, mname, yName, yTarget,
              title=t2))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-5.png)<!-- -->

``` r
print(WVPlots::PRPlot(treatedTestP, mname, yName, yTarget,
              title=t2))
```

![](KDD2009vtreat_files/figure-gfm/kddplot-6.png)<!-- -->

``` r
print(date())
```

    ## [1] "Tue Oct 20 14:18:12 2020"

``` r
print("*****************************")
```

    ## [1] "*****************************"

``` r
date()
```

    ## [1] "Tue Oct 20 14:18:12 2020"

``` r
if(!is.null(cl)) {
    parallel::stopCluster(cl)
    cl = NULL
}
```
