[R] regression

Datasets for the book "R and Data Mining: Examples and Case Studies"


Notes on KDD Cup 1998 data:
The KDD Cup 1998 data in this ZIP archive is truncated: the training dataset "cup98LRN.txt" contains only the first 10,000 of the 95,412 rows in the original file. Similarly, the validation dataset "cup98VAL.txt" contains only the first 10,000 rows, and "valtargt.txt" provides the actual values of the target variables for those 10,000 rows. Readers who want to work with the complete original datasets can download them from http://www.sigkdd.org/kddcup/index.php?section=1998&method=data.


Contact: Yanchang Zhao
Email: yanchang@rdatamining.com
Website: http://www.rdatamining.com
Group on LinkedIn: http://group.rdatamining.com

 

 

################################################### Regression analysis: free memory
# Start from a clean session: drop every object that ls() reports, then run
# the garbage collector. gc() prints the memory-usage table shown below.
# NOTE(review): rm(list = ls()) also wipes any objects the reader already had
# in the workspace when this script is run at top level - confirm that this
# is acceptable before sourcing it into a working session.
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 244713 13.1     407500 21.8   350000 18.7
## Vcells 357326  2.8     786432  6.0   786432  6.0


################################################### Create the data
# Twelve quarterly CPI observations: four quarters for each of 2008-2010.
year <- rep(2008:2010, each = 4L)
quarter <- rep(1:4, times = 3)
cpi <- c(
    162.2, 164.6, 166.5, 166.0,  # 2008 Q1-Q4
    166.2, 167.0, 168.6, 169.5,  # 2009 Q1-Q4
    171.0, 172.1, 173.3, 174.0   # 2010 Q1-Q4
)

# Plot the series without the default x-axis (xaxt = "n") so that a custom
# axis can be drawn afterwards.
plot(x = cpi, ylab = "CPI", xlab = "", xaxt = "n")

# Draw the x-axis: one perpendicular (las = 3) "<year>Q<quarter>" label per
# observation.
axis(side = 1, at = seq_along(cpi), labels = paste(year, quarter, sep = "Q"),
    las = 3)

## [figure] plot of chunk unnamed-chunk-1


# Correlation coefficient between year and CPI
cor(x = year, y = cpi)
## [1] 0.9096

# Correlation coefficient between quarter and CPI
cor(x = quarter, y = cpi)
## [1] 0.3738

# Regression analysis: linear model of CPI on year and quarter.
fit <- lm(cpi ~ year + quarter)
print(fit)
## 
## Call:
## lm(formula = cpi ~ year + quarter)
## 
## Coefficients:
## (Intercept)         year      quarter  
##    -7644.49         3.89         1.17

# Manual forecast for the four quarters of 2011, built directly from the
# fitted coefficients: intercept + year slope * 2011 + quarter slope * 1:4.
cpi2011 <- coef(fit)[[1]] + coef(fit)[[2]] * 2011 + coef(fit)[[3]] * (1:4)
cpi2011
## [1] 174.4 175.6 176.8 177.9

# Components stored in the fitted model object and its class.
attributes(fit)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"        
## 
## $class
## [1] "lm"

# Estimated coefficients, via the accessor function.
coef(fit)
## (Intercept)        year     quarter 
##   -7644.488       3.888       1.167

# Differences between the observed and the fitted CPI values.
resid(fit)
##        1        2        3        4        5        6        7        8 
## -0.57917  0.65417  1.38750 -0.27917 -0.46667 -0.83333 -0.40000 -0.66667 
##        9       10       11       12 
##  0.44583  0.37917  0.41250 -0.05417

# Full summary: residual quantiles, coefficient table, R-squared, F-test.
summary(fit)
## 
## Call:
## lm(formula = cpi ~ year + quarter)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.833 -0.495 -0.167  0.421  1.387 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7644.488    518.654  -14.74  1.3e-07 ***
## year            3.888      0.258   15.06  1.1e-07 ***
## quarter         1.167      0.189    6.19  0.00016 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.73 on 9 degrees of freedom
## Multiple R-squared:  0.967,  Adjusted R-squared:  0.96 
## F-statistic:  133 on 2 and 9 DF,  p-value: 2.11e-07

# Show the plots produced for the fitted model one at a time (the original
# output contained four separate figures).
plot(fit)

## [figures] plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1 plot of chunk unnamed-chunk-1


# Adjust the layout so that all plots fit on one page: a 2 x 2 grid whose
# cells are filled column by column.
layout(matrix(1:4, nrow = 2, ncol = 2))  # 4 graphs per page
plot(fit)

## [figure] plot of chunk unnamed-chunk-1


layout(matrix(1))  # back to the previous single-figure layout (one plot per page)


################################################### Using the scatterplot3d library
library(scatterplot3d)
# 3D scatter plot of CPI against year and quarter, with the fitted regression
# plane added on top.
# TRUE is spelled out instead of T: T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
# NOTE(review): per the scatterplot3d documentation, type = "h" should draw a
# vertical line for each point and lab should set the approximate number of
# tick marks on the x and y axes - confirm against the installed version.
s3d <- scatterplot3d(year, quarter, cpi, highlight.3d = TRUE, type = "h",
    lab = c(2, 3))
s3d$plane3d(fit)

## [figure] plot of chunk unnamed-chunk-1


# Show the predicted values in a different style from the observed ones.
# Predict the four quarters of 2011 from the fitted model.
data2011 <- data.frame(year = 2011, quarter = 1:4)
cpi2011 <- predict(fit, newdata = data2011)

# Point symbol / colour code: 1 for observed values, 2 for predictions.
# The counts come from the data itself rather than hard-coded 12 and 4, so
# the styling stays aligned if either series changes length.
style <- rep(c(1, 2), times = c(length(cpi), length(cpi2011)))
cpi_all <- c(cpi, cpi2011)
plot(cpi_all, xaxt = "n", ylab = "CPI", xlab = "", pch = style, col = style)

# Axis labels "<year>Q<quarter>" are built for observed and predicted periods
# alike, instead of typing the 2011 labels out by hand.
axis(1, at = seq_along(cpi_all), las = 3,
    labels = paste(c(year, data2011$year), c(quarter, data2011$quarter),
        sep = "Q"))

## [figure] plot of chunk unnamed-chunk-1



################################################### bodyfat data example
# Load the bodyfat data set from the mboost package.
# NOTE(review): newer mboost releases appear to no longer ship this data set
# (it seems to have moved to the TH.data package) - confirm against the
# installed package versions if this call fails.
data("bodyfat", package = "mboost")
# Model DEXfat as a function of age and four body measurements.
myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth
# Generalized linear model: Gaussian family with a log link.
bodyfat.glm <- glm(myFormula, family = gaussian("log"), data = bodyfat)
# Print the fit summary; the recorded output follows.
summary(bodyfat.glm)
## 
## Call:
## glm(formula = myFormula, family = gaussian("log"), data = bodyfat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.569   -3.006    0.127    2.831   10.097  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.73429    0.30895    2.38   0.0204 *  
## age           0.00213    0.00145    1.47   0.1456    
## waistcirc     0.01049    0.00248    4.23  7.4e-05 ***
## hipcirc       0.00970    0.00323    3.00   0.0038 ** 
## elbowbreadth  0.00235    0.04569    0.05   0.9590    
## kneebreadth   0.06319    0.02819    2.24   0.0284 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 20.31)
## 
##     Null deviance: 8536.0  on 70  degrees of freedom
## Residual deviance: 1320.4  on 65  degrees of freedom
## AIC: 423
## 
## Number of Fisher Scoring iterations: 5
# Predictions on the scale of the response variable (type = "response")
# rather than on the scale of the linear predictor.
pred <- predict(object = bodyfat.glm, type = "response")

# Observed against predicted values, with the line y = x (intercept 0,
# slope 1) as a reference for perfect agreement.
plot(x = bodyfat$DEXfat, y = pred,
    xlab = "Observed Values", ylab = "Predicted Values")
abline(0, 1)

## [figure] plot of chunk unnamed-chunk-1

 

◀ PREV 123456789···14 NEXT ▶