> library(MASS) > data(fgl) > names(fgl) [1] "RI" "Na" "Mg" "Al" "Si" "K" "Ca" "Ba" "Fe" "type" > summary(fgl) RI Na Mg Al Min. :-6.8500 Min. :10.73 Min. :0.000 Min. :0.290 1st Qu.:-1.4775 1st Qu.:12.91 1st Qu.:2.115 1st Qu.:1.190 Median :-0.3200 Median :13.30 Median :3.480 Median :1.360 Mean : 0.3654 Mean :13.41 Mean :2.685 Mean :1.445 3rd Qu.: 1.1575 3rd Qu.:13.82 3rd Qu.:3.600 3rd Qu.:1.630 Max. :15.9300 Max. :17.38 Max. :4.490 Max. :3.500 Si K Ca Ba Min. :69.81 Min. :0.0000 Min. : 5.430 Min. :0.0000 1st Qu.:72.28 1st Qu.:0.1225 1st Qu.: 8.240 1st Qu.:0.0000 Median :72.79 Median :0.5550 Median : 8.600 Median :0.0000 Mean :72.65 Mean :0.4971 Mean : 8.957 Mean :0.1750 3rd Qu.:73.09 3rd Qu.:0.6100 3rd Qu.: 9.172 3rd Qu.:0.0000 Max. :75.41 Max. :6.2100 Max. :16.190 Max. :3.1500 Fe type Min. :0.00000 WinF :70 1st Qu.:0.00000 WinNF:76 Median :0.00000 Veh :17 Mean :0.05701 Con :13 3rd Qu.:0.10000 Tabl : 9 Max. :0.51000 Head :29 > help(fgl) > library(rpart) > rout <- rpart(type ~ ., data = fgl, cp = 1e-3) > rout$method [1] "class" > help(rpart) > help(rpart) > print(rout) n= 214 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 214 138 WinNF (0.33 0.36 0.079 0.061 0.042 0.14) 2) Ba< 0.335 185 110 WinNF (0.37 0.41 0.092 0.065 0.049 0.016) 4) Al< 1.42 113 50 WinF (0.56 0.27 0.12 0.0088 0.027 0.018) 8) Ca< 10.48 101 38 WinF (0.62 0.21 0.13 0 0.02 0.02) 16) RI>=-0.93 85 25 WinF (0.71 0.2 0.071 0 0.012 0.012) 32) Mg< 3.865 77 18 WinF (0.77 0.14 0.065 0 0.013 0.013) 64) Fe< 0.115 57 8 WinF (0.86 0.053 0.053 0 0.018 0.018) * 65) Fe>=0.115 20 10 WinF (0.5 0.4 0.1 0 0 0) 130) Mg< 3.6 10 3 WinF (0.7 0.2 0.1 0 0 0) * 131) Mg>=3.6 10 4 WinNF (0.3 0.6 0.1 0 0 0) * 33) Mg>=3.865 8 2 WinNF (0.12 0.75 0.12 0 0 0) * 17) RI< -0.93 16 9 Veh (0.19 0.25 0.44 0 0.062 0.062) * 9) Ca>=10.48 12 2 WinNF (0 0.83 0 0.083 0.083 0) * 5) Al>=1.42 72 28 WinNF (0.083 0.61 0.056 0.15 0.083 0.014) 10) Mg>=2.26 52 11 WinNF (0.12 0.79 0.077 0 0.019 0) * 11) Mg< 2.26 20 9 Con (0 0.15 0 0.55 0.25 0.05) 22) Na< 13.495 12 1 Con (0 0.083 0 0.92 0 0) * 23) Na>=13.495 8 3 Tabl (0 0.25 0 0 0.62 0.12) * 3) Ba>=0.335 29 3 Head (0.034 0.034 0 0.034 0 0.9) * > plot(rout) > text(rout) > printcp(rout) Classification tree: rpart(formula = type ~ ., data = fgl, cp = 0.001) Variables actually used in tree construction: [1] Al Ba Ca Fe Mg Na RI Root node error: 138/214 = 0.64486 n= 214 CP nsplit rel error xerror xstd 1 0.206522 0 1.00000 1.09420 0.048314 2 0.072464 2 0.58696 0.60145 0.051652 3 0.057971 3 0.51449 0.58696 0.051414 4 0.036232 4 0.45652 0.49275 0.049357 5 0.032609 5 0.42029 0.51449 0.049913 6 0.010870 7 0.35507 0.50725 0.049733 7 0.001000 9 0.33333 0.49275 0.049357 > plotcp(rout) > pout <- prune(rout, cp = 0.05) > print(pout) n= 214 node), split, n, loss, yval, (yprob) * denotes terminal node 1) root 214 138 WinNF (0.33 0.36 0.079 0.061 0.042 0.14) 2) Ba< 0.335 185 110 WinNF (0.37 0.41 0.092 0.065 0.049 0.016) 4) Al< 1.42 113 50 WinF (0.56 0.27 0.12 0.0088 0.027 0.018) 8) Ca< 10.48 101 38 WinF (0.62 0.21 0.13 0 0.02 0.02) * 9) Ca>=10.48 12 2 WinNF (0 0.83 0 0.083 0.083 0) * 5) Al>=1.42 72 28 WinNF (0.083 0.61 0.056 0.15 0.083 0.014) 10) Mg>=2.26 52 11 WinNF (0.12 0.79 0.077 0 0.019 0) * 11) Mg< 2.26 20 9 Con (0 0.15 0 0.55 0.25 0.05) * 3) Ba>=0.335 29 3 Head (0.034 0.034 0 0.034 0 0.9) * > plot(pout) > text(pout) > ###### lme ####### > library(MASS) > data(petrol) > summary(petrol) No SG VP V10 EP A : 4 Min. :31.80 Min. :0.200 Min. :190.0 Min. :205.0 D : 4 1st Qu.:36.62 1st Qu.:1.800 1st Qu.:217.0 1st Qu.:274.5 G : 4 Median :40.00 Median :4.800 Median :231.0 Median :349.0 B : 3 Mean :39.25 Mean :4.181 Mean :241.5 Mean :332.1 C : 3 3rd Qu.:40.92 3rd Qu.:6.100 3rd Qu.:268.8 3rd Qu.:383.0 E : 3 Max. :50.80 Max. :8.600 Max. :316.0 Max. :444.0 (Other):11 Y Min. : 2.80 1st Qu.:11.65 Median :17.80 Mean :19.66 3rd Qu.:27.05 Max. :45.70 > help(petrol) > attach(petrol) > coplot(Y ~ EP | No) Warning message: calling par(new=) with no plot > plot(EP, Y) > coplot(Y ~ EP | No) > xyplot(Y ~ EP | No) Error: couldn't find function "xyplot" > help(xyplot, try.all.packages = TRUE) topic 'xyplot' is not in any loaded package but can be found in package 'lattice' in library '/APPS/8.2/lib/R/library' > library(lattice) > xyplot(Y ~ EP | No) > dev.off() null device 1 > out <- lm(Y ~ No / EP - 1) > summary(out) Call: lm(formula = Y ~ No/EP - 1) Residuals: Min 1Q Median 3Q Max -2.196375 -0.569993 0.006242 0.546432 2.471642 Coefficients: Estimate Std. Error t value Pr(>|t|) NoA -22.66715 3.33414 -6.799 1.91e-05 *** NoB -24.97342 4.93698 -5.058 0.000281 *** NoC -30.68583 4.90793 -6.252 4.24e-05 *** NoD -29.85863 4.01250 -7.441 7.83e-06 *** NoE -56.15986 8.95683 -6.270 4.13e-05 *** NoF -32.87646 5.65611 -5.813 8.31e-05 *** NoG -30.28882 3.70335 -8.179 3.00e-06 *** NoH -43.91059 5.23750 -8.384 2.32e-06 *** NoI -30.23562 11.98730 -2.522 0.026796 * NoJ -36.66183 13.30735 -2.755 0.017440 * NoA:EP 0.16686 0.01051 15.872 2.03e-09 *** NoB:EP 0.14632 0.01737 8.426 2.20e-06 *** NoC:EP 0.17968 0.01755 10.236 2.78e-07 *** NoD:EP 0.15354 0.01201 12.785 2.38e-08 *** NoE:EP 0.22879 0.02500 9.150 9.27e-07 *** NoF:EP 0.16048 0.01627 9.862 4.16e-07 *** NoG:EP 0.13571 0.01127 12.044 4.64e-08 *** NoH:EP 0.17041 0.01414 12.052 4.61e-08 *** NoI:EP 0.12603 0.03080 4.092 0.001494 ** NoJ:EP 0.12900 0.03398 3.796 0.002548 ** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 1.59 on 12 degrees of freedom Multiple R-Squared: 0.9981, Adjusted R-squared: 0.9949 F-statistic: 314.6 on 20 and 12 DF, p-value: 2.347e-13 > out.little <- lm(Y ~ No + EP - 1) > summary(out.little) Call: lm(formula = Y ~ No + EP - 1) Residuals: Min 1Q Median 3Q Max -3.13601 -0.93477 -0.08414 1.16652 3.39579 Coefficients: Estimate Std. Error t value Pr(>|t|) NoA -20.163722 1.996062 -10.10 1.62e-09 *** NoB -28.438473 1.930784 -14.73 1.53e-12 *** NoC -24.931068 1.908768 -13.06 1.50e-11 *** NoD -31.558950 2.095101 -15.06 9.92e-13 *** NoE -31.193987 2.308314 -13.51 7.90e-12 *** NoF -32.277592 2.241302 -14.40 2.35e-12 *** NoG -37.677207 2.061947 -18.27 2.26e-14 *** NoH -39.650067 2.350482 -16.87 1.10e-13 *** NoI -42.907727 2.583537 -16.61 1.49e-13 *** NoJ -48.277037 2.483321 -19.44 6.61e-15 *** EP 0.158730 0.005718 27.76 < 2e-16 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 1.879 on 21 degrees of freedom Multiple R-Squared: 0.9953, Adjusted R-squared: 0.9929 F-statistic: 408.4 on 11 and 21 DF, p-value: < 2.2e-16 > anova(out.little, out) Analysis of Variance Table Model 1: Y ~ No + EP - 1 Model 2: Y ~ No/EP - 1 Res.Df RSS Df Sum of Sq F Pr(>F) 1 21 74.132 2 12 30.329 9 43.803 1.9257 0.1439 > out <- out.little > out.little <- lm(Y ~ EP) > summary(out.little) Call: lm(formula = Y ~ EP) Residuals: Min 1Q Median 3Q Max -14.75837 -6.27829 0.05255 5.16243 17.84805 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -16.66206 6.68721 -2.492 0.0185 * EP 0.10937 0.01972 5.546 4.98e-06 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 7.659 on 30 degrees of freedom Multiple R-Squared: 0.5063, Adjusted R-squared: 0.4898 F-statistic: 30.76 on 1 and 30 DF, p-value: 4.983e-06 > anova(out.little, out) Analysis of Variance Table Model 1: Y ~ EP Model 2: Y ~ No + EP - 1 Res.Df RSS Df Sum of Sq F Pr(>F) 1 30 1759.69 2 21 74.13 9 1685.56 53.054 1.984e-12 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 > out.little <- lm(Y ~ SG + VP + V10 + EP) > summary(out.little) Call: lm(formula = Y ~ SG + VP + V10 + EP) Residuals: Min 1Q Median 3Q Max -3.5804 -1.5223 -0.1098 1.4237 4.6214 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -6.820774 10.123152 -0.674 0.5062 SG 0.227246 0.099937 2.274 0.0311 * VP 0.553726 0.369752 1.498 0.1458 V10 -0.149536 0.029229 -5.116 2.23e-05 *** EP 0.154650 0.006446 23.992 < 2e-16 *** --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 Residual standard error: 2.234 on 27 degrees of freedom Multiple R-Squared: 0.9622, Adjusted R-squared: 0.9566 F-statistic: 171.7 on 4 and 27 DF, p-value: < 2.2e-16 > anova(out.little, out) Analysis of Variance Table Model 1: Y ~ SG + VP + V10 + EP Model 2: Y ~ No + EP - 1 Res.Df RSS Df Sum of Sq F Pr(>F) 1 27 134.804 2 21 74.132 6 60.672 2.8645 0.03368 * --- Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 > > ### Is this comparison really nested??? > ### get model matrices > out <- lm(Y ~ No + EP - 1, x = TRUE) > out.little <- lm(Y ~ SG + VP + V10 + EP, x = TRUE) > dim(out$x) [1] 32 11 > dim(out.little$x) [1] 32 5 > ### regress each column of out.little$x on columns of out$x > foo <- function(y) sum(lm(y ~ out$x - 1)$residuals^2) > apply(out.little$x, 2, foo) (Intercept) SG VP V10 EP 3.411721e-31 6.551188e-28 1.407484e-29 3.640476e-26 3.417121e-28 > ### essentially zero RSS means perfect fit so columns of out.little$x > ### ARE linear combinations of columns of out$x and models ARE nested