> library(MASS) > data(cpus) > cpus2 <- cpus[ , 2:8] > summary(cpus2) syct mmin mmax cach Min. : 17.0 Min. : 64 Min. : 64 Min. : 0.00 1st Qu.: 50.0 1st Qu.: 768 1st Qu.: 4000 1st Qu.: 0.00 Median : 110.0 Median : 2000 Median : 8000 Median : 8.00 Mean : 203.8 Mean : 2868 Mean :11796 Mean : 25.21 3rd Qu.: 225.0 3rd Qu.: 4000 3rd Qu.:16000 3rd Qu.: 32.00 Max. :1500.0 Max. :32000 Max. :64000 Max. :256.00 chmin chmax perf Min. : 0.000 Min. : 0.00 Min. : 6.0 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 27.0 Median : 2.000 Median : 8.00 Median : 50.0 Mean : 4.699 Mean : 18.27 Mean : 105.6 3rd Qu.: 6.000 3rd Qu.: 24.00 3rd Qu.: 113.0 Max. :52.000 Max. :176.00 Max. :1150.0 > help(cpus) > library(mgcv) This is mgcv 0.9-3 > attach(cpus2) > out <- gam(log10(perf) ~ syct + mmin + mmax + cach + chmin + chmax) > summary(out) Family: gaussian Link function: identity Formula: log10(perf) ~ syct + mmin + mmax + cach + chmin + chmax Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) (Intercept) 1.4751 0.02679 55.06 < 2.22e-16 syct -0.00036291 5.834e-05 -6.22 2.8053e-09 mmin 1.1763e-05 6.084e-06 1.934 0.054566 mmax 1.8539e-05 2.137e-06 8.674 1.3795e-15 cach 0.0033479 0.0004648 7.203 1.1408e-11 chmin 0.0029189 0.00285 1.024 0.30692 chmax 9.322e-05 0.0007328 0.1272 0.8989 R-sq.(adj) = 0.807 Deviance explained = 81.3% GCV score = 0.041294 Scale est. = 0.039911 n = 209 > out <- gam(log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + + s(chmax, bs = "cr")) > summary(out) Family: gaussian Link function: identity Formula: log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr") Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) constant 1.881 0.05161 36.45 < 2.22e-16 Approximate significance of smooth terms: edf chi.sq p-value s(syct) 6.007 15.43 0.020854 s(mmin) 3.596 10.186 0.031031 s(mmax) 5.972 69.17 8.4111e-11 s(cach) 8.9 105.38 3.0267e-14 s(chmin) 7.473 17.703 0.021944 s(chmax) 8.126 17.262 0.035247 R-sq.(adj) = 0.892 Deviance explained = 91.2% GCV score = 0.027944 Scale est. = 0.022466 n = 209 > gam.check(out) Smoothing parameter selection converged after 10 iterations. The RMS GCV score gradiant at convergence was 4.270675e-07 . The Hessian was positive definite. The estimated model rank was 55 (maximum possible: 55) > plot(out) Press return for next page.... Press return for next page.... Press return for next page.... Press return for next page.... Press return for next page.... > > help(plot.gam) > out2 <- gam(log10(perf) ~ s(syct, mmin) + + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + + s(chmax, bs = "cr")) > gam.check(out2) Smoothing parameter selection converged after 11 iterations. The RMS GCV score gradiant at convergence was 1.034848e-06 . The Hessian was positive definite. The estimated model rank was 66 (maximum possible: 66) > summary(out2) Family: gaussian Link function: identity Formula: log10(perf) ~ s(syct, mmin) + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr") Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) constant 1.8967 0.04274 44.38 < 2.22e-16 Approximate significance of smooth terms: edf chi.sq p-value s(syct,mmin) 18.03 54.857 9.1535e-05 s(mmax) 3.597 62.609 4.313e-11 s(cach) 8.833 85.357 1.3071e-11 s(chmin) 7.232 24.218 0.0020691 s(chmax) 8.59 24.904 0.0038496 R-sq.(adj) = 0.897 Deviance explained = 92% GCV score = 0.027458 Scale est. = 0.021259 n = 209 > plot(out2) Press return for next page.... Press return for next page.... Press return for next page.... Press return for next page.... > vis.gam(out2) > > sform <- 'log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + s(mmax, bs ="cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr")' > sform [1] "log10(perf) ~ s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(mmax, bs =\"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" > as.formula(sform) log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr") > class(as.formula(sform)) [1] "formula" > typeof(as.formula(sform)) [1] "language" > out <- gam(as.formula(sform)) > summary(out) Family: gaussian Link function: identity Formula: log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr") Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) constant 1.881 0.05161 36.45 < 2.22e-16 Approximate significance of smooth terms: edf chi.sq p-value s(syct) 6.007 15.43 0.020854 s(mmin) 3.596 10.186 0.031031 s(mmax) 5.972 69.17 8.4111e-11 s(cach) 8.9 105.38 3.0267e-14 s(chmin) 7.473 17.703 0.021944 s(chmax) 8.126 17.262 0.035247 R-sq.(adj) = 0.892 Deviance explained = 91.2% GCV score = 0.027944 Scale est. = 0.022466 n = 209 > > foo <- NULL > bar <- NULL > baz <- NULL > for (i in 1:5) + for (j in (i+1):6) { + sform <- paste("log10(perf) ~ s(", names(cpus2)[i], ", ", + names(cpus2)[j], ")", sep = "") + for (k in 1:6) { + if (k != i && k != j) + sform <- paste(sform, " + s(", names(cpus2)[k], + ", bs = \"cr\")", sep = "") + } + out <- gam(as.formula(sform)) + foo <- c(foo, sform) + bar <- c(bar, out$deviance) + baz <- c(baz, sum(out$edf)) + } > foo [1] "log10(perf) ~ s(syct, mmin) + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [2] "log10(perf) ~ s(syct, mmax) + s(mmin, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [3] "log10(perf) ~ s(syct, cach) + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [4] "log10(perf) ~ s(syct, chmin) + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmax, bs = \"cr\")" [5] "log10(perf) ~ s(syct, chmax) + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\")" [6] "log10(perf) ~ s(mmin, mmax) + s(syct, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [7] "log10(perf) ~ s(mmin, cach) + s(syct, bs = \"cr\") + s(mmax, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [8] "log10(perf) ~ s(mmin, chmin) + s(syct, bs = \"cr\") + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmax, bs = \"cr\")" [9] "log10(perf) ~ s(mmin, chmax) + s(syct, bs = \"cr\") + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\")" [10] "log10(perf) ~ s(mmax, cach) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(chmin, bs = \"cr\") + s(chmax, bs = \"cr\")" [11] "log10(perf) ~ s(mmax, chmin) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmax, bs = \"cr\")" [12] "log10(perf) ~ s(mmax, chmax) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(cach, bs = \"cr\") + s(chmin, bs = \"cr\")" [13] "log10(perf) ~ s(cach, chmin) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(chmax, bs = \"cr\")" [14] "log10(perf) ~ s(cach, chmax) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(chmin, bs = \"cr\")" [15] "log10(perf) ~ s(chmin, chmax) + s(syct, bs = \"cr\") + s(mmin, bs = \"cr\") + s(mmax, bs = \"cr\") + s(cach, bs = \"cr\")" > bar [1] 3.440091 3.476594 3.921316 4.467162 4.441291 3.862615 3.658427 4.243920 [9] 4.079913 3.323344 3.976760 3.866120 3.511496 4.042348 3.749246 > baz [1] 46.28472 42.55493 44.20348 27.73586 27.32756 39.89493 41.25936 32.61469 [9] 34.42520 50.13435 37.38003 38.99765 41.21502 36.34028 42.22130 > out <- gam(log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + + s(chmax, bs = "cr")) > out$deviance [1] 3.774799 > sum(out$edf) [1] 40.07417 > bar - out$deviance [1] -0.33470794 -0.29820416 0.14651693 0.69236379 0.66649257 0.08781674 [7] -0.11637116 0.46912152 0.30511459 -0.45145428 0.20196090 0.09132161 [13] -0.26330260 0.26754949 -0.02555257 > baz - sum(out$edf) [1] 6.2105574 2.4807613 4.1293119 -12.3383105 -12.7466048 -0.1792387 [7] 1.1851979 -7.4594747 -5.6489632 10.0601828 -2.6941371 -1.0765203 [13] 1.1408544 -3.7338838 2.1471377 > out.too <- gam(as.formula(foo[4])) > summary(out.too) Family: gaussian Link function: identity Formula: log10(perf) ~ s(syct, chmin) + s(mmin, bs = "cr") + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmax, bs = "cr") Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) constant 1.909 0.03109 61.41 < 2.22e-16 Approximate significance of smooth terms: edf chi.sq p-value s(syct,chmin) 2 7.9554 0.020396 s(mmin) 2.128 17.631 0.00026491 s(mmax) 6.424 76.29 9.9236e-12 s(cach) 8.942 104.16 2.7385e-14 s(chmax) 8.242 40.381 1.3825e-05 R-sq.(adj) = 0.88 Deviance explained = 89.6% GCV score = 0.0287 Scale est. = 0.024767 n = 209 > summary(out) Family: gaussian Link function: identity Formula: log10(perf) ~ s(syct, bs = "cr") + s(mmin, bs = "cr") + s(mmax, bs = "cr") + s(cach, bs = "cr") + s(chmin, bs = "cr") + s(chmax, bs = "cr") Parametric coefficients: Estimate std. err. t ratio Pr(>|t|) constant 1.881 0.05161 36.45 < 2.22e-16 Approximate significance of smooth terms: edf chi.sq p-value s(syct) 6.007 15.43 0.020854 s(mmin) 3.596 10.186 0.031031 s(mmax) 5.972 69.17 8.4111e-11 s(cach) 8.9 105.38 3.0267e-14 s(chmin) 7.473 17.703 0.021944 s(chmax) 8.126 17.262 0.035247 R-sq.(adj) = 0.892 Deviance explained = 91.2% GCV score = 0.027944 Scale est. = 0.022466 n = 209 > > library(rpart) > rout <- rpart(log10(perf) ~ syct + mmin + mmax + cach + chmin + chmax, + cp = 1e-3) > rout$method [1] "anova" > # summary(rout) # large amount of blather > print(rout) n= 209 node), split, n, deviance, yval * denotes terminal node 1) root 209 43.11554000 1.753333 2) cach< 27 143 11.79085000 1.524647 4) mmax< 6100 78 3.89374400 1.374824 8) mmax< 1750 12 0.78425160 1.088732 * 9) mmax>=1750 66 1.94873300 1.426840 18) mmax< 2500 17 0.56676380 1.325292 * 19) mmax>=2500 49 1.14584500 1.462071 38) chmax< 4.5 14 0.35284960 1.354804 * 39) chmax>=4.5 35 0.56747110 1.504978 78) syct< 110 9 0.07741772 1.414738 * 79) syct>=110 26 0.39139370 1.536215 * 5) mmax>=6100 65 4.04520300 1.704434 10) syct>=360 7 0.12908090 1.279749 * 11) syct< 360 58 2.50124700 1.755690 22) chmin< 5.5 46 1.22622900 1.698613 44) cach< 0.5 11 0.20206270 1.530643 * 45) cach>=0.5 35 0.61627500 1.751403 90) chmin>=1.5 15 0.26087250 1.690337 * 91) chmin< 1.5 20 0.25751430 1.797203 182) mmax< 14000 13 0.08761793 1.756702 * 183) mmax>=14000 7 0.10896860 1.872420 * 23) chmin>=5.5 12 0.55071310 1.974483 * 3) cach>=27 66 7.64263500 2.248821 6) mmax< 28000 41 2.34141700 2.061986 12) cach< 96.5 34 1.59195100 2.008124 24) mmax< 11240 14 0.42462370 1.826635 * 25) mmax>=11240 20 0.38340130 2.135166 50) chmax< 14 10 0.07835528 2.037790 * 51) chmax>=14 10 0.11540350 2.232542 * 13) cach>=96.5 7 0.17173020 2.323601 * 7) mmax>=28000 25 1.52286300 2.555230 14) cach< 56 7 0.06929430 2.268365 * 15) cach>=56 18 0.65351270 2.666788 * > plot(rout) > text(rout) > printcp(rout) Regression tree: rpart(formula = log10(perf) ~ syct + mmin + mmax + cach + chmin + chmax, cp = 0.001) Variables actually used in tree construction: [1] cach chmax chmin mmax syct Root node error: 43.116/209 = 0.20629 n= 209 CP nsplit rel error xerror xstd 1 0.5492697 0 1.00000 1.02028 0.097925 2 0.0893390 1 0.45073 0.48857 0.049183 3 0.0876332 2 0.36139 0.43422 0.043776 4 0.0328159 3 0.27376 0.30865 0.031042 5 0.0269220 4 0.24094 0.32475 0.038066 6 0.0185561 5 0.21402 0.30063 0.037301 7 0.0167992 6 0.19546 0.30578 0.036989 8 0.0157908 7 0.17866 0.29330 0.036400 9 0.0094604 9 0.14708 0.26706 0.034656 10 0.0054766 10 0.13762 0.26898 0.035105 11 0.0052307 11 0.13215 0.25528 0.033723 12 0.0043985 12 0.12692 0.25113 0.033630 13 0.0022883 13 0.12252 0.24802 0.033571 14 0.0022704 14 0.12023 0.24786 0.033457 15 0.0014131 15 0.11796 0.24664 0.033439 16 0.0010000 16 0.11655 0.24651 0.033576 > plotcp(rout) > pout <- prune(rout, cp = 0.01) > print(pout) n= 209 node), split, n, deviance, yval * denotes terminal node 1) root 209 43.1155400 1.753333 2) cach< 27 143 11.7908500 1.524647 4) mmax< 6100 78 3.8937440 1.374824 8) mmax< 1750 12 0.7842516 1.088732 * 9) mmax>=1750 66 1.9487330 1.426840 * 5) mmax>=6100 65 4.0452030 1.704434 10) syct>=360 7 0.1290809 1.279749 * 11) syct< 360 58 2.5012470 1.755690 22) chmin< 5.5 46 1.2262290 1.698613 * 23) chmin>=5.5 12 0.5507131 1.974483 * 3) cach>=27 66 7.6426350 2.248821 6) mmax< 28000 41 2.3414170 2.061986 12) cach< 96.5 34 1.5919510 2.008124 24) mmax< 11240 14 0.4246237 1.826635 * 25) mmax>=11240 20 0.3834013 2.135166 * 13) cach>=96.5 7 0.1717302 2.323601 * 7) mmax>=28000 25 1.5228630 2.555230 14) cach< 56 7 0.0692943 2.268365 * 15) cach>=56 18 0.6535127 2.666788 * > plot(pout) > text(pout) >