Databases Reference
In-Depth Information
# plot a histogram of price with a line marking $12,000
ggplot(diamonds) +
  geom_histogram(aes(x = price)) +
  geom_vline(xintercept = 12000)

# build a 1/0 indicator variable: 1 if the price is at or above our
# threshold of $12,000, 0 otherwise
diamonds$Expensive <- ifelse(diamonds$price >= 12000, 1, 0)

# inspect the first rows to confirm the new column was added
head(diamonds)

# get rid of the price column; Expensive was derived from it
diamonds$price <- NULL
## glmnet
# library() stops with an error if glmnet is not installed, whereas
# require() only warns and lets the script fail later at the glmnet() call
library(glmnet)

# build the predictor matrix, we are leaving out the last column
# which is our response
x <- model.matrix(~ ., diamonds[, -ncol(diamonds)])

# build the response as a one-column matrix
y <- as.matrix(diamonds$Expensive)

# fit the binomial glmnet model and report how long the fit takes;
# the assignment inside system.time() still creates modGlmnet here
system.time(modGlmnet <- glmnet(x = x, y = y, family = "binomial"))

# plot the coefficient path, labelling each curve
plot(modGlmnet, label = TRUE)
# this illustrates that setting a seed allows you to recreate random
# results, run them both a few times
set.seed(48872)
sample(1:10)
## decision tree
# library() fails immediately if rpart is missing, instead of require()'s
# warning followed by a later "could not find function" error
library(rpart)

# fit a simple decision tree using every other column as a predictor
# NOTE(review): Expensive is stored as numeric 1/0, so rpart() will
# presumably choose method = "anova" (a regression tree); convert the
# response to a factor if a classification tree is intended -- confirm
modTree <- rpart(Expensive ~ ., data = diamonds)

# plot the splits, then add the text labels on top of the drawn tree
plot(modTree)
text(modTree)
## bagging (or bootstrap aggregating)
# library() errors right away when boot is not installed; require() would
# only return FALSE with a warning
library(boot)

# mean and standard deviation of carat computed on the full data
mean(diamonds$carat)
sd(diamonds$carat)
# statistic function for bootstrapping the mean: x is the data vector and
# i is the vector of indices selecting the observations to average
boot.mean <- function(x, i) {
  resampled <- x[i]
  mean(resampled)
}
# bootstrap the mean of carat with 120 replicates, which lets us see
# how variable the mean is
boot(
  data = diamonds$carat,
  statistic = boot.mean,
  R = 120
)
# library() stops with an error if adabag is not installed, rather than
# returning FALSE with only a warning as require() does
library(adabag)