Extracting Meaning from Data - Doing Data Science

Databases Reference

In-Depth Information

# Price cutoff (in dollars) used both for the reference line on the
# histogram and for the Expensive indicator, so the two cannot drift apart.
price_threshold <- 12000

# plot a histogram of price with a vertical line marking the threshold
ggplot(diamonds) +
  geom_histogram(aes(x = price)) +
  geom_vline(xintercept = price_threshold)

# build a 1/0 indicator variable: 1 when the price is at or above our
# threshold, 0 otherwise
diamonds$Expensive <- ifelse(diamonds$price >= price_threshold, 1, 0)
head(diamonds)

# get rid of the price column so it cannot be used as a predictor of the
# indicator that was derived from it
diamonds$price <- NULL

## glmnet
# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(glmnet)

# build the predictor matrix; we are leaving out the last column,
# which is our response
x <- model.matrix(~ ., diamonds[, -ncol(diamonds)])

# build the response as a one-column matrix
y <- as.matrix(diamonds$Expensive)

# fit the glmnet with a binomial family, timing how long the fit takes
system.time(modGlmnet <- glmnet(x = x, y = y, family = "binomial"))

# plot the coefficient path, labelling each curve
plot(modGlmnet, label = TRUE)

# this illustrates that setting a seed allows you to recreate random
# results; run both lines together a few times and the permutation
# printed by sample() will be the same each time
set.seed(48872)
sample(1:10)

## decision tree
# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(rpart)

# fit a simple decision tree predicting Expensive from all other columns
# NOTE(review): Expensive is built with ifelse(..., 1, 0), i.e. numeric, and
# no `method` is given, so rpart chooses the method itself; per the rpart
# documentation a non-factor response gets a regression ("anova") tree.
# Pass method = "class" if a classification tree is intended -- confirm.
modTree <- rpart(Expensive ~ ., data = diamonds)

# plot the splits, then overlay the split labels on the drawn tree
plot(modTree)
text(modTree)

## bagging (or bootstrap aggregating)
# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(boot)

# sample mean and standard deviation of carat, printed for reference
mean(diamonds$carat)
sd(diamonds$carat)

# Statistic for bootstrapping the mean: `x` is the data vector and `i` the
# indices selecting the elements of one resample; returns the mean of the
# selected elements.
boot.mean <- function(x, i) {
  resampled <- x[i]
  mean(resampled)
}

# bootstrap the mean of carat with 120 replicates; this allows us to find
# the variability of the mean
boot(data = diamonds$carat, statistic = boot.mean, R = 120)

# library() stops with an error if the package is missing, whereas
# require() only returns FALSE and lets the script fail later.
# NOTE(review): nothing in the visible section uses adabag yet --
# presumably the code that follows does; confirm.
library(adabag)

Doing Data Science

Search WWH ::

Custom Search

Home