Databases Reference
In-Depth Information
# Attach the `class` package. library() stops with an error when the package
# is missing, whereas require() only warns and returns FALSE.
library(class)
# NOTE(review): hard-coded, machine-specific working directory; the relative
# path passed to read.xls() below is resolved against it.
setwd("~/Documents/Teaching/Stat 4242 Fall 2012/Homework 2")
# Read the Manhattan rolling-sales spreadsheet, keeping text columns as
# character vectors instead of factors.
# NOTE(review): read.xls() is not defined in this section -- presumably
# gdata::read.xls attached elsewhere, in which case `pattern` selects the
# header line containing "BOROUGH". Confirm.
mt <- read.xls(
  "rollingsales_manhattan.xls",
  pattern = "BOROUGH",
  stringsAsFactors = FALSE
)
# Quick look at the raw import.
head(mt)
summary(mt)
# Lower-case the column names once so the columns can be referenced
# consistently below.
names(mt) <- tolower(names(mt))

# Numeric sale price: strip every non-digit character before converting.
# Note that this also removes any decimal point.
mt$sale.price.n <- as.numeric(gsub("[^[:digit:]]", "", mt$sale.price))
# Diagnostics: number of prices that failed to parse ...
sum(is.na(mt$sale.price.n))
# ... and number of zero prices. na.rm = TRUE keeps unparseable prices from
# turning this count into NA.
sum(mt$sale.price.n == 0, na.rm = TRUE)

## clean/format the data with regular expressions
mt$gross.sqft <- as.numeric(gsub("[^[:digit:]]", "", mt$gross.square.feet))
mt$land.sqft <- as.numeric(gsub("[^[:digit:]]", "", mt$land.square.feet))
mt$sale.date <- as.Date(mt$sale.date)
# Going through as.character() first converts the labels (not the internal
# codes) should the column have been read as a factor.
mt$year.built <- as.numeric(as.character(mt$year.built))
mt$zip.code <- as.character(mt$zip.code)
## - standardize data (set year built start to 0; land and
## gross sq ft; sale price (exclude $0 and possibly others);
## possibly tax block; outside dataset for coords of tax block/lot?)
# Keep only sales at or above the price floor. Rows whose parsed price is NA
# are dropped as well.
min_price <- 10000
mt <- mt[
  !is.na(mt$sale.price.n) & mt$sale.price.n >= min_price,
]
# Number of observations remaining after the filter.
n_obs <- nrow(mt)
# Address without the apartment part: trim the ends, collapse runs of spaces
# into a single space, then drop everything from the first comma onward.
# NOTE(review): trim() is not defined in this section -- presumably
# gdata::trim attached elsewhere. Confirm.
mt$address.noapt <- gsub(
  "[,][[:print:]]*", "",
  gsub("[ ]+", " ", trim(mt$address))
)

# One row per distinct (address, zip code) pair, ordered by address.
mt_add <- unique(data.frame(
  address.noapt = mt$address.noapt,
  zip.code = mt$zip.code,
  stringsAsFactors = FALSE
))
mt_add <- mt_add[order(mt_add$address.noapt), ]

# Find duplicate addresses with different zip codes.
dup <- duplicated(mt_add$address.noapt)
# Index with the `dup` vector itself: it is not a column of mt_add, so
# mt_add$dup is NULL and would select nothing.
dup_add <- mt_add[dup, 1]
# Remove every row whose address is duplicated. %in% handles any number of
# duplicated addresses (including none) and never yields NA.
mt_add <- mt_add[!(mt_add$address.noapt %in% dup_add), ]
Search WWH ::




Custom Search