这是我在参加 Kaggle 上的 bike-sharing-demand 比赛时遇到的一个问题。我参考这篇文章对数据进行了处理,但是处理之后,新添加的 month 特征对预测效果产生了极大的影响:在添加 month 之前,提交的分数为 0.42,而在添加 month 特征之后,分数变成了 0.47。按照这个比赛的评分标准,分数越低,预测效果越好。
也就是说,在添加了一个特征之后,预测效果反而非常明显地下降了。随机森林不是对冗余特征不敏感吗?
处理后的数据我已经上传到了GitHub上,大家可以直接下载下来测试。
使用的是随机森林算法,只设置了参数ntree=250。
我刚刚开始学习数据挖掘,前面被这个问题困住了好几天,怎么也搞不明白这是怎么回事。希望各位大神能够帮忙解答一下。
对数据的具体处理如下:
#loading the required libraries
library(rpart)
#library(rattle)
#library(rpart.plot)
library(RColorBrewer)
library(randomForest)
# Load the training and test sets from the working directory.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")
str(train)
# Add zero-filled placeholders for the three target columns to test so that
# its columns line up with train for the rbind() below.
test[c("registered", "casual", "count")] <- 0
# Stack both sets so each feature transformation is applied to all rows at once.
data <- rbind(train, test)
# Print the structure and a per-column summary of the combined frame.
str(data)
summary(data)
# Treat the coded columns as categorical instead of numeric.
data[["holiday"]] <- as.factor(data[["holiday"]])
data[["workingday"]] <- as.factor(data[["workingday"]])
data[["season"]] <- as.factor(data[["season"]])
data[["weather"]] <- as.factor(data[["weather"]])
# Characters 12-13 of datetime are taken as the hour and stored as a factor.
data[["hour"]] <- as.factor(substr(data[["datetime"]], start = 12, stop = 13))
# Split on the day of month (characters 9-10 of datetime):
# below 20 goes to train, above 19 goes to test.
train <- data[as.integer(substr(data[["datetime"]], 9, 10)) < 20, ]
test <- data[as.integer(substr(data[["datetime"]], 9, 10)) > 19, ]
# creating some boxplots on the count of rentals
#boxplot(train$count~train$hour,xlab="hour", ylab="count of users")
#boxplot(train$casual~train$hour,xlab="hour", ylab="casual users")
#boxplot(train$registered~train$hour,xlab="hour", ylab="registered users")
# Derive the weekday name from the date part (first 10 characters) of datetime.
# NOTE(review): weekdays() returns names in the session locale - confirm that
# any later comparison against day names is written for the same locale.
date <- substr(data[["datetime"]], start = 1, stop = 10)
days <- weekdays(as.Date(date))
data[["day"]] <- days
# Refresh both splits so they carry the new day column.
train <- data[as.integer(substr(data[["datetime"]], 9, 10)) < 20, ]
test <- data[as.integer(substr(data[["datetime"]], 9, 10)) > 19, ]
# creating boxplots for rentals with different variables to see the variation
#boxplot(train$registered~train$day,xlab="day", ylab="registered users")
#boxplot(train$casual~train$day,xlab="day", ylab="casual users")
#boxplot(train$registered~train$weather,xlab="weather", ylab="registered users")
#boxplot(train$casual~train$weather,xlab="weather", ylab="casual users")
#boxplot(train$registered~train$temp,xlab="temp", ylab="registered users")
#boxplot(train$casual~train$temp,xlab="temp", ylab="casual users")
# The first four characters of datetime are taken as the year (kept as factor).
data[["year"]] <- as.factor(substr(data[["datetime"]], start = 1, stop = 4))
# The split is repeated after each new feature; as the original author notes,
# it could have been done once instead.
train <- data[as.integer(substr(data[["datetime"]], 9, 10)) < 20, ]
test <- data[as.integer(substr(data[["datetime"]], 9, 10)) > 19, ]
# again some boxplots with different variables
# these boxplots give important information about the dependent variable with respect to the independent variables
#boxplot(train$registered~train$year,xlab="year", ylab="registered users")
#boxplot(train$casual~train$year,xlab="year", ylab="casual users")
#boxplot(train$registered~train$windspeed,xlab="windspeed", ylab="registered users")
#boxplot(train$casual~train$windspeed,xlab="windspeed", ylab="casual users")
#boxplot(train$registered~train$humidity,xlab="humidity", ylab="registered users")
#boxplot(train$casual~train$humidity,xlab="humidity", ylab="casual users")
# Convert hour from a factor to an integer.
# as.integer() applied to a factor returns its internal level codes, not the
# numbers written in the labels, so label "00" becomes 1 and "23" becomes 24,
# and that only holds while all 24 hour levels are present. Parse the labels
# and add 1 explicitly instead: the result is identical on data containing
# every hour, the offset is visible, and a missing level can no longer shift
# the codes. The dp_reg/dp_cas thresholds later in this script are written
# against this 1..24 scale, so the scale itself is kept.
if (is.factor(data$hour)) {
  data$hour <- as.integer(as.character(data$hour)) + 1L
} else {
  # Not a factor (e.g. already converted): keep the plain coercion.
  data$hour <- as.integer(data$hour)
}
# Placeholder for a part-of-day feature; the original author notes it was
# never used in the end.
data$day_part <- 0
# Re-split by day of month (characters 9-10 of datetime): below 20 -> train,
# above 19 -> test.
train <- data[as.integer(substr(data$datetime, 9, 10)) < 20, ]
test <- data[as.integer(substr(data$datetime, 9, 10)) > 19, ]
# Rebuild the combined frame; its rows are now ordered train first, then test.
data <- rbind(train, test)
# Fit rpart trees of each rental count on hour alone; per the original
# author's note, such trees were used to bin variables during feature
# engineering. Each fit overwrites `d`, so only the casual-user tree is left
# in `d` afterwards.
d <- rpart(formula = registered ~ hour, data = train)
# fancyRpartPlot(d)
d <- rpart(formula = casual ~ hour, data = train)
# fancyRpartPlot(d)
# Rebuild the combined frame from the current splits (train rows first).
data <- rbind(train, test)
# Part-of-day bins for registered users, keyed on data$hour.
# data$hour was produced earlier by as.integer() on the hour factor, so the
# thresholds below are on that integer scale rather than on clock hours.
# The rules do not overlap, so they are listed in ascending hour order;
# rows matched by no rule keep the default 0.
data[["dp_reg"]] <- 0
data[["dp_reg"]][data[["hour"]] < 8] <- 1
data[["dp_reg"]][data[["hour"]] == 8] <- 4
data[["dp_reg"]][data[["hour"]] == 9] <- 5
data[["dp_reg"]][data[["hour"]] > 9 & data[["hour"]] < 18] <- 3
data[["dp_reg"]][data[["hour"]] %in% c(18, 19)] <- 7
data[["dp_reg"]][data[["hour"]] %in% c(20, 21)] <- 6
data[["dp_reg"]][data[["hour"]] >= 22] <- 2
# Part-of-day bins for casual users, on the same hour scale.
data[["dp_cas"]] <- 0
data[["dp_cas"]][data[["hour"]] <= 8] <- 1
data[["dp_cas"]][data[["hour"]] == 9] <- 2
data[["dp_cas"]][data[["hour"]] >= 10 & data[["hour"]] <= 19] <- 3
data[["dp_cas"]][data[["hour"]] > 19] <- 4
# Characters 6-7 of datetime are taken as the month and stored as an integer.
data[["month"]] <- as.integer(substr(data[["datetime"]], start = 6, stop = 7))