# read raw data
# One CSV file per station, all located in the same directory and named
# PRSA_Data_<station>_20130301-20170228.csv.
station <- c('Aotizhongxin', 'Changping', 'Dingling', 'Dongsi', 'Guanyuan', 'Gucheng', 'Huairou', 'Nongzhanguan', 'Shunyi', 'Tiantan', 'Wanliu', 'Wanshouxigong')
station_files <- file.path(
    "./data/PRSA_Data_20130301-20170228",
    paste0("PRSA_Data_", station, "_20130301-20170228.csv")
)

# Fail early, naming every missing file, instead of stopping at the first
# unreadable one halfway through the import.
missing_files <- station_files[!file.exists(station_files)]
if (length(missing_files) > 0) {
    stop("Missing input file(s): ", paste(missing_files, collapse = ", "), call. = FALSE)
}

# Read every file first and bind once: calling rbind() inside a loop recopies
# all previously accumulated rows on each iteration.
data <- do.call(rbind, lapply(station_files, read.csv))

# scatter plot
# Plots column 6 (PM2.5) against columns 7-11 and 17 on a random subsample of
# rows, one facet per variable, and saves the figure as an EPS file.
library(ggplot2)

names <- colnames(data)
names[17] <- "Wind speed"  # label shown under the facet of column 17

# Draw up to 3000 distinct row indices. Capping the size keeps sample() from
# erroring when fewer than 3000 rows are available; with 3000 or more rows the
# draw is the same as sample(1:nrow(data), 3000).
sp <- sample.int(nrow(data), min(3000, nrow(data)), replace = FALSE)

# One long-format data frame per plotted variable, stacked below.
tmp <- lapply(c(7:11, 17), function(j) {
    data.frame(x = data[sp, j], y = data[sp, 6], variable = names[j], y_name = "PM2.5")
})
tmp_data <- do.call(rbind, tmp)
# Drop sampled points where either coordinate is missing.
tmp_data <- tmp_data[complete.cases(tmp_data), ]

scatter_plot <- ggplot(tmp_data, aes(x, y)) +
    geom_point(size = 0.3) +
    xlab(NULL) + ylab("PM2.5") +
    # Strips are placed at the bottom, outside the axes, so each variable name
    # reads as that panel's x-axis title.
    facet_wrap(~variable, scales = "free_x", nrow = 1, strip.position = "bottom") +
    theme_classic() +
    theme(strip.background = element_blank(),
          strip.placement = "outside",
          axis.text = element_text(size = rel(0.8)),
          axis.title.y = element_text(size = rel(0.8), angle = 90, face = "plain"),
          panel.spacing.x = unit(1, "lines")
         )
ggsave("data/scatter_plot.eps", scatter_plot, width = 10, height = 3)

# preprocessing data
# Keep only the rows that have no missing value in any column.
data <- data[complete.cases(data), ]

# Columns 6 to 14 of the raw data; column 6 is the PM2.5 column used as the
# y-axis of the scatter plot.
data_output <- data[, 6:14]

head(data_output)

# jitter every exported column a little bit to break the ties
# NOTE(review): this perturbs all nine columns, including PM2.5 in column 1,
# not only the covariates -- confirm that this is intended.
# Columns are processed in order with one rnorm() draw per column, so the
# random numbers are consumed exactly as in a column-by-column loop.
data_output[] <- lapply(data_output, function(column) {
    column + rnorm(length(column)) * 10^(-8)
})
# Plain text output without row names or a header line.
write.table(data_output, "data/data.txt", row.names = FALSE, col.names = FALSE)

## a temp column
#data2 <- data[ , c(1)]
#
## station dummy variables
#station_name <- unique(data$station)[-1]
#for(the_name in station_name){
#    data2 <- cbind(data2, as.numeric(data[, 18] == the_name))
#}
#
## remove the temp column
#data2 <- data2 [, - c(1)]
#
#ncol(data2)
#
## month dummy variables
#month_name <- unique(data$month)[-1]
#for(the_name in month_name){
#    data2 <- cbind(data2, as.numeric(data[, 3] == the_name))
#}
#
#ncol(data2)
#
## hour dummy variables
#hour_name <- unique(data$hour)[-1]
#for(the_name in hour_name){
#    data2 <- cbind(data2, as.numeric(data[, 5] == the_name))
#}
#
#ncol(data2)
#
## wind dummy variables
#wd_name <- unique(data$wd)[-1]
#for(the_name in wd_name){
#    data2 <- cbind(data2, as.numeric(data[,16] == the_name))
#}
#
#ncol(data2)
#
## jitter the covariates a little bit to break the ties
#for(i in 1:(ncol(data2)) ){
#    data2[,i] <- data2[,i] + rnorm(nrow(data2)) * 10^(-8)
#}
#
#dim(data2)
## add response
#data2 <- cbind(data2, data[, 6] )
## write data
#write.table(data2, "data/data.txt", row.names = F, col.names = F)
