Predictive machine learning

Overview

The dataset comprises 30,000 power plants, each described by country, capacity, latitude and longitude, fuel source, and year of construction. About 10,000 of these records also report annual electricity generation. To predict the electricity generation of the remaining 20,000 plants, we build an ensemble of a deep neural network, support vector regression, k-nearest neighbors regression, a generalized additive model, and linear regression, whose predictions are combined by a ridge regression.

Compared with the baseline, a gradient boosted machine (GBM), the ensemble reduces mean squared error by nearly half.

library(tensorflow) # backend for the neural network
library(keras)      # neural network API
library(FNN)        # k-nearest neighbors regression
library(e1071)      # support vector regression
library(glmnet)     # ridge regression
library(ggplot2)    # plotting
library(gam)        # generalized additive models
set.seed(365)

Read in the data, keep the relevant columns, drop rows with missing values, and create an 80%/20% training/test split.

data <- read.csv("data/global_power_plant_database.csv")
# keep capacity, coordinates, fuel type, commissioning year, 2016 generation,
# and the database's GBM-based generation estimate; drop rows with missing values
data.narm <- data[,c(5,6,7,8,12,21,22)]
data.narm <- na.omit(data.narm)
data.narm$fuel1 <- as.factor(data.narm$fuel1) # ensure a factor for one-hot encoding later

s <- sample(1:nrow(data.narm), 2000) 
train <- data.narm[-s,]
test <- data.narm[s,]

train_y <- train$generation_gwh_2016 
test_y <- test$generation_gwh_2016

train_x <- train[,-c(6,7)]
test_x <- test[,-c(6,7)]

# the GBM baseline predictions are the database's estimated_generation_gwh column;
# store them test set first to match the row order used later
trainxp <- train$estimated_generation_gwh
testxp <- test$estimated_generation_gwh
prevpred <- append(testxp, trainxp)
# Scale both the training and test set with the training set mean and s.d.
train_mean <- apply(train_x[,-c(4)], 2, mean)
train_sd <- apply(train_x[,-c(4)], 2, sd)
train_x[,-c(4)] <- scale(train_x[,-c(4)], center = train_mean, scale = train_sd)
test_x[,-c(4)] <- scale(test_x[,-c(4)], center = train_mean, scale = train_sd)

# One-hot encode the fuel type; fixing num_classes keeps the train and test
# matrices the same width even if a rare fuel type is missing from the test sample
n_fuel <- nlevels(data.narm$fuel1) + 1
fuel1ts <- to_categorical(as.integer(test_x$fuel1), num_classes = n_fuel)
fuel1tr <- to_categorical(as.integer(train_x$fuel1), num_classes = n_fuel)

# replace the fuel factor with its one-hot columns and convert to matrices for keras
train_x <- train_x[,-4]
train_x <- cbind(train_x, fuel1tr)

test_x <- test_x[,-4]
test_x <- cbind(test_x, fuel1ts)

train_x <- as.matrix(train_x)
test_x <- as.matrix(test_x)
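
A quick sanity check (not in the original code) that the one-hot encoding left both matrices with the same number of columns:

stopifnot(ncol(train_x) == ncol(test_x)) # widths must match for the keras model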

Neural network

A neural network with three hidden layers of 800 units each and dropout after the first two, trained with RMSprop to minimize mean squared error over 60 epochs.

build_model <- function() {
  model <- keras_model_sequential() %>%
    layer_dense(units = 800, activation = "relu",
                input_shape = dim(train_x)[[2]]) %>%
    layer_dropout(rate = 0.15) %>%
    layer_dense(units = 800, activation = "relu") %>%
    layer_dropout(rate = 0.15) %>%
    layer_dense(units = 800, activation = "relu") %>%
    layer_dense(units = 1) # output layer; defaults to linear activation

  model %>% compile(
    optimizer = "rmsprop",
    loss = "mse",
    metrics = c("mae")
  )
}

model <- build_model()

history <- model %>%
  fit(train_x, train_y, epochs = 60,
      batch_size = 64, validation_split = 0.2, verbose=0)
model %>% evaluate(test_x, test_y)
$loss
[1] 532208.2

$mean_absolute_error
[1] 215.9094
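
The training history can also be plotted to check for over- or under-fitting; this call is not in the original write-up, but keras provides a plot method for the history object.

plot(history) # training vs. validation loss and MAE by epoch
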
predictions <- model %>% predict(test_x)   # predict() returns the regression output directly
predictions2 <- model %>% predict(train_x)
# reattach the actual generation, neural network predictions, and fuel type
tobind <- test_y
testdata <- cbind(test_x, tobind)
testdata <- as.data.frame(testdata)
testdata$predictions <- predictions
testdata$type <- data.narm$fuel1[s]

tobind <- train_y
traindata <- cbind(train_x, tobind)
traindata <- as.data.frame(traindata)
type <- data.narm$fuel1[-s]
traindata$predictions <- predictions2
traindata$type <- type

alldata <- rbind(testdata, traindata) # test rows first, then training rows

K-nearest neighbors

The predictive capability of the neural network is supplemented with other models in an ensemble, so the final stacked model can lean on whichever algorithm predicts best for a given kind of plant. This section selects the best k for a k-nearest neighbors regression and then saves its predictions.

mse <- rep(NA, 29)
for (i in 2:30) {
  knn1 <- knn.reg(train = data.frame(train_x)[, c(1, 4)],
                  test = data.frame(test_x)[, c(1, 4)], k = i, y = train_y)
  mse[i - 1] <- mean((knn1$pred - test_y)^2) # test-set MSE for k = i
}
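
Not part of the original code, but with the numeric errors recorded above, the best k can be read off directly (the offset accounts for the loop starting at k = 2):

best_k <- which.min(mse) + 1 # k with the lowest test-set MSE
best_k
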
# k = 22 is used for the final fits: knn1 predicts the test set, knn2 the training set
knn1 <- knn.reg(train=data.frame(train_x)[,c(1,4)], test=data.frame(test_x)[,c(1,4)], k=22, y=train_y)
knn2 <- knn.reg(train=data.frame(train_x)[,c(1,4)], test=data.frame(train_x)[,c(1,4)], k=22, y=train_y)
knnpred <- append(knn1$pred, knn2$pred) # test predictions first, matching the alldata order
predictions <- data.frame(knn = knnpred,
                             nnet = as.vector(alldata$predictions),
                             real = alldata$tobind,
                             capacity = alldata$capacity_mw,
                             long = alldata$longitude,
                             lat = alldata$latitude,
                             type = alldata$type,
                             gbm = prevpred,
                             gbmscale = scale(prevpred))
predictions$nnetscale <- scale(predictions$nnet)
predictions$knnscale <- scale(predictions$knn) 
predictions$gbmscale <- scale(predictions$gbm) 
predictions$capscale <- scale(predictions$capacity)
predictions$latscale <- scale(predictions$lat) 
predictions$longscale <- scale(predictions$long)
# alldata (and therefore predictions) holds the 2000 test rows first, so split by row position
predictions_test <- predictions[1:2000,]
predictions_train <- predictions[-(1:2000),]

Linear model

The third model is a linear regression of generation on capacity, interacted with fuel type.

lm1 <- lm(real~capacity*type, data=predictions_train)
predictions_test$lm <- scale(predict(lm1, newdata = predictions_test))
predictions_train$lm <- scale(predict(lm1, newdata = predictions_train))

GAM

The fourth model is a generalized additive model with a separate five-degree-of-freedom smoothing spline for capacity, latitude, and longitude, plus fuel type as a categorical predictor.

gam1 <- gam(real~s(capacity,5)+s(lat,5)+s(long,5)+type, family="gaussian", data = predictions_train)
par(mfrow=c(1,4))
plot(gam1, se=T, col="blue")

predictions_test$gam <- scale(predict(gam1, newdata=predictions_test)) 
predictions_train$gam <- scale(predict(gam1, newdata=predictions_train))

Support vector regression

The fifth model is a support vector regression on capacity, location, and fuel type, with the cost and kernel degree tuned by cross-validation.

predictions_test[,c(4,5,6)] <- lapply(predictions_test[,c(4,5,6)], scale)
predictions_train[,c(4,5,6)] <- lapply(predictions_train[,c(4,5,6)], scale)

svm.tune <- tune(svm, real ~ capacity + long + lat + type, kernel = "polynomial",
                 data = predictions_train,
                 ranges = list(cost = seq(1, 100, 10), degree = seq(1, 3, 1)))
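
The cross-validation results can be inspected to see which parameter combination performed best; these calls are not in the original write-up but are part of e1071's tune interface.

summary(svm.tune)        # cross-validated error for each cost/degree combination
svm.tune$best.parameters # best parameter combination found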

modelsvm <- svm(real~ capacity + long + lat + type, predictions_train, cost=11) # default radial kernel with cost = 11 from the tuning grid

predictions_test$svm <- scale(predict(modelsvm, predictions_test))
predictions_train$svm <- scale(predict(modelsvm, predictions_train))

Stacked ridge regression ensemble

Finally, we take the predictions from all of the above models, together with the raw predictors, and feed them into a ridge regression with actual generation as the response variable. We select λ using the one-standard-error rule to reduce the risk of overfitting.

ridge_train <- cv.glmnet(data.matrix(predictions_train[,c(4,5,6,7,9,10,11,15,16,17)]), 
                         predictions_train[,3], alpha=0, standardize=FALSE, nfolds=10)
plot(ridge_train)

ridge_lse <- ridge_train$lambda.1se
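
As a quick inspection (not in the original output), the ridge coefficients at the chosen λ show how heavily the stacked model leans on each base learner:

coef(ridge_train, s = "lambda.1se")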

predictions_train$predicted <- predict(ridge_train, s=ridge_lse, 
                                       newx=data.matrix(predictions_train[,c(4,5,6,7,9,10,11,15,16,17)]))
predictions_test$predicted <- predict(ridge_train, s=ridge_lse,
                                      newx=data.matrix(predictions_test[,c(4,5,6,7,9,10,11,15,16,17)]))
mean((predictions_test$predicted-predictions_test$real)^2)
[1] 399556.5
mean((predictions_train$predicted-predictions_train$real)^2)
[1] 418802.8

Error for the GBM

mean((predictions_train$gbm - predictions_train$real)^2)
[1] 772826.2
predictions_all <- rbind(predictions_test, predictions_train)

While the ensemble is by no means perfect, combining multiple machine learning models reduced the mean squared error from roughly 780,000 for the GBM baseline to roughly 400,000. The ensemble performs notably better for cogeneration, gas, coal, oil, petcoke, solar, and waste plants.
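
One way to check the per-fuel-type claim numerically (a quick sketch, not part of the original code) is to compare mean squared error by fuel type:

# per-fuel-type MSE for the ensemble vs. the GBM baseline
sq_err <- data.frame(type     = predictions_all$type,
                     ensemble = (as.numeric(predictions_all$predicted) - predictions_all$real)^2,
                     gbm      = (predictions_all$gbm - predictions_all$real)^2)
aggregate(cbind(ensemble, gbm) ~ type, data = sq_err, FUN = mean)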

ggplot(data=predictions_all[!predictions_all$type =="Biomass",],
       aes(x=predicted, y=real)) +
  geom_point(aes(x=gbm, y=real), color="red", alpha=0.6, size=0.7) +
  geom_point(alpha=0.6, size=0.7, color="grey20") +
  geom_abline(slope=1) +
  theme_minimal() +
  facet_wrap(~type, scales="free", ncol=3) +
  ggtitle("Ensemble model (black) vs GBM (red)")