To impute the data we ran a test with Stan, which we could not execute in Python because of problems with the C++ compiler.

In the case of RStan, we used a Docker image in which it ran without problems.
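
The code below assumes the following packages are loaded; the setup chunk is not shown here, so this list is inferred from the functions used:

library(arrow)      # read_parquet(), write_parquet()
library(rstan)      # stan(), get_posterior_mean(), extract()
library(tidyverse)  # dplyr, ggplot2, stringr, and tibble helpers used throughout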

Imputing the PM10 Data

We read the data exported from Python, using Parquet files via the Apache Arrow package:

camarones_train <- read_parquet("data/camarones_train.parquet")
camarones_train <- camarones_train[
  complete.cases(
    camarones_train %>% 
      select(matches("_PM10"))
    ), ]
camarones_test <- read_parquet("data/camarones_test.parquet")
print("Rows per data set:")
## [1] "Rows per data set:"
print(paste("Rows in the training set: ",
            nrow(camarones_train)))
## [1] "Rows in the training set:  882"
print(paste("Rows in the test set: ",
            nrow(camarones_test)))
## [1] "Rows in the test set:  468"
ggplot(camarones_test, aes(sample = Camarones_PM10)) +
  geom_qq(distribution = stats::qpois,
          dparams = list(
            lambda = camarones_test %>% pull(Camarones_PM10) %>% median(na.rm = TRUE)),
          na.rm = TRUE)
ggplot(camarones_test, aes(x = Camarones_PM10)) +
  geom_histogram(binwidth = 10)

Poisson Model with Normal Priors

\[
\begin{aligned}
Y &\sim \mathrm{Poisson}(\mu) \\
\mu &= \exp(\alpha + X\beta) \\
\ell(\mu) &= p(y \mid \mu) \\
\alpha &\sim \mathrm{Normal}(a, b) \\
\beta &\sim \mathrm{Normal}(a, b)
\end{aligned}
\]
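
To make the log link concrete, here is a minimal sketch with purely illustrative coefficient values (the actual alpha and beta are estimated below):

alpha_ex <- 1.0
beta_ex  <- c(0.02, 0.01)
x_ex     <- c(45, 60)                            # hypothetical PM10 readings at Merced and Tlalnepantla
mu_ex    <- exp(alpha_ex + sum(beta_ex * x_ex))  # log link: mu = exp(alpha + X beta)
rpois(1, mu_ex)                                  # one draw of Y ~ Poisson(mu)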

# Predictors: PM10 at the Merced and Tlalnepantla stations; response: PM10 at Camarones
X <- camarones_train %>% 
  select(matches("(Merced|Tlalnepantla)_PM10")) %>%
  as.matrix()
X_test <- camarones_test %>% 
  select(matches("(Merced|Tlalnepantla)_PM10")) %>%
  as.matrix()
y <- camarones_train %>% pull("Camarones_PM10")
y_test <- camarones_test %>% pull("Camarones_PM10")
# Drop rows with missing predictors; filter y before overwriting X so both use the same mask
y <- y[complete.cases(X)]
y_test <- y_test[complete.cases(X_test)]
X <- X[complete.cases(X), ]
X_test <- X_test[complete.cases(X_test), ]
datos <- list(a = 0,
              b = 100,
              n = nrow(X),
              p = ncol(X),
              x = X,
              y = y)
fit <- stan(file = 'imputaciones/normal-pm10.stan', data = datos, 
            iter = 3000, warmup = 500, chains = 4, init = 0)
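# Optional convergence checks before extracting posterior means
# (standard rstan utilities; results not shown here):
print(fit, pars = c("alpha", "beta"))
rstan::check_hmc_diagnostics(fit)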
yhat_names <- names(fit)
yhat <- fit %>%
  get_posterior_mean %>%
  as_tibble %>%
  add_column(name = yhat_names,
             .before = "mean-chain:1")
yhat 
yhat <- yhat %>% 
  select("name", "mean-all chains") %>%
  filter(str_detect(name, "^yhat")) 
yhat %>%
  head(5)
yhat %>%
  # pull("mean-all chains") %>%
  write_parquet("data/camarones_train_yhat-stan.parquet")
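
The Stan program listed below also returns a log_lik vector, so the in-sample fit could be checked with PSIS-LOO; a minimal sketch, assuming the loo package is installed:

library(loo)
log_lik_draws <- extract_log_lik(fit, parameter_name = "log_lik",
                                 merge_chains = FALSE)
r_eff <- relative_eff(exp(log_lik_draws))   # relative effective sample sizes
loo(log_lik_draws, r_eff = r_eff)           # PSIS-LOO estimate of elpd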

Predictions on the Test Set

Stan code:

cat imputaciones/normal-pm10.stan
## //
## // This Stan program fits a Poisson regression with a log link:
## // the counts 'y' are modeled as Poisson with rate exp(alpha + x * beta),
## // with Normal(a, b) priors on the regression coefficients.
## //
## // Learn more about model development with Stan at:
## //
## //    http://mc-stan.org/users/interfaces/rstan.html
## //    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
## //
## 
## data {
##   int<lower=0> n;    // number of rows
##   int<lower=0> p;    // number of vars
##   int a;
##   int b;
##   matrix[n, p] x;    // prepare the matrix whose first column is all 1s
##   int<lower=0> y[n]; // cannot use vector because Y must be integers
## }
## 
## parameters {
##   real alpha;
##   vector[p] beta;
## }
## 
## transformed parameters {
##   vector[n] mu;
##   mu = alpha + multiply(x, beta);  // matrix multiplication
## }
## 
## model {
##   y ~ poisson_log(mu);     // Poisson likelihood with log link
##   alpha ~ normal(a, b);    // prior on the intercept, as in the model statement above
##   beta ~ normal(a, b);     // prior on the regression coefficients
## }
## 
## generated quantities {
##   int<lower=0> yhat[n];
##   vector[n] log_lik;
##   for (i in 1:n) {
##     yhat[i] = poisson_log_rng(mu[i]);
##     log_lik[i] = poisson_log_lpmf(y[i]|mu[i]);
##   }
## }
fitted_params <- fit %>%
  rstan::extract()
# Function for simulating y for new x: it draws alpha and beta values from the
# flattened posterior arrays, maps the linear combination through an inverse
# logit, and samples counts from a negative binomial with size = max(y).
gen_quant_r <- function(x, fitted_params) {
  linear_combination <-
    sample(fitted_params$alpha, size = length(x)) %>%
    matrix(., ncol = 2) +
    sample(fitted_params$beta, size = length(x)) %>%
    matrix(., ncol = 2) * x
  probs <- 1 / (1 + exp(-linear_combination))
  out <- rnbinom(n = nrow(x), prob = probs, size = max(y))
  return(out)
}
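# For reference only (not used for the results below): a sketch of the
# posterior-predictive draw implied by the poisson_log model, pairing each
# test row with one posterior draw of (alpha, beta); gen_quant_poisson is
# an illustrative name, not part of the original analysis.
gen_quant_poisson <- function(x, fitted_params) {
  draw <- sample.int(length(fitted_params$alpha), size = nrow(x), replace = TRUE)
  eta  <- fitted_params$alpha[draw] +
    rowSums(fitted_params$beta[draw, , drop = FALSE] * x)  # alpha + x %*% beta, row by row
  rpois(nrow(x), lambda = exp(eta))                        # log link
}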
set.seed(175904)
y_pred_r <- gen_quant_r(X_test, fitted_params)
y_pred_r %>% length
## [1] 414
mean(y_pred_r == y_test)
## [1] 0.002415459
(sum((y_pred_r - y_test)^2))/length(y_test)
## [1] 3109.995
tibble(`mean-all chains` = y_pred_r) %>%
  write_parquet("data/camarones_test_yhat-stan.parquet")
yhat %>%
  as_tibble()

Evaluation

tibble(x = 1:nrow(yhat),
       y = y,
       yhat = yhat %>% pull("mean-all chains")) %>%
  ggplot(.) +
  geom_point(aes(x = x, y = y, colour = "y ground truth"), alpha = 0.3) +
  geom_point(aes(x = x, y = yhat, colour = "yhat"), alpha = 0.3) +
  labs(x = "Observations", y = "PM10 (µg/m³)")
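
As a numeric complement to the plot, the training-set mean squared error of the posterior-mean fit can be computed the same way as the test-set error above; a quick sketch:

yhat_mean <- yhat %>% pull("mean-all chains")   # posterior-mean yhat per training row
sum((yhat_mean - y)^2) / length(y)              # training-set mean squared error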

tibble(x = 1:length(y_test),
       y_test = y_test, 
       y_test_hat = y_pred_r) %>%
  ggplot(.) +
  geom_point(aes(x=x, 
                 y=y_test, 
                 colour="y test ground truth"), alpha=0.3) +
  geom_point(aes(x=x, 
                 y=y_test_hat, 
                 colour="y test hat"), alpha=0.3) +
  labs(x = "Observations", y = "PM10 (µg/m³)")
