Para imputar vamos a hacer una prueba con Stan. Que técnicamente no pudimos ejecutar en Python, por problemas con el compilador de C++.
En el caso de RStan, utilizamos una imagen de docker en donde sí se ejecutó sin problemas.
Leemos los datos de Python, para lo cual utilizamos Parquet del paquete Apache Arrow:
camarones_train <- read_parquet("data/camarones_train.parquet")
camarones_train <- camarones_train[
complete.cases(
camarones_train %>%
select(matches("_PM10"))
), ]
camarones_test <- read_parquet("data/camarones_test.parquet")
print("Tienen las mismas dimensiones:")
## [1] "Tienen las mismas dimensiones:"
print(paste("Registros en el set de entrenamiento: ",
nrow(camarones_train)))
## [1] "Registros en el set de entrenamiento: 882"
print(paste("Registros en el set de pruebas: ",
nrow(camarones_test)))
## [1] "Registros en el set de pruebas: 468"
ggplot(camarones_test, aes(sample = Camarones_PM10)) +
geom_qq(distribution = stats::qpois,
dparams = list(
lambda = camarones_test %>% pull(Camarones_PM10) %>% median),
na.rm = T) +
ggplot(camarones_test, aes(x = Camarones_PM10)) +
geom_histogram(binwidth = 10)
\[ Y \sim Poi(\mu)\\ \mu = exp[\alpha + \beta X]\\ \ell = (y | \mu) \\ \alpha \sim Normal(a, b) \\ \beta \sim Normal(a, b) \]
X <- camarones_train %>%
select(matches("(Merced|Tlalnepantla)_PM10")) %>%
as.matrix
X_test <- camarones_test %>%
select(matches("(Merced|Tlalnepantla)_PM10")) %>%
as.matrix
y <- camarones_train %>% pull("Camarones_PM10")
y_test <- camarones_test %>% pull("Camarones_PM10")
y <- y[complete.cases(X)]
y_test <- y_test[complete.cases(X_test)]
X <- X[complete.cases(X),]
X_test <- X_test[complete.cases(X_test),]
datos <- list(a = 0,
b = 100,
n = nrow(X),
p = ncol(X),
x = X,
y = y)
fit <- stan(file = 'imputaciones/normal-pm10.stan', data = datos,
iter = 3000, warmup = 500, chains = 4, init = 0)
yhat_names <- names(fit)
yhat <- fit %>%
get_posterior_mean %>%
as_tibble %>%
add_column(name = yhat_names,
.before = "mean-chain:1")
yhat
yhat <- yhat %>%
select("name", "mean-all chains") %>%
filter(str_detect(name, "^yhat"))
yhat %>%
head(5)
yhat %>%
# pull("mean-all chains") %>%
write_parquet("data/camarones_train_yhat-stan.parquet")
Código de Stan:
cat imputaciones/normal-pm10.stan
## //
## // This Stan program defines a simple model, with a
## // vector of values 'y' modeled as normally distributed
## // with mean 'mu' and standard deviation 'sigma'.
## //
## // Learn more about model development with Stan at:
## //
## // http://mc-stan.org/users/interfaces/rstan.html
## // https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
## //
##
## data {
## int<lower=0> n; // number of rows
## int<lower=0> p; // number of vars
## int a;
## int b;
## matrix[n, p] x; // prepare the matrix whose first column is all 1s
## int<lower=0> y[n]; // cannot use vector because Y must be integers
## }
##
## parameters {
## real alpha;
## vector[p] beta;
## }
##
## transformed parameters {
## vector[n] mu;
## mu = alpha + multiply(x, beta); // matrix multiplication
## }
##
## model {
## y ~ poisson_log(mu);
## beta ~ normal(a, b);
## }
##
## generated quantities {
## int<lower=0> yhat[n];
## vector[n] log_lik;
## for (i in 1:n) {
## yhat[i] = poisson_log_rng(mu[i]);
## log_lik[i] = poisson_log_lpmf(y[i]|mu[i]);
## }
## }
fitted_params <- fit %>%
rstan::extract()
# Function for simulating y based on new x
gen_quant_r <- function(x, fitted_params) {
linear_combination <-
sample(fitted_params$alpha, size = length(x)) %>%
matrix(., ncol = 2) +
sample(fitted_params$beta, size = length(x)) %>%
matrix(., ncol = 2) * x
probs <- 1/(1 + exp(-linear_combination))
out <- rnbinom(n = nrow(x), prob = probs, size = max(y))
return(out)
}
set.seed(175904)
y_pred_r <- gen_quant_r(X_test, fitted_params)
y_pred_r %>% length
## [1] 414
mean(y_pred_r == y_test)
## [1] 0.002415459
(sum((y_pred_r - y_test)^2))/length(y_test)
## [1] 3109.995
tibble(`mean-all chains` = y_pred_r) %>%
write_parquet("data/camarones_test_yhat-stan.parquet")
yhat %>%
as.tibble(`mean-all chains` = yhat)
## Warning: `as.tibble()` is deprecated as of tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
yhat %>% ggplot() +
geom_point(aes(x=1:nrow(yhat),
y=y,
colour="y ground truth"), alpha=0.3) +
geom_point(aes(x=1:nrow(yhat),
y=yhat %>%
pull("mean-all chains"),
colour="yhat"), alpha=0.3) +
labs(x = "Observaciones", y = "PM10 (ppm)")
tibble(x = 1:length(y_test),
y_test = y_test,
y_test_hat = y_pred_r) %>%
ggplot(.) +
geom_point(aes(x=x,
y=y_test,
colour="y test ground truth"), alpha=0.3) +
geom_point(aes(x=x,
y=y_test_hat,
colour="y test hat "), alpha=0.3) +
labs(x = "Observaciones", y = "PM10 (ppm)")