%matplotlib inline
from IPython.display import (display, Markdown, Image, 
                             clear_output, Latex, Math)
import pandas as pd
import re, os, sys, shelve, time, dill, io
from pickle import PicklingError
from dill import Pickler, Unpickler
shelve.Pickler = Pickler
shelve.Unpickler = Unpickler
import numpy as np
import matplotlib.pyplot as plt
from plotnine import *
import plotnine.options as p9opts
#figure_size = (6.4, 4.8)
p9opts.figure_size = (2.5, 1.2)
#p9opts.dpi = 84
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the imputed sensor readings and show a three-bullet Markdown summary:
# covered date range, number of rows, and mean / standard deviation of IAQ.
airdata = pd.read_pickle("data/airdata/air-imputated.pickle.gz")

stamp_format = '%Y-%m-%d %H:%M:%S'
first_reading = airdata["datetime"].min().strftime(stamp_format)
last_reading = airdata["datetime"].max().strftime(stamp_format)
display(Markdown(f"* Rango de fechas obtenidas: {first_reading} al {last_reading}"))

row_count = len(airdata)
display(Markdown(f"* Número de registros: {row_count:3,}"))

iaq_values = airdata["IAQ"]
display(Markdown(f"* Promedio de IAQ: {iaq_values.mean():.2f} desviación estándar: {iaq_values.std():.2f}"))


# Temperature over time, coloured by pressure, drawn from a reproducible
# 30 % sample of the readings so the jittered scatter stays light.
temperature_plot = ggplot(
    airdata.sample(frac=0.3, random_state=175904),
    aes(x="datetime", y="temperature", color="pressure"),
)
temperature_plot = temperature_plot + geom_jitter(alpha=0.05)
temperature_plot = temperature_plot + theme(
    axis_text_x=element_text(angle=45),
    plot_title=element_text(size=11),
)
temperature_plot = temperature_plot + labs(
    x="Fecha",
    y="Temp (C)",
    color="Presión (hPa)",
    title="Gráfica de Temperatura y\nPresión a lo Largo del Tiempo.",
)
# Bind the drawn figure to `_` so the notebook does not echo its repr.
_ = temperature_plot.draw()


#_ = (
#    ggplot(airdata.sample(frac=0.3, random_state=175904), 
#           aes(x = "datetime", y = "IAQ", color="iaqAccuracy")) +
#    geom_jitter(alpha=0.05, size=1.25) +
#    theme(axis_text_x=element_text(angle=45)) +
#    labs(x="Fecha", color="Precisión del Sensor",
#         title="Gráfica de IAQ y Precisión\ndel Sensor a lo Largo del Tiempo")
#).draw()


# Zoom into a 25-minute window of humidity readings to illustrate which points
# were imputed.
# set_index() returns a new frame, so the shared `airdata` object keeps its own
# index; assigning to `.index` through an alias would re-index `airdata` itself.
imputed_window = (
    airdata.set_index("datetime", drop=False)
    .loc["2021-02-18 23:10":"2021-02-18 23:35"]
    .copy()
)
#display(imputed_window)
_ = (
    ggplot(imputed_window) +
    geom_point(aes(x="datetime", y="humidity", color="imputated")) +
    # One tick every five minutes, labelled with the time of day only.
    scale_x_datetime(date_breaks='5 minute', date_labels="%H:%M:%S") +
    theme(axis_text_x=element_text(angle=45),
          plot_title=element_text(size=11)) +
    labs(title="Gráfica Ejemplo de Datos Imputados.")
).draw()


# --- Environment detection: Google Colab vs. local checkout -----------------
# On Colab, mount Google Drive; anywhere else the import fails and the local
# paths chosen below are used instead.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    from google.colab import files
except ImportError:
    # Not running inside Colab: nothing to mount.
    pass
except Exception as exc:
    # Mounting is best-effort; report the problem instead of hiding it.
    print(f"Google Drive could not be mounted: {exc!r}", file=sys.stderr)

base_url = ""
# File loaders: decide where the SINAICA pickle (and later the models) live.
try:
    base_url = "drive/MyDrive/Colab Notebooks/Final"
    uploaded = os.path.join(base_url, "data/sinaica-imputated.pickle.gz")
    if not os.path.isfile(uploaded):
        from google.colab import files
        # files.upload() returns a dict keyed by file name, not a path.
        # NOTE(review): Colab is expected to also save each upload into the
        # working directory, so the first key is used as the path - confirm.
        received = files.upload()
        if not received:
            raise FileNotFoundError("no file was uploaded")
        uploaded = next(iter(received))
except Exception as exc:
    # ImportError means "not on Colab"; anything else is a failed upload.
    # Either way fall back to the local layout.
    if not isinstance(exc, ImportError):
        print(f"Falling back to local data files: {exc!r}", file=sys.stderr)
    base_url = ""
    uploaded = "data/sinaica/sinaica-imputated.pickle.gz"

def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    """
    Draw a DataFrame as a styled matplotlib table and show the figure.

    Taken from https://stackoverflow.com/a/39358722/7323086

    Inputs:
    data: table to render; its `values`, `columns` and `shape` are read.
    col_width, row_height: size given to each column / row when the figure is
        created here (unused when `ax` is supplied).
    font_size: font size of body cells; header cells use 1.05 times this.
    header_color: face colour of header cells.
    row_colors: face colours cycled by row number for body cells (only read).
    edge_color: edge colour of every cell.
    bbox: forwarded to `ax.table`.
    header_columns: number of leading columns styled like the header row.
    ax: axes to draw on; when None a new figure is created with the axis
        hidden.
    **kwargs: forwarded to `ax.table`.

    Returns None; the figure is displayed with `plt.show()`.
    """
    if ax is None:
        # One extra row for the column labels.
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    # get_celld() is the public accessor for the {(row, col): cell} mapping;
    # plain dict iteration replaces the `six` helper, which was never imported.
    for (row, col), cell in mpl_table.get_celld().items():
        cell.set_edgecolor(edge_color)
        if row == 0 or col < header_columns:
            cell.set_text_props(weight='bold', color='w', size=font_size*1.05)
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[row % len(row_colors)])
    plt.show()

#df.dropna(inplace=True)
# Clear whatever this cell has displayed so far.
clear_output()

def performance_plot(history, a=None, b=None,
                    metrics=["accuracy", "val_accuracy"],
                    plot_validation=True,
                    title="Gráficas de Desempeño."):
    """
    Plot the loss and every requested metric of one training history.

    The figure has two columns: the first panel is the loss, followed by one
    panel per entry of `metrics`. Only epochs a..b are drawn.

    Inputs:
    history: dict of per-epoch value lists; must contain "loss" and every name
        in `metrics`, plus "val_loss" and "val_<metric>" when
        `plot_validation` is true.
    a: first epoch to plot (defaults to 0).
    b: end of the epoch range, exclusive (defaults to len(history["loss"])).
        If a > b the two bounds are swapped.
    metrics: names of the metrics to plot, one panel each.
        NOTE(review): with `plot_validation=True` the default entry
        "val_accuracy" makes the code read history["val_val_accuracy"];
        callers probably want to pass base metric names only - confirm.
    plot_validation: also draw the "val_" series on every panel.
    title: figure title.

    Returns None; the figure is displayed with `plt.show()`.
    """
    if a is None:
        a = 0
    if b is None:
        b = len(history['loss'])
    # Order both bounds in a single step so that neither is overwritten
    # before the other one has been computed.
    a, b = min(a, b), max(a, b)

    # Loss panel + one panel per metric on a two-column grid, rounded up
    # (integer arithmetic avoids round-half-to-even dropping a row).
    imgrows = (len(metrics) + 2) // 2

    # Tick labels shared by all panels: the start epoch and the 20/40/60/80 %
    # quantiles of range(a, b), each shifted by one, plus b-1. Positions are
    # the labels minus `a` because the plotted slices start at index 0.
    quantiles = np.quantile(range(a, b),
                            [.2, .4, .6, .8]).round(0).astype(int)
    quantiles = np.insert(quantiles, 0, [a])
    quantiles += 1
    quantiles = np.append(quantiles, [b-1])

    # Plot loss
    plt.figure(figsize=(14, 5 * imgrows))
    plt.suptitle(title)
    plt.subplot(imgrows, 2, 1)
    plt.title('Loss')
    plt.plot(history['loss'][a:b], label='Training', linewidth=2)
    if plot_validation:
        plt.plot(history['val_loss'][a:b], label='Validation', linewidth=2)
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.xticks(ticks=quantiles-a, labels=quantiles)
    plt.grid(True)

    # Plot each metric; panel 1 is the loss, so metrics start at panel 2.
    for panel, metric in enumerate(metrics, start=2):
        plt.subplot(imgrows, 2, panel)
        plt.title(metric)
        plt.plot(history[metric][a:b], label='Training', linewidth=2)
        if plot_validation:
            plt.plot(history["val_" + metric][a:b],
                     label='Validation', linewidth=2)
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.xticks(ticks=quantiles-a, labels=quantiles)
        plt.grid(True)

    plt.show()

#render_mpl_table(df.head().applymap(shorten), col_width=5)

# Load both imputed data sets. `uploaded` and `base_url` are assigned by the
# environment-detection code earlier in the file.
sinaica = pd.read_pickle(uploaded)
airdata = pd.read_pickle(os.path.join(base_url, "data/airdata/air-imputated.pickle.gz"))
#sinaica.head()


models = []
object_names = []
models_path = os.path.join(base_url, "models-sinaica")

# Publish every "*dill" file found in models_path as a global variable named
# after the file ("." -> "_", "_dill" removed) and remember those names.
# SECURITY: dill.load can run arbitrary code while unpickling; only point
# models_path at files you trust.
for y in [x for x in os.listdir(models_path) if x.endswith("dill")]:
  model_path = os.path.join(models_path, y)
  with io.open(model_path, 'rb') as file:
      object_name = re.sub(r"\.", "_", y)
      object_name = re.sub(r"_dill", "", object_name)
      globals()[object_name] = dill.load(file)
      object_names.append(object_name)
#display(Markdown("Objetos cargados: \n\n>" + 
#         ", ".join(object_names)))

# One row per "<model>_time" object: the model name and its stored time.
model_times = [o for o in object_names if o.endswith("_time")]
perf_table = pd.DataFrame({
  "Modelo": [re.sub("_time$", "", model_time) for model_time in model_times],
  "Tiempo": [globals()[model_time] for model_time in model_times]
})
# Read the parameter counts explicitly before they are used, inside a `with`
# block so the file handle is closed deterministically.
model_path = os.path.join(models_path, "model_n_params.dill")
with io.open(model_path, 'rb') as file:
    model_n_params = dill.load(file)
df_n_params = pd.DataFrame(data={
    "Modelo": [x for x in model_n_params.keys()],
    "# Params": [x for x in model_n_params.values()]
})
perf_table = perf_table.merge(df_n_params, on="Modelo")
model_histories = [o 
                   for o in object_names 
                   if o.endswith("_hist")
                  ]
# Metric names are the keys of the first history that are neither validation
# series ("val_...") nor the loss.
model_metrics = [k 
                 for k in globals()[model_histories[0]].keys()
                 if re.search("^(val_|loss)", k) is None
                ]

# Add the mean over all epochs of each metric: validation columns first,
# training columns second.
# NOTE(review): the values are listed in model_histories order while the rows
# come from model_times (after the merge) - confirm both line up one-to-one.
for metric in model_metrics:
  perf_table["val_" + metric] = [np.mean(globals()[o]["val_" + metric]) for o in model_histories]
for metric in model_metrics:
  perf_table[metric] = [np.mean(globals()[o][metric]) for o in model_histories]
perf_table.rename({
  "mean_squared_logarithmic_error": "msle",
  "val_mean_squared_logarithmic_error": "val_msle"
}, axis=1, inplace=True)
perf_table.drop(["loss", "val_loss"], axis=1, inplace=True, errors='ignore')
perf_table.sort_values("val_mse", inplace=True)
# Render "Tiempo" as "<minutes>m<seconds>s" (presumably the stored value is
# in seconds - verify against the training code).
perf_table["Tiempo"] = (perf_table["Tiempo"] // 60).astype('int').astype("str") + "m" + \
(perf_table["Tiempo"] % 60).round(3).apply(lambda x: f"{x:2.2f}") + "s"
perf_table.reset_index(inplace=True, drop=True)
#perf_table.round(4)
excluded_columns = ["iaqAccuracy", "datetime", "datetime-1", "delta", 
                    "imputated", "year"]
# Unshuffled 80/20 split: the first 80 % of the rows form the training set.
train, test = train_test_split(sinaica[[x 
                                        for x in sinaica.columns 
                                        if x not in excluded_columns]], 
                               train_size=0.8, random_state=175904, shuffle=False)
# Fit a min-max scaler on the training IAQ column and push every float64
# column of the table through its inverse transform.
# NOTE(review): this maps error metrics (including squared ones) as if they
# were scaled IAQ values - confirm that is the intended conversion.
scaler_iaq = MinMaxScaler().fit(train[["IAQ"]])
perf_data_iaq = scaler_iaq.inverse_transform(perf_table.select_dtypes("float64"))
#scaler_gr = MinMaxScaler().fit(train[["gasResistance"]])
#perf_data_gr = scaler_gr.inverse_transform(perf_table.select_dtypes("float64"))
perf_data_iaq = pd.DataFrame(perf_data_iaq, 
                             columns=perf_table.select_dtypes("float64").columns)
#perf_data_gr  = pd.DataFrame(perf_data_gr, 
#                             columns=perf_table.select_dtypes("float64").columns)
# Re-attach the non-float columns: "Modelo" and "Tiempo" in front,
# "# Params" at the end.
perf_data_iaq.insert(0, "Tiempo", perf_table["Tiempo"], )
#perf_data_gr.insert(0,  "Tiempo", perf_table["Tiempo"], )
perf_data_iaq.insert(0, "Modelo", perf_table["Modelo"], )
#perf_data_gr.insert(0,  "Modelo", perf_table["Modelo"], )
perf_data_iaq["# Params"] = perf_table["# Params"]
#perf_data_gr["# Params"]  = perf_table["# Params"]
perf_data = perf_table.copy()
perf_data.sort_values("val_mse", inplace=True)
perf_data.reset_index(inplace=True, drop=True)
# Models whose name contains an odd number are treated as IAQ models.
model_number_rows = [int(re.sub("[^0-9]", "", x)) for x in perf_table["Modelo"]]
#is_gr_row = [(x % 2) == 0 for x in model_number_rows]
is_iaq_row = [(x % 2) == 1 for x in model_number_rows]
#perf_data.iloc[is_gr_row] = perf_data_gr
# NOTE(review): perf_data_iaq has one row per model and "# Params" as its
# last column, whereas the target is a row subset of perf_data with
# "# Params" third - confirm this assignment lines up as intended.
perf_data.iloc[is_iaq_row] = perf_data_iaq
cols = ["Modelo", "Tiempo", "# Params", "val_mae", "mae"]
Markdown(perf_data.round(2)[cols].head(5).to_markdown())


models = []
object_names = []
models_path = os.path.join(base_url, "models")

# Publish every "*dill" file found in models_path as a global variable named
# after the file ("." -> "_", "_dill" removed) and remember those names.
# SECURITY: dill.load can run arbitrary code while unpickling; only point
# models_path at files you trust.
for y in [x for x in os.listdir(models_path) if x.endswith("dill")]:
  model_path = os.path.join(models_path, y)
  with io.open(model_path, 'rb') as file:
      object_name = re.sub(r"\.", "_", y)
      object_name = re.sub(r"_dill", "", object_name)
      globals()[object_name] = dill.load(file)
      object_names.append(object_name)
#display(Markdown("Objetos cargados: \n\n>" + 
#         ", ".join(object_names)))

# One row per "<model>_time" object: the model name and its stored time.
model_times = [o for o in object_names if o.endswith("_time")]
perf_table = pd.DataFrame({
  "Modelo": [re.sub("_time$", "", model_time) for model_time in model_times],
  "Tiempo": [globals()[model_time] for model_time in model_times]
})
# Read the parameter counts inside a `with` block so the file handle is
# closed deterministically.
model_path = os.path.join(models_path, "model_n_params.dill")
with io.open(model_path, 'rb') as file:
    model_n_params = dill.load(file)
#pd.DataFrame(model_n_params, columns=["# Parametros"])
df_n_params = pd.DataFrame(data={
    "Modelo": [x for x in model_n_params.keys()],
    "# Params": [x for x in model_n_params.values()]
})
perf_table = perf_table.merge(df_n_params, on="Modelo")
#df_n_params = perf_table.pop("# Params")
#perf_table.insert(2, "# Params", df_n_params)
model_histories = [o 
                   for o in object_names 
                   if o.endswith("_hist")
                  ]
# Metric names are the keys of the first history that are neither validation
# series ("val_...") nor the loss.
model_metrics = [k 
                 for k in globals()[model_histories[0]].keys()
                 if re.search("^(val_|loss)", k) is None
                ]

# Add the mean over all epochs of each metric: validation columns first,
# training columns second.
# NOTE(review): the values are listed in model_histories order while the rows
# come from model_times (after the merge) - confirm both line up one-to-one.
for metric in model_metrics:
  perf_table["val_" + metric] = [np.mean(globals()[o]["val_" + metric]) for o in model_histories]
for metric in model_metrics:
  perf_table[metric] = [np.mean(globals()[o][metric]) for o in model_histories]
perf_table.rename({
  "mean_squared_logarithmic_error": "msle",
  "val_mean_squared_logarithmic_error": "val_msle"
}, axis=1, inplace=True)
perf_table.drop(["loss", "val_loss"], axis=1, inplace=True, errors='ignore')
perf_table.sort_values("val_mse", inplace=True)
# Render "Tiempo" as "<minutes>m<seconds>s" (presumably the stored value is
# in seconds - verify against the training code).
perf_table["Tiempo"] = (perf_table["Tiempo"] // 60).astype('int').astype("str") + "m" + \
(perf_table["Tiempo"] % 60).round(3).apply(lambda x: f"{x:2.2f}") + "s"
perf_table.reset_index(inplace=True, drop=True)
#perf_table.round(4)
excluded_columns = ["iaqAccuracy", "datetime", "datetime-1", "delta", 
                    "imputated", "year"]
# Unshuffled 80/20 split: the first 80 % of the rows form the training set.
train, test = train_test_split(airdata[[x 
                                        for x in airdata.columns 
                                        if x not in excluded_columns]], 
                               train_size=0.8, random_state=175904, shuffle=False)
# Fit a min-max scaler on the training IAQ column and push every float64
# column of the table through its inverse transform.
# NOTE(review): this maps error metrics (including squared ones) as if they
# were scaled IAQ values - confirm that is the intended conversion.
scaler_iaq = MinMaxScaler().fit(train[["IAQ"]])
perf_data_iaq = scaler_iaq.inverse_transform(perf_table.select_dtypes("float64"))
#scaler_gr = MinMaxScaler().fit(train[["gasResistance"]])
#perf_data_gr = scaler_gr.inverse_transform(perf_table.select_dtypes("float64"))
perf_data_iaq = pd.DataFrame(perf_data_iaq, 
                             columns=perf_table.select_dtypes("float64").columns)
#perf_data_gr  = pd.DataFrame(perf_data_gr, 
#                             columns=perf_table.select_dtypes("float64").columns)
# Re-attach the non-float columns: "Modelo" and "Tiempo" in front,
# "# Params" at the end.
perf_data_iaq.insert(0, "Tiempo", perf_table["Tiempo"], )
#perf_data_gr.insert(0,  "Tiempo", perf_table["Tiempo"], )
perf_data_iaq.insert(0, "Modelo", perf_table["Modelo"], )
perf_data_iaq["# Params"] = perf_table["# Params"]
#perf_data_gr["# Params"]  = perf_table["# Params"]
#perf_data_gr.insert(0,  "Modelo", perf_table["Modelo"], )
perf_data2 = perf_table.copy()
perf_data2.sort_values("val_mse", inplace=True)
perf_data2.reset_index(inplace=True, drop=True)
# Models whose name contains an odd number are treated as IAQ models.
model_number_rows = [int(re.sub("[^0-9]", "", x)) for x in perf_table["Modelo"]]
#is_gr_row = [(x % 2) == 0 for x in model_number_rows]
is_iaq_row = [(x % 2) == 1 for x in model_number_rows]
#perf_data.iloc[is_gr_row] = perf_data_gr
# NOTE(review): perf_data_iaq has one row per model and "# Params" as its
# last column, whereas the target is a row subset of perf_data2 with
# "# Params" third - confirm this assignment lines up as intended.
perf_data2.iloc[is_iaq_row] = perf_data_iaq
cols = ["Modelo", "Tiempo", "# Params", "val_mae", "mae"]
# Keep only the IAQ rows for the report and for the plots further down.
perf_data2 = perf_data2.iloc[is_iaq_row]
Markdown(perf_data2.round(2)[cols].head(5).to_markdown())


# Tag each table with its data source.
perf_data["src"]  = "Sensor-Gov"

# Freeze the current row order of the sensor table into an ordered
# categorical so the bars are drawn in table order.
Modelos_list = perf_data2["Modelo"].tolist()
Modelos = pd.Categorical(perf_data2["Modelo"], categories=Modelos_list)
perf_data2["Modelo2"] = Modelos
perf_data2["src"] = "Sensor"

# Bar chart of validation MAE for the first five models of the sensor table.
comparison_plot = ggplot(
    perf_data2.head(5),
    aes(x="Modelo2", y="val_mae", fill="Modelo2"),
)
comparison_plot = comparison_plot + geom_bar(stat="identity")
comparison_plot = comparison_plot + labs(
    y="Error Medio\nAbsoluto",
    x="Modelos",
    title="Gráfica Comparativa entre los Modelos\ncon Datos del Sensor",
)
comparison_plot = comparison_plot + theme(
    legend_position="none",
    axis_text_x=element_text(rotation=90),
    axis_text_y=element_text(size=8),
    plot_title=element_text(size=11),
)
# Bind the drawn figure to `_` so the notebook does not echo its repr.
_ = comparison_plot.draw()


models = []
object_names = []
# NOTE(review): this loader reads the relative "models" directory and does
# not prepend base_url - confirm that is intended when running on Colab.
models_path = "models"

# Publish every "*dill" file as a global variable named after the file
# ("." -> "_", "_dill" removed) and remember those names.
# SECURITY: dill.load can run arbitrary code while unpickling; only load
# files you trust.
for y in [x for x in os.listdir(models_path) if x.endswith("dill")]:
    with io.open(os.path.join(models_path, y), 'rb') as file:
        object_name = re.sub(r"\.", "_", y)
        object_name = re.sub(r"_dill", "", object_name)
        globals()[object_name] = dill.load(file)
        object_names.append(object_name)

# Parameter counts; the `with` block closes the file handle deterministically.
model_path = os.path.join(models_path, "model_n_params.dill")
with io.open(model_path, 'rb') as file:
    model_n_params = dill.load(file)

#Markdown("Objetos cargados: \n\n>" + 
#         ", ".join(object_names))
def models_plot(histories, a=None, b=None,
                    metric='loss',
                    plot_validation=True):
    """
    Plot one metric for several training histories, one panel per model.

    Panels are laid out on a two-column grid and ordered by the mean of the
    metric over all epochs, smallest first (the validation series is used for
    the ordering when `plot_validation` is true). Each panel also shows that
    mean as a dashed horizontal line.

    Inputs:
    histories: list of names (strings) of global variables, each holding a
        dict of per-epoch value lists, e.g. ["model00_hist", "model01_hist"].
    a: first epoch to plot (defaults to 0).
    b: end of the epoch range, exclusive (defaults to the longest history).
        If a > b the two bounds are swapped.
    metric: key to plot; "val_<metric>" is read as well when
        `plot_validation` is true.
    plot_validation: draw and rank by the validation series.

    Returns None; the figure is displayed with `plt.show()`.
    """
    # Resolve every name once; a dict keeps the name -> history pairing.
    runs = {name: globals()[name] for name in histories}
    max_epochs = max(len(run[metric]) for run in runs.values())
    if a is None:
        a = 0
    if b is None:
        b = max_epochs
    # Order both bounds in a single step so that neither is overwritten
    # before the other one has been computed.
    a, b = min(a, b), max(a, b)

    # Two panels per row, rounded up.
    imgrows = (len(histories) + 1) // 2

    # Tick labels shared by all panels: the start epoch and the 20/40/60/80 %
    # quantiles of range(a, b), each shifted by one, plus b-1. Positions are
    # the labels minus `a` because the plotted slices start at index 0.
    quantiles = np.quantile(range(a, b),
                            [.2, .4, .6, .8]).round(0).astype(int)
    quantiles = np.insert(quantiles, 0, [a])
    quantiles += 1
    quantiles = np.append(quantiles, [b-1])

    # Rank the models by the mean of the series of interest.
    ranking_key = "val_" + metric if plot_validation else metric
    model_order = {name: np.mean(run[ranking_key]) for name, run in runs.items()}
    ranked_names = sorted(model_order, key=model_order.get)
    mean_label = "Mean Validation" if plot_validation else "Mean Training"

    # init the plot
    plt.figure(figsize=(14., 4.*imgrows), frameon=False, tight_layout=True)
    plt.suptitle("Gráfica de Desempeño de los Mejores Modelos con Datos del Sensor",
                 va='top', weight='bold', size='x-large')
    # NOTE(review): this acts on the current axes before any subplot has been
    # created - confirm it has the intended effect on the panels below.
    plt.autoscale(tight=True)
    # create plot for every model, best mean first
    for position, model_name in enumerate(ranked_names, start=1):
        model = runs[model_name]

        plt.subplot(imgrows, 2, position)
        plt.title(model_name)
        plt.hlines(y=model_order[model_name], xmin=0, linestyles="dashed",
                   xmax=b-a, label=mean_label, color="green")
        plt.plot(model[metric][a:b], label='Training', linewidth=2)
        if plot_validation:
            plt.plot(model["val_" + metric][a:b],
                     label='Validation', linewidth=2)
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.xticks(ticks=quantiles-a, labels=quantiles)
        plt.grid(True)

    plt.show()
  
# Names of every loaded "<model>_hist" object.
model_histories = [o
                   for o in object_names
                   if o.endswith("_hist")
                  ]
# Metric names of the first history, leaving out validation series, the loss
# and the MSLE metric. The pattern spells the key
# "mean_squared_logarithmic_error", the same name the performance tables
# rename to "msle", so the exclusion actually matches.
model_metrics = [k
                 for k in globals()[model_histories[0]].keys()
                 if re.search("^(val_|loss|mean_squared_logarithmic_error)", k) is None
                ]
# Plot the loss curves of the first four models of the sensor table.
top_models = perf_data2.head(4)["Modelo"]
model_histories = [m + "_hist" for m in top_models]
models_plot(model_histories, metric='loss')

	Modelo	Tiempo	# Params	val_mae	mae
0	model_best01b	9m34.02s	169,795	80.26	61.04
1	model_lstm01	2m24.19s	54,273	83.18	52.96
2	model_lstm03	6m13.69s	185,345	81.68	55.41
3	model_best01a	15m37.22s	575,745	83.38	53.35
4	model_conv01	2m53.16s	116,225	90.47	61.12

	Modelo	Tiempo	# Params	val_mae	mae
1	model_dnn01	1m40.90s	4,609	74.06	61.15
2	model_best03a	14m0.58s	485,633	75.94	55.8
8	model_conv01	14m3.57s	294,401	127.78	5.79
9	model_conv03	6m33.58s	419,841	129.51	6.13
12	model_best03b	8m50.63s	162,115	164.43	9.28

Resumen

Introducción

Fuentes de Datos y Variables

Problemáticas

Análisis Exploratorio Inicial

Modelos

Trabajos relacionados

Solución

Preprocesamiento

Datos del Gobierno y del Sensor

Modelo Propuesto

Resultados

Curvas de Desempeño

Conclusiones

Bibliografía