Jorge III Altamirano Astorga, Luz Aurora Hernández Martínez, Ita-Andehui Santiago Castillejos.
Tests were performed with the 3 types of neurons.
We now proceed to evaluate them.
import dill, os, io, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotnine import *
from IPython.display import display, display_markdown, Markdown
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
models = []
object_names = []
for y in [x for x in os.listdir("models-sinaica") if x.endswith("dill")]:
    with io.open(f"models-sinaica/{y}", 'rb') as file:
        # e.g. "model_dnn01_hist.dill" -> "model_dnn01_hist"
        object_name = re.sub(r"\.", "_", y)
        object_name = re.sub(r"_dill", "", object_name)
        globals()[object_name] = dill.load(file)
        object_names.append(object_name)
models_path = "models-sinaica"
model_path = os.path.join(models_path, "model_n_params.dill")
model_n_params = dill.load(open(model_path, 'rb'))
Markdown("Loaded objects: \n\n>" +
         ", ".join(object_names))
Loaded objects:
model_best01b_hist, model_conv01_time, model_lstm03_time, model_dnn01_time, model_dnn01_hist, model_dnn03_time, model_conv03_time, model_conv03_hist, model_conv01_hist, model_lstm01_hist, model_rnn01_hist, model_lstm01_time, model_rnn03_hist, model_rnn01_time, model_rnn03_time, model_n_params, model_best01a_hist, model_best01a_time, model_best01b_time, model_baseline01_time, model_dnn03_hist, model_baseline01_hist, model_lstm03_hist
sinaica = pd.read_pickle("data/sinaica/sinaica-imputated.pickle.gz")
sinaica.head(3)
 | CO | NO | NO2 | NOx | O3 | PM10 | PM2.5 | SO2 | month | day | hour | datetime | minute | temperature | pressure | humidity | gasResistance | IAQ
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
987 | 2.2 | 0.205 | 0.031 | 0.207 | 0.002 | 45.0 | 22.0 | 0.004 | 2 | 12 | 6 | 2021-02-12 06:05:35.846304417 | 35.0 | 21.51 | 777.41 | 44.04 | 152149.0 | 34.7 |
988 | 2.2 | 0.205 | 0.031 | 0.207 | 0.002 | 45.0 | 22.0 | 0.004 | 2 | 12 | 6 | 2021-02-12 06:05:38.837326527 | 34.0 | 21.51 | 777.41 | 43.98 | 152841.0 | 33.6 |
989 | 2.2 | 0.205 | 0.031 | 0.207 | 0.002 | 45.0 | 22.0 | 0.004 | 2 | 12 | 6 | 2021-02-12 06:05:47.812360048 | 32.0 | 21.54 | 777.41 | 43.73 | 153259.0 | 31.5 |
def models_plot(histories, a=None, b=None,
                metric='loss',
                plot_validation=True):
    """
    Plots the performance curves from epoch a to epoch b for a list of history
    object names given as strings, i.e. ["model00", "model01", "modellstm00", ...]
    Inputs:
        histories: a list of names of dicts containing the metric keys
        a: first epoch to plot
        b: last epoch to plot
        metric: plot this metric
        plot_validation: boolean indicating if validation data should be plotted
    """
    models = [globals()[h] for h in histories]
    max_epochs = np.max([len(o[metric]) for o in models])
    if a is None:
        a = 0
    if b is None:
        b = max_epochs
    a, b = np.min((a, b)), np.max((a, b))
    # two plots per row
    imgrows = int(np.ceil(len(histories) / 2))
    # x ticks at the quantiles of the plotted epoch range
    quantiles = np.quantile(range(a, b),
                            [.2, .4, .6, .8]).round(0).astype(int)
    quantiles = np.insert(quantiles, 0, [a])
    quantiles += 1
    quantiles = np.append(quantiles, [b-1])
    # order the models by their mean metric (validation if requested)
    if plot_validation:
        model_order = {h: np.mean(v["val_" + metric]) for h, v in zip(histories, models)}
    else:
        model_order = {h: np.mean(v[metric]) for h, v in zip(histories, models)}
    model_order = {m: model_order[m] for m in sorted(model_order,
                                                     key=model_order.get)}
    # init the plot
    plt.figure(figsize=(14., 4.*imgrows), frameon=False, tight_layout=True)
    plt.suptitle(metric, va='top', weight='bold', size='x-large')
    # create a subplot for every model
    for i, model_name in enumerate(model_order.keys()):
        model = globals()[model_name]
        plt.subplot(imgrows, 2, i+1)
        plt.title(model_name)
        if plot_validation:
            mean_label = "Mean Validation"
        else:
            mean_label = "Mean Training"
        plt.hlines(y=model_order[model_name], xmin=0, linestyles="dashed",
                   xmax=b-a, label=mean_label, color="green")
        plt.plot(model[metric][a:b], label='Training',
                 linewidth=2)
        if plot_validation:
            plt.plot(model["val_" + metric][a:b],
                     label='Validation', linewidth=2)
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.xticks(ticks=quantiles-a,
                   labels=quantiles)
        plt.grid(True)
    plt.show()
# plot every available metric for every model history, over all epochs
model_histories = [o
                   for o in object_names
                   if o.endswith("_hist")]
model_metrics = [k
                 for k in globals()[model_histories[0]].keys()
                 if re.search("^(val_|loss)", k) is None]
for model_metric in model_metrics:
    models_plot(model_histories, metric=model_metric)
Below we plot from epoch 90 to epoch 100, the latter being the last epoch.
for model_metric in model_metrics:
    models_plot(model_histories, metric=model_metric, a=90)
Next we present a table with the results. But first, a brief description of the columns:

- Modelo: model name.
  - model_lstmXX: Long Short-Term Memory.
  - model_convXX: 1D Convolutional.
  - model_rnnXX: Recurrent Neural Network.
- mse, val_mse: values of the Mean Squared Error loss function. This is the metric we use to drive backpropagation.
- mae, val_mae: values of the Mean Absolute Error metric.
- msle, val_msle: values of the Mean Squared Logarithmic Error metric.

The table is sorted from the lowest (best) value of the validation loss, val_mse; a sketch of how each metric is computed follows below.
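As a quick reference, here is a minimal sketch (hypothetical numbers, not taken from the notebook) of how these three metrics are computed for a vector of true values y and predictions y_hat:

import numpy as np
y     = np.array([1.0, 2.0, 3.0])    # hypothetical true values
y_hat = np.array([1.1, 1.8, 3.3])    # hypothetical predictions
mse  = np.mean((y - y_hat) ** 2)                      # Mean Squared Error (the loss)
mae  = np.mean(np.abs(y - y_hat))                     # Mean Absolute Error
msle = np.mean((np.log1p(y) - np.log1p(y_hat)) ** 2)  # Mean Squared Logarithmic Error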
model_times = [o for o in object_names if o.endswith("_time")]
perf_table = pd.DataFrame({
    "Modelo": [re.sub("_time$", "", model_time) for model_time in model_times],
    "Tiempo": [globals()[model_time] for model_time in model_times]
})
df_n_params = pd.DataFrame(data={
    "Modelo": [x for x in model_n_params.keys()],
    "# Params": [x for x in model_n_params.values()]
})
perf_table = perf_table.merge(df_n_params, on="Modelo")
# look the metrics up by model name so each row stays aligned with its "Modelo"
for metric in model_metrics:
    perf_table["val_" + metric] = [np.mean(globals()[modelo + "_hist"]["val_" + metric])
                                   for modelo in perf_table["Modelo"]]
for metric in model_metrics:
    perf_table[metric] = [np.mean(globals()[modelo + "_hist"][metric])
                          for modelo in perf_table["Modelo"]]
perf_table.rename({
    "mean_squared_logarithmic_error": "msle",
    "val_mean_squared_logarithmic_error": "val_msle"
}, axis=1, inplace=True)
perf_table.drop(["loss", "val_loss"], axis=1, inplace=True, errors='ignore')
perf_table.sort_values("val_mse", inplace=True)
perf_table["Tiempo"] = (perf_table["Tiempo"] // 60).astype('int').astype("str") + "m" + \
                       (perf_table["Tiempo"] % 60).round(3).apply(lambda x: f"{x:2.2f}") + "s"
perf_table.reset_index(inplace=True, drop=True)
perf_table.round(4)
 | Modelo | Tiempo | # Params | val_mse | val_mae | val_msle | mse | mae | msle
---|---|---|---|---|---|---|---|---|---
0 | model_conv01 | 2m53.16s | 116,225 | 0.0453 | 0.1774 | 0.0258 | 0.0280 | 0.1349 | 0.0162 |
1 | model_conv03 | 6m56.74s | 509,953 | 0.0466 | 0.1838 | 0.0258 | 0.0234 | 0.1170 | 0.0141 |
2 | model_baseline01 | 1m9.73s | 16 | 0.0476 | 0.1805 | 0.0266 | 0.0248 | 0.1224 | 0.0145 |
3 | model_rnn03 | 11m58.17s | 288,833 | 0.0484 | 0.1843 | 0.0281 | 0.0231 | 0.1179 | 0.0136 |
4 | model_best01b | 9m34.02s | 169,795 | 0.0595 | 0.1999 | 0.0340 | 0.0313 | 0.1351 | 0.0178 |
5 | model_dnn03 | 3m14.81s | 26,689 | 0.0676 | 0.2112 | 0.0362 | 0.0198 | 0.1062 | 0.0118 |
6 | model_dnn01 | 1m33.90s | 8,705 | 0.0696 | 0.2258 | 0.0409 | 0.0199 | 0.1031 | 0.0117 |
7 | model_lstm03 | 6m13.69s | 185,345 | 0.1202 | 0.2723 | 0.0538 | 0.0302 | 0.1365 | 0.0181 |
8 | model_lstm01 | 2m24.19s | 54,273 | 0.1663 | 0.3206 | 0.0620 | 0.1588 | 0.2578 | 0.0455 |
9 | model_best01a | 15m37.22s | 575,745 | 0.3231 | 0.4207 | 0.0983 | 0.3622 | 0.4349 | 0.0858 |
10 | model_rnn01 | 12m7.97s | 270,849 | 1.7005 | 1.0162 | 0.0989 | 0.0532 | 0.1514 | 0.0212 |
Next we apply the inverse of the MinMaxScaler() transformation, so that the performance metrics are expressed on the true scale. The odd-numbered models, such as model_MMMM03, are for the IAQ variable, whose range of values is the following (a minimal sketch of the inverse mapping is given right after the range):
Markdown(f"min IAQ: {sinaica['IAQ'].min():3,.2f}; " + \
         f"max IAQ: {sinaica['IAQ'].max():3,.2f}")
min IAQ: 0.00; max IAQ: 500.00
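As a reference, here is a minimal sketch (hypothetical numbers, not taken from the notebook) of what MinMaxScaler's inverse_transform does: it maps a value x from the scaled [0, 1] range back to x * (max - min) + min of the column the scaler was fitted on.

from sklearn.preprocessing import MinMaxScaler
import numpy as np
# hypothetical IAQ training values spanning the 0-500 range
iaq_train = np.array([[0.0], [250.0], [500.0]])
scaler = MinMaxScaler().fit(iaq_train)
scaled_mae = np.array([[0.1774]])     # a metric value on the scaled data
scaler.inverse_transform(scaled_mae)  # -> roughly 88.7 on the IAQ scale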
Recalling that what we want to predict is the international IAQ scale, we show the IAQ models.
sinaica = pd.read_pickle("data/sinaica/sinaica-imputated.pickle.gz")
excluded_columns = ["iaqAccuracy", "datetime", "datetime-1", "delta",
                    "imputated", "year"]
train, test = train_test_split(sinaica[[x
                                        for x in sinaica.columns
                                        if x not in excluded_columns]],
                               train_size=0.8, random_state=175904, shuffle=False)
scaler_iaq = MinMaxScaler().fit(train[["IAQ"]])
# map the scaled metric values back to the original IAQ scale
perf_data_iaq = scaler_iaq.inverse_transform(perf_table.select_dtypes("float64"))
#scaler_gr = MinMaxScaler().fit(train[["gasResistance"]])
#perf_data_gr = scaler_gr.inverse_transform(perf_table.select_dtypes("float64"))
perf_data_iaq = pd.DataFrame(perf_data_iaq,
                             columns=perf_table.select_dtypes("float64").columns)
#perf_data_gr = pd.DataFrame(perf_data_gr,
#                            columns=perf_table.select_dtypes("float64").columns)
perf_data_iaq.insert(0, "Tiempo", perf_table["Tiempo"])
#perf_data_gr.insert(0, "Tiempo", perf_table["Tiempo"])
perf_data_iaq.insert(0, "Modelo", perf_table["Modelo"])
#perf_data_gr.insert(0, "Modelo", perf_table["Modelo"])
perf_data_iaq["# Params"] = perf_table["# Params"]
perf_data = perf_table.copy()
perf_data.sort_values("val_mse", inplace=True)
perf_data.reset_index(inplace=True, drop=True)
model_number_rows = [int(re.sub("[^0-9]", "", x)) for x in perf_table["Modelo"]]
#is_gr_row = [(x % 2) == 0 for x in model_number_rows]
is_iaq_row = [(x % 2) == 1 for x in model_number_rows]
#perf_data.iloc[is_gr_row] = perf_data_gr
perf_data.iloc[is_iaq_row] = perf_data_iaq
cols = ["Modelo", "Tiempo", "# Params", "val_mae", "mae"]
perf_data.round(2)[cols]
 | Modelo | Tiempo | # Params | val_mae | mae
---|---|---|---|---|---
0 | model_conv01 | 2m53.16s | 116,225 | 80.26 | 61.04 |
1 | model_conv03 | 6m56.74s | 509,953 | 83.18 | 52.96 |
2 | model_baseline01 | 1m9.73s | 16 | 81.68 | 55.41 |
3 | model_rnn03 | 11m58.17s | 288,833 | 83.38 | 53.35 |
4 | model_best01b | 9m34.02s | 169,795 | 90.47 | 61.12 |
5 | model_dnn03 | 3m14.81s | 26,689 | 95.57 | 48.06 |
6 | model_dnn01 | 1m33.90s | 8,705 | 102.19 | 46.67 |
7 | model_lstm03 | 6m13.69s | 185,345 | 123.23 | 61.76 |
8 | model_lstm01 | 2m24.19s | 54,273 | 145.06 | 116.66 |
9 | model_best01a | 15m37.22s | 575,745 | 190.37 | 196.80 |
10 | model_rnn01 | 12m7.97s | 270,849 | 459.84 | 68.53 |