import dill, os, io, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plotnine import *
from IPython.display import display, display_markdown, Markdown
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split


# Load every serialized object found in the models directory into a module
# global named after its file (dots become underscores, the trailing "_dill"
# left over from the extension is dropped), and remember the names loaded.
# NOTE(security): dill.load can execute arbitrary code while unpickling; only
# point this at files you produced yourself.
models = []
object_names = []
models_path = "models"
# os.listdir returns entries in arbitrary order; sort so object_names (and
# everything derived from it) is deterministic across runs and platforms.
for file_name in sorted(os.listdir(models_path)):
    if not file_name.endswith("dill"):
        continue
    object_name = re.sub(r"\.", "_", file_name)
    # Anchor the pattern so only the extension suffix is removed, not a
    # "_dill" that happens to occur in the middle of a file name.
    object_name = re.sub(r"_dill$", "", object_name)
    with open(os.path.join(models_path, file_name), 'rb') as file:
        globals()[object_name] = dill.load(file)
    object_names.append(object_name)

# Parameter counts per model; loaded through a context manager so the file
# handle is closed deterministically.
model_path = os.path.join(models_path, "model_n_params.dill")
with open(model_path, 'rb') as file:
    model_n_params = dill.load(file)

Markdown("Objetos cargados: \n\n>" + 
         ", ".join(object_names))


def models_plot(histories, a=None, b=None,
                metric='loss',
                plot_validation=True):
    """
    Plot one training-curve panel per model for a single metric.

    Every name in `histories` is looked up in this module's globals and must
    refer to a dict-like history that contains `metric` (and "val_" + metric
    when `plot_validation` is true). Panels are laid out two per row and
    ordered by ascending mean of the validation metric (or of the training
    metric when `plot_validation` is false), computed over all recorded
    epochs; that mean is drawn as a dashed horizontal line in each panel.

    Inputs:
    histories: list of global variable names (strings) holding the histories,
               i.e., ["model00_hist", "model01_hist", ..]
    a: index of the first epoch to plot (inclusive); defaults to 0.
    b: index one past the last epoch to plot; defaults to the length of the
       longest history. If a > b the two are swapped.
    metric: key of the metric to plot.
    plot_validation: boolean indicating if validation data should be plotted.

    Returns nothing; the figure is shown with plt.show().
    """
    models = {name: globals()[name] for name in histories}
    max_epochs = max(len(history[metric]) for history in models.values())
    if a is None:
        a = 0
    if b is None:
        b = max_epochs
    # Swap in a single step. Assigning a first and then computing b from the
    # already-updated a would collapse a reversed range to a == b.
    a, b = min(a, b), max(a, b)

    # Two panels per row, rounded up.
    imgrows = (len(histories) + 1) // 2

    # x ticks: the start, four inner quantiles of the epoch range and the end.
    # Tick positions are offsets into the plotted slice (quantiles - a); the
    # labels are the tick values themselves.
    quantiles = np.quantile(range(a, b),
                            [.2, .4, .6, .8]).round(0).astype(int)
    quantiles = np.insert(quantiles, 0, [a])
    quantiles += 1
    quantiles = np.append(quantiles, [b-1])

    # Rank the models by the mean of the series the reader cares about.
    if plot_validation:
        mean_key = "val_" + metric
        mean_label = "Mean Validation"
    else:
        mean_key = metric
        mean_label = "Mean Training"
    means = {name: np.mean(history[mean_key])
             for name, history in models.items()}
    model_order = dict(sorted(means.items(), key=lambda item: item[1]))

    # init the plot
    plt.figure(figsize=(14., 4.*imgrows), frameon=False, tight_layout=True)
    plt.suptitle(metric, va='top', weight='bold', size='x-large')
    # create plot for every model, best (lowest mean) first
    for i, (model_name, mean_value) in enumerate(model_order.items()):
        model = models[model_name]

        plt.subplot(imgrows, 2, i+1)
        plt.title(model_name)
        plt.hlines(y=mean_value, xmin=0, linestyles="dashed",
                   xmax=b-a, label=mean_label, color="green")
        plt.plot(model[metric][a:b], label='Training',
                 linewidth=2)
        if plot_validation:
            plt.plot(model["val_" + metric][a:b],
                     label='Validation', linewidth=2)
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.xticks(ticks=quantiles-a,
                   labels=quantiles)
        plt.grid(True)

    plt.show()
  
# Names of the loaded objects that hold training histories.
model_histories = []
for loaded_name in object_names:
    if loaded_name.endswith("_hist"):
        model_histories.append(loaded_name)

# Metric keys are taken from the first history, skipping the loss and the
# "val_"-prefixed validation entries.
first_history = globals()[model_histories[0]]
model_metrics = []
for key in first_history.keys():
    if re.search("^(val_|loss)", key) is None:
        model_metrics.append(key)

# One figure per metric over every recorded epoch.
for model_metric in model_metrics:
    models_plot(model_histories, metric=model_metric)


# The same figures again, starting at epoch index 90.
for model_metric in model_metrics:
    models_plot(model_histories, metric=model_metric, a=90)


# Build the comparison table: one row per model with its training time,
# parameter count and the mean of every metric over all recorded epochs.
model_times = [o for o in object_names if o.endswith("_time")]
perf_table = pd.DataFrame({
    "Modelo": [re.sub("_time$", "", model_time) for model_time in model_times],
    "Tiempo": [globals()[model_time] for model_time in model_times]
})
df_n_params = pd.DataFrame(data={
    "Modelo": list(model_n_params.keys()),
    "# Params": list(model_n_params.values())
})
# Inner join: models without a parameter count are dropped from the table.
perf_table = perf_table.merge(df_n_params, on="Modelo")

# Look histories up by model name instead of relying on model_histories being
# in the same order (and of the same length) as the table rows, which would
# silently attach metrics to the wrong model.
# Assumes each history object is named "<Modelo>_hist" -- a missing history
# raises KeyError rather than shifting values.
histories_by_model = {re.sub("_hist$", "", name): globals()[name]
                      for name in model_histories}
# Validation columns first, then the training columns.
for prefix in ("val_", ""):
    for metric in model_metrics:
        perf_table[prefix + metric] = [
            np.mean(histories_by_model[model_name][prefix + metric])
            for model_name in perf_table["Modelo"]
        ]

perf_table.rename({
    "mean_squared_logarithmic_error": "msle",
    "val_mean_squared_logarithmic_error": "val_msle"
}, axis=1, inplace=True)
perf_table.drop(["loss", "val_loss"], axis=1, inplace=True, errors='ignore')
perf_table.sort_values("val_mse", inplace=True)
# Render the elapsed time as "<minutes>m<seconds>s" (the value is divided by
# 60, so it is presumably stored in seconds).
perf_table["Tiempo"] = (
    (perf_table["Tiempo"] // 60).astype('int').astype("str") + "m"
    + (perf_table["Tiempo"] % 60).round(3).apply(lambda x: f"{x:2.2f}") + "s"
)
perf_table.reset_index(inplace=True, drop=True)
perf_table.round(4)


# Undo the min-max scaling of the metrics so they can be read in the units of
# the target each model predicts (IAQ or gasResistance).
airdata = pd.read_pickle("data/airdata/air-imputated.pickle.gz")
excluded_columns = ["iaqAccuracy", "datetime", "datetime-1", "delta",
                    "imputated", "year"]
# Chronological 80/20 split (shuffle=False); the scalers below are fitted on
# the training part only.
train, test = train_test_split(
    airdata[[x for x in airdata.columns if x not in excluded_columns]],
    train_size=0.8, random_state=175904, shuffle=False)

# The float columns of perf_table are the metric means.
float_metrics = perf_table.select_dtypes("float64")
metric_columns = list(float_metrics.columns)

# NOTE(review): inverse_transform maps x to x * (max - min) + min, so the
# training minimum is added to every error value and squared errors are scaled
# by the range rather than its square -- confirm this is the intended
# "unscaled" reading of these metrics.
scaler_iaq = MinMaxScaler().fit(train[["IAQ"]])
scaler_gr = MinMaxScaler().fit(train[["gasResistance"]])
perf_data_iaq = pd.DataFrame(scaler_iaq.inverse_transform(float_metrics),
                             columns=metric_columns)
perf_data_gr = pd.DataFrame(scaler_gr.inverse_transform(float_metrics),
                            columns=metric_columns)
# Put the identifying columns back in front, in perf_table's column order.
for unscaled in (perf_data_iaq, perf_data_gr):
    unscaled.insert(0, "Tiempo", perf_table["Tiempo"])
    unscaled.insert(0, "Modelo", perf_table["Modelo"])
    unscaled.insert(2, "# Params", perf_table["# Params"])

# perf_table is already sorted by val_mse with a fresh 0..n-1 index, so the
# copy lines up row for row with both unscaled frames.
perf_data = perf_table.copy()
# The digits in the model name pick the scaler: even -> gasResistance rows,
# odd -> IAQ rows.
model_number_rows = [int(re.sub("[^0-9]", "", x)) for x in perf_table["Modelo"]]
is_gr_row = [(x % 2) == 0 for x in model_number_rows]
is_iaq_row = [(x % 2) == 1 for x in model_number_rows]
# Assign only the selected rows of the metric columns. Handing a full-length
# DataFrame to a boolean iloc setitem relies on implicit index alignment, whose
# behaviour differs between pandas versions.
perf_data.loc[is_gr_row, metric_columns] = \
    perf_data_gr.loc[is_gr_row, metric_columns].to_numpy()
perf_data.loc[is_iaq_row, metric_columns] = \
    perf_data_iaq.loc[is_iaq_row, metric_columns].to_numpy()
cols = ["Modelo", "Tiempo", "# Params", "val_mae", "mae"]
perf_data.round(2)[cols]


# Observed range of gasResistance over the whole data set.
Markdown(
    f"min gasResistance: {airdata['gasResistance'].min():3,.2f}; "
    f"max gasResistance: {airdata['gasResistance'].max():3,.2f}"
)


# Observed range of IAQ over the whole data set.
Markdown(
    f"min IAQ: {airdata['IAQ'].min():3,.2f}; "
    f"max IAQ: {airdata['IAQ'].max():3,.2f}"
)


# Unscaled table restricted to the IAQ models, renumbered from zero.
iaq_rows = perf_data.iloc[is_iaq_row]
iaq_rows.round(2).reset_index(drop=True)[cols]

	Modelo	Tiempo	# Params	val_mse	val_mae	val_msle	mse	mae	msle
0	model_lstm02	9m11.20s	183,297	0.0209	0.1255	0.0129	0.0209	0.1255	0.0129
1	model_lstm03	9m11.55s	183,297	0.0326	0.1481	0.0194	0.0230	0.1223	0.0139
2	model_baseline01	0m57.19s	8	0.0332	0.1519	0.0199	0.0201	0.1116	0.0125
3	model_conv03	6m33.58s	419,841	0.0443	0.1645	0.0265	0.0052	0.0415	0.0030
4	model_conv00	1m58.18s	26,113	0.0533	0.1724	0.0300	0.0059	0.0429	0.0034
5	model_best03a	14m0.58s	485,633	0.0566	0.1755	0.0308	0.0061	0.0434	0.0035
6	model_lstm01	1m40.80s	52,225	0.0570	0.2030	0.0364	0.0003	0.0121	0.0003
7	model_rnn02	7m34.56s	284,737	0.0774	0.2508	0.0518	0.0002	0.0100	0.0002
8	model_best03b	8m50.63s	162,115	0.0796	0.2556	0.0529	0.0005	0.0116	0.0003
9	model_conv02	6m34.26s	419,841	0.0812	0.2590	0.0546	0.0005	0.0123	0.0003
10	model_baseline00	0m57.07s	8	0.0998	0.2628	0.0512	0.0255	0.1225	0.0148
11	model_dnn00	1m42.52s	4,609	0.1270	0.3033	0.0589	0.0312	0.1412	0.0050
12	model_rnn01	7m56.31s	266,753	0.1332	0.3289	0.0658	0.0015	0.0186	0.0007
13	model_rnn00	7m37.76s	266,753	0.1583	0.2694	0.0525	0.0851	0.1268	0.0177
14	model_dnn02	4m13.19s	22,593	0.1625	0.2845	0.0616	0.1330	0.1840	0.0278
15	model_lstm00	9m39.10s	52,225	0.1877	0.3550	0.0614	0.3653	0.5150	0.0413
16	model_conv01	14m3.57s	294,401	0.3791	0.4453	0.0819	0.1221	0.2307	0.0330
17	model_dnn03	4m13.19s	22,593	0.7706	0.7625	0.0849	0.0925	0.2030	0.0313
18	model_dnn01	1m40.90s	4,609	1.5727	0.9295	0.3496	0.0659	0.1529	0.0225
19	model_rnn03	7m32.10s	284,737	4.7755	1.8987	0.8301	0.1151	0.0991	0.0137

	Modelo	Tiempo	# Params	val_mae	mae
0	model_lstm02	9m11.20s	183,297	315284.78	315165.69
1	model_lstm03	9m11.55s	183,297	74.06	61.15
2	model_baseline01	0m57.19s	8	75.94	55.80
3	model_conv03	6m33.58s	419,841	82.23	20.75
4	model_conv00	1m58.18s	26,113	404453.23	157983.06
5	model_best03a	14m0.58s	485,633	87.76	21.71
6	model_lstm01	1m40.80s	52,225	101.49	6.05
7	model_rnn02	7m34.56s	284,737	553683.86	95495.68
8	model_best03b	8m50.63s	162,115	127.78	5.79
9	model_conv02	6m34.26s	419,841	569305.27	99731.48
10	model_baseline00	0m57.07s	8	576572.41	309605.81
11	model_dnn00	1m42.52s	4,609	653625.07	345048.26
12	model_rnn01	7m56.31s	266,753	164.43	9.28
13	model_rnn00	7m37.76s	266,753	589039.62	317796.11
14	model_dnn02	4m13.19s	22,593	617843.41	426592.04
15	model_lstm00	9m39.10s	52,225	752017.47	1056449.06
16	model_conv01	14m3.57s	294,401	222.64	115.35
17	model_dnn03	4m13.19s	22,593	381.25	101.50
18	model_dnn01	1m40.90s	4,609	464.77	76.45
19	model_rnn03	7m32.10s	284,737	949.37	49.54

	Modelo	Tiempo	# Params	val_mae	mae
0	model_lstm03	9m11.55s	183,297	74.06	61.15
1	model_baseline01	0m57.19s	8	75.94	55.80
2	model_conv03	6m33.58s	419,841	82.23	20.75
3	model_best03a	14m0.58s	485,633	87.76	21.71
4	model_lstm01	1m40.80s	52,225	101.49	6.05
5	model_best03b	8m50.63s	162,115	127.78	5.79
6	model_rnn01	7m56.31s	266,753	164.43	9.28
7	model_conv01	14m3.57s	294,401	222.64	115.35
8	model_dnn03	4m13.19s	22,593	381.25	101.50
9	model_dnn01	1m40.90s	4,609	464.77	76.45
10	model_rnn03	7m32.10s	284,737	949.37	49.54

Proyecto Final: Selección de Modelo para los datos del Sensor.

Gráficas de Desempeño

Gráficas de Desempeño de Todas las Épocas

Gráficas de Desempeño de las Últimas Épocas

Tabla Comparativa

Valores de Desempeño Quitándoles Escalamiento

Referencias