Jorge III Altamirano Astorga, Luz Aurora Hernández Martínez, Ita-Andehui Santiago Castillejos.
We now have a complete dataset, with missing values filled in by the imputations performed earlier, ready to be treated as sequential data.
These data form a time series; we compute its span below.
import re, os, sys, shelve, time, dill
from pickle import PicklingError
from dill import Pickler, Unpickler
shelve.Pickler = Pickler
shelve.Unpickler = Unpickler
from IPython.display import display, Markdown, Math, clear_output, Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from plotnine import *
from tqdm.keras import TqdmCallback
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, \
SimpleRNN, Input, Conv1D
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
tf.get_logger().setLevel('ERROR')
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    from google.colab import files
except ImportError:
    pass
base_url = ""
# File Loaders
try:
    base_url = "drive/MyDrive/Colab Notebooks/proyecto-final"
    uploaded = os.path.join(base_url, "data/air-imputated.pickle.gz")
    if not os.path.isfile(uploaded):
        from google.colab import files
        uploaded = files.upload()
except Exception:
    # Not running on Colab (or Drive unavailable): fall back to a local file.
    uploaded = "air-imputated.pickle.gz"
def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    """
    Adapted from https://stackoverflow.com/a/39358722/7323086
    """
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')
    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)
    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)
    # dict.items() replaces six.iteritems (six was never imported).
    for k, cell in mpl_table._cells.items():
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w', size=font_size*1.05)
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0] % len(row_colors)])
    plt.show()
clear_output()
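As a quick check that the helper works, here is a minimal usage sketch. The DataFrame `demo_df` is hypothetical; any small frame (such as `airdata.head()` loaded below) would do.

# Minimal usage sketch for render_mpl_table; demo_df is a hypothetical example.
demo_df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
render_mpl_table(demo_df, col_width=2.0)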
##############################################
# Restore all objects into the environment.  #
##############################################
def load_h5_models(shelvepath="drive/MyDrive/Colab Notebooks/proyecto-final/models/"):
    """Load the saved models into the environment (global vars).

    Keras models are restored from their .h5 files; everything else is
    restored from a shelve backed by Pickle/Dill.
    """
    h5files = [re.sub(r"\.h5$", "", h5file) for h5file in os.listdir(shelvepath)
               if h5file.endswith('.h5')]
    for h5file in h5files:
        h5file_ = os.path.join(shelvepath, h5file + ".h5")
        print(f"Loading {h5file_} file...", end=" ")
        try:
            globals()[h5file] = tf.keras.models.load_model(h5file_,
                custom_objects={
                    #'KerasLayer': hub.KerasLayer,
                    #'AdamWeightDecay': optimization.AdamWeightDecay
                })
            print("Done!")
        except Exception as e:
            print("Failed!", e)
            raise
    # Renamed the loop variable so it no longer shadows the shelve module.
    shelvefile = [shelve_ for shelve_ in os.listdir(shelvepath)
                  if shelve_.endswith('.out.db')][0]
    shelvefile = os.path.join(shelvepath, shelvefile)
    print(f"Loading {shelvefile} file...", end=" ")
    shelvefile = re.sub(r"\.db$", "", shelvefile)
    # Open outside the try block so the finally clause cannot hit an
    # undefined my_shelf.
    my_shelf = shelve.open(shelvefile)
    try:
        keys = []
        for key in tqdm(my_shelf):
            try:
                globals()[key] = my_shelf[key]
            except Exception as e:
                if key != "stopwords":
                    keys.append(key)
                else:
                    raise
        print("Done!")
        keys = ", ".join(keys)
        if len(keys) > 0:
            print(f"(Vars not loaded: {keys})")
    except Exception as e:
        print("Failed!\n", e)
    finally:
        my_shelf.close()
clear_output()
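Restoring a previous session is then a single call. A sketch, assuming the models directory above already holds .h5 files and a *.out.db shelve:

# Restores every saved Keras model (and shelved variables) into globals().
load_h5_models()
# A previously saved model would then be available by name, e.g.:
# model_conv00.summary()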
def performance_plot(history, a=None, b=None,
                     metrics=["accuracy", "val_accuracy"],
                     plot_validation=True,
                     title="Performance plots."):
    """
    Plots the loss and metric curves from epoch a to epoch b of a history dict.
    Inputs:
      history: dict containing "loss" and the requested metric keys
      a: first epoch to plot
      b: last epoch to plot
      metrics: plot these metrics (training and, optionally, validation)
      plot_validation: boolean indicating if validation data should be plotted
    """
    if a is None:
        a = 0
    if b is None:
        b = len(history['loss'])
    # Swap in a single step; assigning np.min then np.max sequentially
    # overwrote a before b was computed.
    a, b = min(a, b), max(a, b)
    # One row of panels per two plots (the loss panel included).
    imgrows = int(np.ceil((len(metrics) + 1) / 2))
    # Plot loss
    plt.figure(figsize=(14, 5 * imgrows))
    plt.suptitle(title)
    plt.subplot(imgrows, 2, 1)
    plt.title('Loss')
    plt.plot(history['loss'][a:b], label='Training', linewidth=2)
    if plot_validation:
        plt.plot(history['val_loss'][a:b], label='Validation', linewidth=2)
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    quantiles = np.quantile(range(a, b),
                            [.2, .4, .6, .8]).round(0).astype(int)
    quantiles = np.insert(quantiles, 0, [a])
    quantiles += 1
    quantiles = np.append(quantiles, [b-1])
    plt.xticks(ticks=quantiles-a,
               labels=quantiles)
    plt.grid(True)
    # Plot the requested metrics
    for i, metric in enumerate(metrics):
        plt.subplot(imgrows, 2, i+2)
        plt.title(metric)
        plt.plot(history[metric][a:b], label='Training',
                 linewidth=2)
        if plot_validation:
            plt.plot(history["val_" + metric][a:b],
                     label='Validation', linewidth=2)
        plt.legend()
        plt.xlabel('Epoch')
        plt.ylabel(metric)
        plt.xticks(ticks=quantiles-a,
                   labels=quantiles)
        plt.grid(True)
    plt.show()
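A minimal sketch of the input performance_plot expects: a plain dict of per-epoch lists, like the history.history attribute Keras returns. The values here are synthetic, purely to illustrate the structure.

# Synthetic 10-epoch history, just to show the expected dict layout.
fake_history = {
    "loss":     list(np.linspace(1.0, 0.2, 10)),
    "val_loss": list(np.linspace(1.1, 0.3, 10)),
    "mae":      list(np.linspace(0.8, 0.1, 10)),
    "val_mae":  list(np.linspace(0.9, 0.2, 10)),
}
performance_plot(fake_history, metrics=["mae"], plot_validation=True)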
airdata = pd.read_pickle(uploaded)
airdata.head()
|   | temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | datetime | year | month | day | hour | minute | datetime-1 | delta | imputated |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 21.51 | 777.41 | 44.04 | 152149.0 | 34.7 | 1 | 2021-02-12 06:05:35.846304417 | 2021 | 2 | 12 | 6 | 5 | NaT | NaN | False |
| 1 | 21.51 | 777.41 | 43.98 | 152841.0 | 33.6 | 1 | 2021-02-12 06:05:38.837326527 | 2021 | 2 | 12 | 6 | 5 | 2021-02-12 06:05:35.846304417 | 2.0 | False |
| 2 | 21.54 | 777.41 | 43.73 | 153259.0 | 31.5 | 1 | 2021-02-12 06:05:47.812360048 | 2021 | 2 | 12 | 6 | 5 | 2021-02-12 06:05:38.837326527 | 8.0 | False |
| 3 | 21.53 | 777.41 | 43.70 | 152841.0 | 31.5 | 1 | 2021-02-12 06:05:50.803695202 | 2021 | 2 | 12 | 6 | 5 | 2021-02-12 06:05:47.812360048 | 2.0 | False |
| 4 | 21.52 | 777.41 | 43.70 | 153399.0 | 30.2 | 1 | 2021-02-12 06:05:53.795462847 | 2021 | 2 | 12 | 6 | 5 | 2021-02-12 06:05:50.803695202 | 2.0 | False |
Markdown(f"Estso datos son una serie de tiempo de \
{(airdata['datetime'].tail(1).iloc[0] - airdata['datetime'].head(1).iloc[0]).days}\
días. Con {airdata.shape[0]:3,} observaciones.")
Estso datos son una serie de tiempo de 71 días. Con 2,068,354 observaciones.
Markdown(f"Dichas observaciones fueron \
{airdata['datetime'].head(1).iloc[0]} \
al {airdata['datetime'].tail(1).iloc[0]}.")
Dichas observaciones fueron 2021-02-12 06:05:35.846304417 al 2021-04-24 22:16:20.885603666.
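The delta column (seconds between consecutive readings) lets us double-check the roughly 3-second cadence we will assume later when windowing the series. A quick sketch; the exact value depends on the imputation:

# Median spacing between consecutive observations, in seconds.
Markdown(f"Median sampling interval: {airdata['delta'].median():.1f} s.")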
We exclude the following columns from the model: iaqAccuracy (a sensor status flag), the timestamps and imputation metadata (datetime, datetime-1, delta, imputated), and year, which is nearly constant over the 71-day span.
We then split the data into training and test sets to evaluate the models we build.
excluded_columns = ["iaqAccuracy", "datetime", "datetime-1", "delta",
"imputated", "year"]
train, test = train_test_split(airdata[[x
for x in airdata.columns
if x not in excluded_columns]],
train_size=0.8, random_state=175904, shuffle=False)
display(Markdown(f"* Observaciones en el set de entrenamiento: \
{train.shape[0]:3,} ({100*train.shape[0]/airdata.shape[0]:.2f}%)."))
display(Markdown(f"* Observaciones en el set de pruebas: \
{test.shape[0]:3,} ({100*test.shape[0]/airdata.shape[0]:.2f}%)."))
scaler = MinMaxScaler()
scaler_f = scaler.fit(train)
train2 = scaler_f.transform(train)
test2 = scaler_f.transform(test)
X_cols = [i for i, x in enumerate(train.columns)
if x not in ["IAQ", "gasResistance"]]
Y_cols = [i for i, x in enumerate(train.columns)
if x in ["IAQ", "gasResistance"]]
X_train = train2[:, X_cols]
Y_train = train2[:, Y_cols]
X_test = test2[:, X_cols]
Y_test = test2[:, Y_cols]
display(Markdown(f"X_train.shape = {X_train.shape}, Y_train.shape = {Y_train.shape}."))
display(Markdown(f"X_test.shape = {X_test.shape}, Y_test.shape = {Y_test.shape}."))
X_train.shape = (1654683, 7), Y_train.shape = (1654683, 2).
X_test.shape = (413671, 7), Y_test.shape = (413671, 2).
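Since the scaler was fit on all nine retained columns, mapping model predictions back to original units takes a small detour. A sketch with a hypothetical helper (not used below): the scaled predictions are placed in a zero matrix of the shape MinMaxScaler was fit on, and only the target column is inverted.

def inverse_transform_targets(preds, col_idx):
    """Hypothetical helper: map scaled predictions for one target column
    back to original units via the fitted MinMaxScaler."""
    buf = np.zeros((len(preds), train.shape[1]))
    buf[:, col_idx] = np.ravel(preds)
    return scaler_f.inverse_transform(buf)[:, col_idx]

# e.g., gasResistance predictions back to ohms:
# inverse_transform_targets(y_hat, Y_cols[0])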
def train_model(model, train_data, validation_data,
                epochs=10, batch_size=512,
                steps_per_epoch=100, loss='mse', optimizer='adam',
                metrics=['mse'], verbose=0, base_dir=""):
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    cbk = TqdmCallback()
    tiempo = time.time()
    history = model.fit(train_data, validation_data=validation_data,
                        epochs=epochs, steps_per_epoch=steps_per_epoch,
                        batch_size=batch_size, verbose=verbose, callbacks=[cbk])
    clear_output()
    tiempo = time.time() - tiempo
    print(f"Processing time: {tiempo:.2f} seconds.")
    #### Save the model
    # Use the base_dir argument (the original ignored it and always read
    # the global base_url), falling back to base_url when empty.
    base_dir = os.path.join(base_dir or base_url, "models", model.name)
    model.save(f"{base_dir}.h5")
    with open(f"{base_dir}.time.dill", 'wb') as f:
        dill.dump(tiempo, f)
    with open(f"{base_dir}.hist.dill", 'wb') as f:
        dill.dump(history.history, f)
    #### end model-saving section
    return history
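The artifacts this function writes can be reloaded in a later session. A sketch, assuming a model named model_conv00 has already been saved under models/ as in the runs below:

# Reload the training history and wall-clock time saved by train_model.
with open(os.path.join(base_url, "models", "model_conv00.hist.dill"), "rb") as f:
    hist_conv00 = dill.load(f)
with open(os.path.join(base_url, "models", "model_conv00.time.dill"), "rb") as f:
    time_conv00 = dill.load(f)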
# sampling_rate: 3 seconds per sample
sampling_rate = 3
# 1 hour * 60 minutes * 60 seconds / sampling_rate = 1200 samples of history
past = int(1 * 60 * 60 / sampling_rate)
#display(f"X_train shape = {X_train.shape}")
train3_gas = tf.keras.preprocessing.timeseries_dataset_from_array(
X_train,
Y_train[:, 0],
sequence_length=past,
sampling_rate=sampling_rate,
batch_size=512,
seed=175904
)
train3_iaq = tf.keras.preprocessing.timeseries_dataset_from_array(
X_train,
Y_train[:, 1],
sequence_length=past,
sampling_rate=sampling_rate,
batch_size=512,
seed=175904
)
test3_gas = tf.keras.preprocessing.timeseries_dataset_from_array(
X_test,
Y_test[:, 0],
sequence_length=past,
sequence_stride=60*60,
sampling_rate=60,
batch_size=512,
seed=175904
)
test3_iaq = tf.keras.preprocessing.timeseries_dataset_from_array(
X_test,
Y_test[:,1],
sequence_length=past,
sequence_stride=60*60,
sampling_rate=60,
batch_size=512,
seed=175904
)
train3_gas
<BatchDataset shapes: ((None, None, 7), (None,)), types: (tf.float64, tf.float64)>
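In these datasets, sequence_length is the number of timesteps per window, sampling_rate is the spacing (in rows) between those timesteps, and sequence_stride is the spacing between the starts of consecutive windows. The printed shapes leave the time dimension as None; to see the concrete batch shapes we can pull a single batch:

# Inspect one batch: inputs are (batch, sequence_length, n_features),
# targets are (batch,).
for batch_x, batch_y in train3_gas.take(1):
    print(batch_x.shape, batch_y.shape)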
model_conv00 = Sequential(name="model_conv00")
# Leave the time dimension as None so the layer accepts the windowed
# sequences produced by timeseries_dataset_from_array (the original
# passed X_train.shape[0], the total row count, as the sequence length).
model_conv00.add(Input(shape=(None, X_train.shape[1], ),
                       name="input00"))
model_conv00.add(Conv1D(512, X_train.shape[1], activation='relu', name="conv00"))
model_conv00.add(Dense(units=1, activation=None, name="output"))
plot_model(model_conv00, to_file=os.path.join(base_url, "data/model.png"),
dpi=72, rankdir="LR", show_shapes=True, expand_nested=True)
trained_conv00 = train_model(model_conv00, train3_gas,
validation_data=test3_gas,
metrics=["mse", "mae", "mean_squared_logarithmic_error"],
epochs=100, steps_per_epoch=5,
base_dir=base_url)
Processing time: 120.29 seconds.
performance_plot(trained_conv00.history, metrics=["mae", "mean_squared_logarithmic_error"],
plot_validation=False)
performance_plot(trained_conv00.history, metrics=['mae', "mean_squared_logarithmic_error"])
model_conv01 = Sequential(name="model_conv01")
model_conv01.add(Input(shape=(None, X_train.shape[1], ),  # time dimension left as None, as in model_conv00
                       name="input00"))
model_conv01.add(Conv1D(512, X_train.shape[1], activation='relu', name="conv00"))
model_conv01.add(Dense(units=1, activation=None, name="output"))
plot_model(model_conv01, to_file=os.path.join(base_url, "data/model.png"),
dpi=72, rankdir="LR", show_shapes=True, expand_nested=True)
trained_conv01 = train_model(model_conv01, train3_gas,  # train model_conv01 (the original mistakenly reused model_conv00)
validation_data=test3_gas,
metrics=["mse", "mae", "mean_squared_logarithmic_error"],
epochs=100, steps_per_epoch=5,
base_dir=base_url)
Processing time: 118.18 seconds.
performance_plot(trained_conv01.history, metrics=["mae", "mean_squared_logarithmic_error"],
plot_validation=False)
performance_plot(trained_conv01.history, metrics=['mae', "mean_squared_logarithmic_error"])
We now introduce a much more complex model.
model_conv02 = Sequential(name="model_conv02")
model_conv02.add(Input(shape=(None, X_train.shape[1], ),  # time dimension left as None, as in model_conv00
                       name="input00"))
model_conv02.add(Conv1D(512, X_train.shape[1], activation='relu', name="conv00"))
model_conv02.add(Dropout(0.5, name="dropout00"))
model_conv02.add(Dense(units=512, activation='relu', name="dnn"))
model_conv02.add(Dropout(0.5))
model_conv02.add(Dense(units=256, activation='relu'))
model_conv02.add(Dense(units=1, activation=None, name="output"))
plot_model(model_conv02, to_file=os.path.join(base_url, "data/model.png"),
dpi=72, rankdir="TB", show_shapes=True, expand_nested=True)
trained_conv02 = train_model(model_conv02, train3_gas,
validation_data=test3_gas,
metrics=["mse", "mae", "mean_squared_logarithmic_error"],
epochs=100, steps_per_epoch=5,
base_dir=base_url)
Processing time: 394.26 seconds.
performance_plot(trained_conv02.history, metrics=["mae", "mean_squared_logarithmic_error"],
plot_validation=False)
performance_plot(trained_conv02.history, metrics=['mae', "mean_squared_logarithmic_error"])
model_conv03 = Sequential(name="model_conv03")
model_conv03.add(Input(shape=(None, X_train.shape[1], ),  # time dimension left as None, as in model_conv00
                       name="input00"))
model_conv03.add(Conv1D(512, X_train.shape[1], activation='relu', name="conv00"))
model_conv03.add(Dropout(0.5, name="dropout00"))
model_conv03.add(Dense(units=512, activation='relu', name="dnn"))
model_conv03.add(Dropout(0.5))
model_conv03.add(Dense(units=256, activation='relu'))
model_conv03.add(Dense(units=1, activation=None, name="output"))
plot_model(model_conv03, to_file=os.path.join(base_url, "data/model.png"),
dpi=72, rankdir="TB", show_shapes=True, expand_nested=True)
trained_conv03 = train_model(model_conv03, train3_gas,
validation_data=test3_gas,
metrics=["mse", "mae", "mean_squared_logarithmic_error"],
epochs=100, steps_per_epoch=5,
base_dir=base_url)
Processing time: 393.58 seconds.
performance_plot(trained_conv03.history, metrics=["mae", "mean_squared_logarithmic_error"],
plot_validation=False)
performance_plot(trained_conv03.history, metrics=['mae', "mean_squared_logarithmic_error"])
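To compare the four runs at a glance, we can tabulate the final validation metrics from each training history. A sketch, assuming all four trainings above completed in this session:

# Final-epoch validation metrics for each trained model.
final_metrics = pd.DataFrame({
    name: {m: hist.history["val_" + m][-1] for m in ["mse", "mae"]}
    for name, hist in [("model_conv00", trained_conv00),
                       ("model_conv01", trained_conv01),
                       ("model_conv02", trained_conv02),
                       ("model_conv03", trained_conv03)]
}).T
display(final_metrics)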
Keras contributors et al. Keras Code Examples: Timeseries forecasting for weather prediction. 2021.
TensorFlow contributors. TensorFlow Tutorials: Time series forecasting. 2021.
Román-Rangel, Francisco. Notas y Código del Curso de Aprendizaje Profundo (Deep Learning course notes and code). 2021.
González-Pérez, Felipe. Notas de aprendizaje de máquina (Machine learning notes). 2020.
Keras contributors et al. Keras API Reference: fit. 2021.
scikit-learn contributors. scikit-learn API Reference: train_test_split. 2021.
Keras contributors et al. Keras API Reference: timeseries_dataset_from_array. 2021.