import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os, gzip, json, re
import seaborn as sns
from dplython import (DplyFrame, X, diamonds, select, sift,
sample_n, sample_frac, head, arrange, mutate, group_by,
summarize, DelayFunction, dfilter)
import dplython
from plotnine import *
import pandas as pd
from IPython.display import display, Markdown
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/patsy/constraint.py:13: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated since Python 3.3,and in 3.9 it will stop working from collections import Mapping /home/jaa6766/.conda/envs/cuda/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject /home/jaa6766/.conda/envs/cuda/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject /home/jaa6766/.conda/envs/cuda/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject /home/jaa6766/.conda/envs/cuda/lib/python3.7/importlib/_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject
# Load every gzip-compressed JSON-lines file found under data/airdata into
# `airdata`, one parsed JSON object per non-empty line.
base_dir = "data/airdata"
data_dir = os.path.join(os.getcwd(), base_dir)
airdata = []
display(Markdown(f"Listing data files from: {data_dir}"))
# Match on the real file extension (a substring test also accepts names such
# as "x.json.gz.bak") and sort so the load order is reproducible across runs.
gz_files = sorted(
    name for name in os.listdir(data_dir) if name.endswith(".json.gz")
)
for filegz in gz_files:
    display(Markdown(f"* Loading {filegz}"))
    try:
        with gzip.open(os.path.join(data_dir, filegz), 'rt') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line is not a record; json.loads("") would raise.
                    continue
                airdata.append(json.loads(line))
    except EOFError:
        # The gzip stream ended early. Records parsed before the error are
        # already in `airdata`; report the file instead of skipping silently.
        print(f"Truncated gzip stream in {filegz}; keeping the records read so far")
        continue
    except Exception as e:
        print(f"Error while reading file {filegz}", type(e))
        raise
display(Markdown("Done!"))
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Listing data files from: /home/jaa6766/Documents/jorge3a/itam/deeplearning/dlfinal/data/airdata
Done!
%%time
# Build the sensor DataFrame from the parsed JSON records and derive calendar
# columns from the epoch-seconds timestamp.
df = pd.DataFrame(airdata)
df["datetime"] = pd.to_datetime(df["datetime"], unit='s')
# The vectorized .dt accessors compute each component for the whole column at
# once, instead of iterating over every Timestamp in a Python-level loop.
df["year"] = df["datetime"].dt.year
df["month"] = df["datetime"].dt.month
df["day"] = df["datetime"].dt.day
df["hour"] = df["datetime"].dt.hour
df["minute"] = df["datetime"].dt.minute
df.head()
CPU times: user 2min 57s, sys: 4.93 s, total: 3min 2s Wall time: 3min 2s
temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | datetime | year | month | day | hour | minute | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21.54 | 777.41 | 43.93 | 151328 | 37.5 | 1 | 2021-02-12 06:04:09.089621067 | 2021 | 2 | 12 | 6 | 4 |
1 | 21.56 | 777.41 | 43.89 | 152702 | 35.6 | 1 | 2021-02-12 06:04:12.087778807 | 2021 | 2 | 12 | 6 | 4 |
2 | 21.53 | 777.41 | 43.97 | 151328 | 37.5 | 1 | 2021-02-12 06:04:15.072475433 | 2021 | 2 | 12 | 6 | 4 |
3 | 21.51 | 777.41 | 44.03 | 151464 | 38.5 | 1 | 2021-02-12 06:04:18.070170164 | 2021 | 2 | 12 | 6 | 4 |
4 | 21.51 | 777.41 | 44.05 | 152425 | 36.9 | 1 | 2021-02-12 06:04:21.061994791 | 2021 | 2 | 12 | 6 | 4 |
# Number of (rows, columns) in the sensor DataFrame.
df.shape
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
(6285103, 12)
# IAQ over time, colored by the iaqAccuracy column. Points are jittered and
# nearly transparent; x tick labels are rotated 45 degrees.
(
    ggplot(df, aes(x="datetime", y="IAQ", color="iaqAccuracy"))
    + geom_jitter(alpha=0.05, size=1.25)
    + theme(axis_text_x=element_text(angle=45))
)
<ggplot: (8757982147873)>
# Temperature over time, colored by pressure, with explicit axis/legend labels.
(
    ggplot(df, aes(x="datetime", y="temperature", color="pressure"))
    + geom_jitter(alpha=0.05)
    + theme(axis_text_x=element_text(angle=45))
    + labs(y="Temp (C)", color="Pressure (hPa)")
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757872894413)>
# IAQ over time, colored by the hour-of-day column.
(
    ggplot(df, aes(x="datetime", y="IAQ", color="hour"))
    + geom_jitter(alpha=0.05, size=1.25)
    + theme(axis_text_x=element_text(angle=45))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896670165)>
# Temperature over time, colored by the hour-of-day column.
(
    ggplot(df, aes(x="datetime", y="temperature", color="hour"))
    + geom_jitter(alpha=0.05)
    + theme(axis_text_x=element_text(angle=45))
    + labs(y="Temp (C)")
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896634977)>
# Pressure over time, colored by the hour-of-day column.
(
    ggplot(df, aes(x="datetime", y="pressure", color="hour"))
    + geom_jitter(alpha=0.05)
    + theme(axis_text_x=element_text(angle=45))
    + labs(y="Pressure (hPa)")
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757982086333)>
# iaqAccuracy over time, colored by the hour-of-day column.
(
    ggplot(df, aes(x="datetime", y="iaqAccuracy", color="hour"))
    + geom_jitter(alpha=0.05, size=1.25)
    + theme(axis_text_x=element_text(angle=45))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896273573)>
# Cache the parsed sensor readings on disk; the SINAICA loading cell reads this
# pickle back instead of re-parsing the gzip files.
df.to_pickle("data/airdata/air.pickle")
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
# Load the SINAICA CSV exports found under data/sinaica into one long-format
# DataFrame (`sinaica`), tagging every row with the station name taken from the
# file name. When no CSV matches, fall back to the previously pickled table.
base_dir = "data/sinaica/"
data_dir = os.path.join(os.getcwd(), base_dir)
# Reload the cached sensor readings.
# NOTE(review): pickle files must come from a trusted source.
df = pd.read_pickle("data/airdata/air.pickle")
display(Markdown(f"Listing data files from: {data_dir}..."))

# Compiled once: the first pattern selects candidate files, the second pulls
# the station and sensor names out of the file name.
listing_pattern = re.compile(
    r"Datos SINAICA - [-A-ZáéíóúÁÉÍÓÚa-z0-9. ]{30,80}\.csv")
name_pattern = re.compile(
    r"Datos SINAICA - ([A-ZáéíóúÁÉÍÓÚa-z. ]{3,20}) - ([A-Z0-9a-z.]+) - [-0-9 ]+\.csv")

station_frames = []
for file_csv in sorted(os.listdir(data_dir)):
    if listing_pattern.match(file_csv) is None:
        continue
    try:
        # Only the station name is used; the sensor name is parsed but unused.
        station, _sensor = name_pattern.match(file_csv).groups()
        df2 = pd.read_csv(os.path.join(data_dir, file_csv))
        df2 = df2.assign(Estacion=station)
        # Normalize each file BEFORE merging. Doing this on the accumulated
        # frame re-processed rows whose "Fecha" was already a Timestamp, so
        # the string concatenation failed from the second file onwards.
        df2["Hora"] = df2["Hora"].replace("- .*$", "", regex=True)
        df2["Fecha"] = pd.to_datetime(df2["Fecha"] + " " + df2["Hora"])
        station_frames.append(df2)
    except Exception as e:
        print(f"Error while reading file {file_csv}", type(e))
        raise

if station_frames:
    # A single concat avoids re-copying the growing table on every file.
    sinaica = pd.concat(station_frames)
else:
    print("Loading pickle prev data...")
    sinaica = pd.read_pickle("data/sinaica/sinaica.pickle")

sinaica = sinaica.sort_values(by=["Fecha", "Estacion", "Parámetro"])
# Keep readings from 2021 onwards only.
sinaica = sinaica[(sinaica["Fecha"] >= "2021-01-01")].copy()
# Preview the first and last five rows with English column headers
# (display only; `sinaica` keeps its original column names).
display(pd.concat([sinaica.head(5), sinaica.tail(5)]).rename(
    columns={"Parámetro": "Parameter", "Fecha": "Date", "Valor": "Value",
             "Unidad": "Units", "Estacion": "Monitoring Station"}
))
Listing data files from: /home/jaa6766/Documents/jorge3a/itam/deeplearning/dlfinal/data/sinaica/...
Loading pickle prev data...
Parameter | Date | Value | Units | Monitoring Station | |
---|---|---|---|---|---|
1 | CO | 2021-01-01 | 0.600 | ppm | Camarones |
1 | NO | 2021-01-01 | 0.006 | ppm | Camarones |
1 | NO2 | 2021-01-01 | 0.029 | ppm | Camarones |
1 | NOx | 2021-01-01 | 0.034 | ppm | Camarones |
1 | O3 | 2021-01-01 | 0.011 | ppm | Camarones |
34 | SO2 | 2021-10-08 | 0.002 | ppm | Merced |
35 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
36 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
37 | SO2 | 2021-10-08 | 0.000 | ppm | Merced |
38 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
# Show the SINAICA table with English column headers. rename() returns a new
# frame that is only displayed; `sinaica` itself is not modified.
sinaica.rename(
    columns={
        "Parámetro": "Parameter",
        "Fecha": "Date",
        "Valor": "Value",
        "Unidad": "Units",
        "Estacion": "Monitoring Station",
    }
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Parameter | Date | Value | Units | Monitoring Station | |
---|---|---|---|---|---|
1 | CO | 2021-01-01 | 0.600 | ppm | Camarones |
1 | NO | 2021-01-01 | 0.006 | ppm | Camarones |
1 | NO2 | 2021-01-01 | 0.029 | ppm | Camarones |
1 | NOx | 2021-01-01 | 0.034 | ppm | Camarones |
1 | O3 | 2021-01-01 | 0.011 | ppm | Camarones |
... | ... | ... | ... | ... | ... |
34 | SO2 | 2021-10-08 | 0.002 | ppm | Merced |
35 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
36 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
37 | SO2 | 2021-10-08 | 0.000 | ppm | Merced |
38 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
196289 rows × 5 columns
These are the air quality monitoring stations close to "Camarones", the station nearest to our sensor:
# One panel per pollutant (free scales), all stations together: measured value
# against time. The title typo "Pollulant" is corrected to "Pollutant".
(
    ggplot(sinaica.rename(columns={"Parámetro": "Parameters"}))
    + geom_point(aes(x="Fecha", y="Valor", color="Parameters"))
    + facet_wrap("Parameters", scales="free")
    + labs(title="Pollutant Variables Visualization")
    + theme(
        axis_text_x=element_text(angle=90),
        subplots_adjust={'wspace': 0.25, 'hspace': 0.25},
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896202205)>
# Same per-pollutant panels, restricted to rows of the "Camarones" station.
(
    ggplot(
        sinaica[(sinaica["Estacion"] == "Camarones")].rename(
            columns={"Parámetro": "Parameters"}
        )
    )
    + geom_point(aes(x="Fecha", y="Valor", color="Parameters"))
    + facet_wrap("Parameters", scales="free")
    + labs(title="Estación Camarones")
    + theme(
        axis_text_x=element_text(angle=90),
        subplots_adjust={'wspace': 0.25, 'hspace': 0.25},
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757978345469)>
# Persist the filtered SINAICA table; the loading cell falls back to this
# pickle when no CSV files are found.
sinaica.to_pickle("data/sinaica/sinaica.pickle")
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
The sensor reports a reading roughly every 3 seconds, while the government air quality monitoring stations report hourly, so the sensor readings are averaged by hour to make the two sources comparable.
# Wrap the sensor DataFrame in a dplython DplyFrame so it can be used with the
# `>>` pipeline verbs in the following cells.
ddf = DplyFrame(df)
# Preview the first five rows.
ddf.head(5)
temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | datetime | year | month | day | hour | minute | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 21.54 | 777.41 | 43.93 | 151328 | 37.5 | 1 | 2021-02-12 06:04:09.089621067 | 2021 | 2 | 12 | 6 | 4 |
1 | 21.56 | 777.41 | 43.89 | 152702 | 35.6 | 1 | 2021-02-12 06:04:12.087778807 | 2021 | 2 | 12 | 6 | 4 |
2 | 21.53 | 777.41 | 43.97 | 151328 | 37.5 | 1 | 2021-02-12 06:04:15.072475433 | 2021 | 2 | 12 | 6 | 4 |
3 | 21.51 | 777.41 | 44.03 | 151464 | 38.5 | 1 | 2021-02-12 06:04:18.070170164 | 2021 | 2 | 12 | 6 | 4 |
4 | 21.51 | 777.41 | 44.05 | 152425 | 36.9 | 1 | 2021-02-12 06:04:21.061994791 | 2021 | 2 | 12 | 6 | 4 |
# Peek at the first three readings inside a two-minute window.
# `sift` replaces `dfilter`, which dplython reports as deprecated; it is also
# the verb used by the hourly aggregation cell.
(
    ddf
    >> sift(
        X.datetime >= '2021-03-01 06:00:00',
        X.datetime <= '2021-03-01 06:02:00',
    )
    >> dplython.head(3)
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above. /home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/dplython/dplython.py:196: DeprecationWarning: 'dfilter' is deprecated. Please use 'sift' instead.
temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | datetime | year | month | day | hour | minute | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
490473 | 28.06 | 780.7 | 30.37 | 175863 | 198.1 | 3 | 2021-03-01 06:00:01.807887316 | 2021 | 3 | 1 | 6 | 0 |
490474 | 28.05 | 780.7 | 30.38 | 176417 | 197.7 | 3 | 2021-03-01 06:00:04.803511858 | 2021 | 3 | 1 | 6 | 0 |
490475 | 28.05 | 780.7 | 30.41 | 175313 | 198.0 | 3 | 2021-03-01 06:00:07.798833609 | 2021 | 3 | 1 | 6 | 0 |
%%time
# Collapse the raw readings to one row per (year, month, day, hour): numeric
# measurements are averaged and iaqAccuracy is summarized with mode().
ddf = (
    ddf
    # Keep only readings with iaqAccuracy > 0; the rest are treated as
    # incorrect sensor readings and discarded.
    >> sift(X.iaqAccuracy > 0)
    >> group_by(X.year, X.month, X.day, X.hour)
    >> summarize(
        temperature=X.temperature.mean(),
        pressure=X.pressure.mean(),
        humidity=X.humidity.mean(),
        gasResistance=X.gasResistance.mean(),
        IAQ=X.IAQ.mean(),
        iaqAccuracy=X.iaqAccuracy.mode(),
    )
)
ddf
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
CPU times: user 23.1 s, sys: 2.9 s, total: 26 s Wall time: 26 s
year | month | day | hour | temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2021 | 2 | 12 | 6 | 21.557391 | 777.271496 | 44.289745 | 1.439648e+05 | 90.755292 | 1 |
1 | 2021 | 2 | 12 | 7 | 21.153699 | 777.077872 | 43.183375 | 1.497397e+05 | 81.831588 | 1 |
2 | 2021 | 2 | 12 | 8 | 20.653242 | 776.620657 | 42.604564 | 1.537118e+05 | 86.220615 | 1 |
3 | 2021 | 2 | 12 | 9 | 20.406470 | 776.213214 | 42.223995 | 1.491061e+05 | 138.266030 | 1 |
4 | 2021 | 2 | 12 | 10 | 20.051380 | 776.202968 | 42.269584 | 1.428894e+05 | 198.164339 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5223 | 2021 | 9 | 17 | 21 | 26.476714 | 780.191339 | 50.048186 | 1.043343e+06 | 37.047504 | 1 |
5224 | 2021 | 9 | 17 | 22 | 26.849135 | 780.496165 | 50.588394 | 1.050633e+06 | 38.850749 | 1 |
5225 | 2021 | 9 | 17 | 23 | 26.281820 | 782.067298 | 54.032219 | 9.918547e+05 | 81.164589 | 1 |
5226 | 2021 | 9 | 18 | 0 | 26.222995 | 782.853860 | 55.496814 | 9.266802e+05 | 128.776123 | 1 |
5227 | 2021 | 9 | 18 | 1 | 25.928134 | 782.818660 | 56.467943 | 9.204316e+05 | 134.620335 | 1 |
5228 rows × 10 columns
Summary statistics of the sensor reading values
# Summary statistics (count, mean, std, min, quartiles, max) for every column of `ddf`.
ddf.describe()
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
year | month | day | hour | temperature | pressure | humidity | gasResistance | IAQ | iaqAccuracy | |
---|---|---|---|---|---|---|---|---|---|---|
count | 5228.0 | 5228.000000 | 5228.000000 | 5228.000000 | 5228.000000 | 5228.000000 | 5228.000000 | 5.228000e+03 | 5228.000000 | 5228.000000 |
mean | 2021.0 | 5.509946 | 15.633512 | 11.506121 | 24.349677 | 781.624137 | 43.442600 | 6.950943e+05 | 157.429759 | 2.551071 |
std | 0.0 | 2.093404 | 8.655932 | 6.921888 | 2.489229 | 2.187106 | 12.554327 | 3.111298e+05 | 69.918421 | 0.817749 |
min | 2021.0 | 2.000000 | 1.000000 | 0.000000 | 17.282542 | 774.004780 | 8.750125 | 9.540458e+04 | 22.751331 | 1.000000 |
25% | 2021.0 | 4.000000 | 8.000000 | 6.000000 | 22.518177 | 780.216749 | 32.662378 | 5.044701e+05 | 98.951158 | 3.000000 |
50% | 2021.0 | 6.000000 | 16.000000 | 12.000000 | 24.198175 | 781.726205 | 43.999339 | 6.915713e+05 | 171.485025 | 3.000000 |
75% | 2021.0 | 7.000000 | 23.000000 | 18.000000 | 26.116874 | 783.156527 | 54.322785 | 8.638399e+05 | 219.580799 | 3.000000 |
max | 2021.0 | 9.000000 | 31.000000 | 23.000000 | 31.066215 | 787.963968 | 70.166841 | 2.716493e+06 | 255.292928 | 3.000000 |
# Display the SINAICA table again with English column headers; the renamed
# frame is not assigned, so `sinaica` keeps its original column names.
sinaica.rename(
    columns={
        "Parámetro": "Parameter",
        "Fecha": "Date",
        "Valor": "Value",
        "Unidad": "Units",
        "Estacion": "Monitoring Station",
    }
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Parameter | Date | Value | Units | Monitoring Station | |
---|---|---|---|---|---|
1 | CO | 2021-01-01 | 0.600 | ppm | Camarones |
1 | NO | 2021-01-01 | 0.006 | ppm | Camarones |
1 | NO2 | 2021-01-01 | 0.029 | ppm | Camarones |
1 | NOx | 2021-01-01 | 0.034 | ppm | Camarones |
1 | O3 | 2021-01-01 | 0.011 | ppm | Camarones |
... | ... | ... | ... | ... | ... |
34 | SO2 | 2021-10-08 | 0.002 | ppm | Merced |
35 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
36 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
37 | SO2 | 2021-10-08 | 0.000 | ppm | Merced |
38 | SO2 | 2021-10-08 | 0.001 | ppm | Merced |
196289 rows × 5 columns
# Reshape the long SINAICA table into a wide one: one row per timestamp and one
# column per (station, pollutant) pair holding the measured value.
dsinaica = DplyFrame(sinaica)
dsinaica2 = dsinaica.pivot_table(
    index=["Fecha"],
    columns=["Estacion", "Parámetro"],
    values="Valor",
)
# Flatten the two-level (station, pollutant) column labels into
# "station_pollutant" strings.
dsinaica2.columns = ["_".join(pair).strip() for pair in dsinaica2.columns]
# Copy the timestamp index into a leading "Fecha" column, then replace the
# index with a plain 0..n-1 range.
dsinaica2.insert(0, "Fecha", dsinaica2.index)
dsinaica2.reset_index(drop=True, inplace=True)
# Display only: the renamed frame is not assigned back.
dsinaica2.rename(
    columns={
        "Parámetro": "Parameter",
        "Fecha": "Date",
        "Valor": "Value",
        "Unidad": "Units",
        "Estacion": "Monitoring Station",
    }
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Date | Camarones_CO | Camarones_NO | Camarones_NO2 | Camarones_NOx | Camarones_O3 | Camarones_PM10 | Camarones_PM2.5 | Camarones_SO2 | FES Acatlán_CO | ... | Miguel Hidalgo_O3 | Miguel Hidalgo_SO2 | Tlalnepantla_CO | Tlalnepantla_NO | Tlalnepantla_NO2 | Tlalnepantla_NOx | Tlalnepantla_O3 | Tlalnepantla_PM10 | Tlalnepantla_PM2.5 | Tlalnepantla_SO2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-01-01 00:00:00 | 0.600000 | 0.006000 | 0.029000 | 0.034 | 0.011000 | NaN | NaN | 0.002000 | 0.400000 | ... | 0.009 | 0.003 | 0.6 | NaN | 0.030 | 0.034 | 0.012 | 37.0 | 19.0 | 0.002 |
1 | 2021-01-01 01:00:00 | 1.000000 | 0.021000 | 0.038000 | 0.059 | 0.002000 | NaN | NaN | 0.002000 | 0.600000 | ... | 0.006 | 0.003 | 0.6 | NaN | 0.026 | 0.029 | 0.013 | 42.0 | 29.0 | 0.003 |
2 | 2021-01-01 02:00:00 | 0.800000 | 0.013000 | 0.035000 | 0.049 | 0.003000 | NaN | NaN | 0.001000 | 0.900000 | ... | 0.003 | 0.002 | 0.7 | NaN | 0.032 | 0.036 | 0.006 | 58.0 | 43.0 | 0.002 |
3 | 2021-01-01 03:00:00 | 1.000000 | 0.031000 | 0.034000 | 0.065 | 0.002000 | NaN | NaN | 0.001000 | 0.800000 | ... | 0.004 | 0.002 | 0.7 | NaN | 0.033 | 0.039 | 0.004 | 59.0 | 41.0 | 0.002 |
4 | 2021-01-01 04:00:00 | 0.600000 | 0.005000 | 0.029000 | 0.034 | 0.005000 | NaN | NaN | 0.001000 | 1.000000 | ... | 0.006 | 0.002 | 0.7 | NaN | 0.032 | 0.038 | 0.004 | 64.0 | 46.0 | 0.002 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2347 | 2021-10-04 00:00:00 | 0.441667 | 0.008292 | 0.015833 | NaN | 0.017167 | 22.173913 | 10.952381 | 0.000125 | 0.315000 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2348 | 2021-10-05 00:00:00 | 0.490000 | 0.010000 | 0.017000 | NaN | 0.013947 | 22.142857 | 8.736842 | 0.000000 | 0.466667 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2349 | 2021-10-06 00:00:00 | 0.542857 | 0.007571 | 0.022571 | NaN | 0.014333 | 25.150000 | 10.150000 | 0.000000 | 0.347619 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2350 | 2021-10-07 00:00:00 | 0.582609 | 0.011565 | 0.023130 | NaN | 0.021304 | 33.500000 | 15.428571 | 0.001783 | 0.447826 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2351 | 2021-10-08 00:00:00 | 0.738889 | 0.023778 | 0.026778 | NaN | 0.019667 | 41.266667 | 18.800000 | 0.010500 | 0.566667 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2352 rows × 45 columns
def describe_with_na(dataframe):
    """Return ``describe()`` statistics per column plus a missing-value count.

    Args:
        dataframe: pandas DataFrame to summarize. It is not modified.

    Returns:
        A DataFrame with one row per column of ``dataframe``. Its first
        column, ``"Estacion"``, repeats the row label; it is followed by the
        ``describe()`` statistics and a final ``"NAs"`` column holding the
        number of missing values. Columns that ``describe()`` leaves out
        still get a row, with NaN statistics.
    """
    na_counts = dataframe.isna().sum().rename("NAs")
    summary = dataframe.describe()
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # concatenating a one-row frame adds the same "NAs" row.
    summary = pd.concat([summary, na_counts.to_frame().T])
    summary = summary.T
    summary.insert(0, "Estacion", summary.index)
    return summary
# Summarize the wide SINAICA table: describe() statistics plus an NA count for
# each column, then display the result.
dsinaica2_describe = describe_with_na(dsinaica2)
dsinaica2_describe
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Estacion | count | mean | std | min | 25% | 50% | 75% | max | NAs | |
---|---|---|---|---|---|---|---|---|---|---|
Camarones_CO | Camarones_CO | 2241.0 | 0.767037 | 0.412628 | 0.000000 | 0.500000 | 0.700000 | 0.9000 | 3.200 | 111.0 |
Camarones_NO | Camarones_NO | 2227.0 | 0.024599 | 0.043639 | 0.000000 | 0.003000 | 0.007000 | 0.0260 | 0.432 | 125.0 |
Camarones_NO2 | Camarones_NO2 | 2227.0 | 0.031139 | 0.015144 | 0.003000 | 0.020000 | 0.029000 | 0.0400 | 0.111 | 125.0 |
Camarones_NOx | Camarones_NOx | 2042.0 | 0.056961 | 0.054418 | 0.004000 | 0.022000 | 0.039000 | 0.0710 | 0.499 | 310.0 |
Camarones_O3 | Camarones_O3 | 2233.0 | 0.026094 | 0.022991 | 0.001000 | 0.005000 | 0.021000 | 0.0390 | 0.103 | 119.0 |
Camarones_PM10 | Camarones_PM10 | 1749.0 | 56.638261 | 26.729443 | 0.000000 | 40.000000 | 54.000000 | 70.0000 | 437.000 | 603.0 |
Camarones_PM2.5 | Camarones_PM2.5 | 1765.0 | 24.872841 | 12.620210 | 0.000000 | 16.000000 | 24.000000 | 32.0000 | 126.000 | 587.0 |
Camarones_SO2 | Camarones_SO2 | 2190.0 | 0.006284 | 0.012464 | -0.000048 | 0.001217 | 0.003000 | 0.0050 | 0.162 | 162.0 |
FES Acatlán_CO | FES Acatlán_CO | 2200.0 | 0.631953 | 0.359163 | 0.100000 | 0.400000 | 0.547913 | 0.8000 | 2.900 | 152.0 |
FES Acatlán_NO | FES Acatlán_NO | 1477.0 | 0.014508 | 0.023519 | 0.000000 | 0.002000 | 0.005000 | 0.0160 | 0.215 | 875.0 |
FES Acatlán_NO2 | FES Acatlán_NO2 | 2195.0 | 0.025301 | 0.013814 | 0.002000 | 0.015279 | 0.022280 | 0.0320 | 0.092 | 157.0 |
FES Acatlán_NOx | FES Acatlán_NOx | 2199.0 | 0.041105 | 0.034157 | 0.002000 | 0.018000 | 0.029000 | 0.0510 | 0.260 | 153.0 |
FES Acatlán_O3 | FES Acatlán_O3 | 2015.0 | 0.033841 | 0.026037 | 0.003000 | 0.013000 | 0.027000 | 0.0480 | 0.137 | 337.0 |
FES Acatlán_PM10 | FES Acatlán_PM10 | 2158.0 | 46.423940 | 31.656199 | 0.000000 | 26.729067 | 41.000000 | 60.0000 | 388.000 | 194.0 |
FES Acatlán_SO2 | FES Acatlán_SO2 | 2195.0 | 0.006255 | 0.009587 | 0.000000 | 0.002000 | 0.003000 | 0.0060 | 0.136 | 157.0 |
Gustavo A. Madero_NO2 | Gustavo A. Madero_NO2 | 2085.0 | 0.025859 | 0.014361 | 0.003000 | 0.013000 | 0.026000 | 0.0360 | 0.081 | 267.0 |
Gustavo A. Madero_O3 | Gustavo A. Madero_O3 | 2076.0 | 0.031555 | 0.029610 | 0.001000 | 0.004000 | 0.023000 | 0.0510 | 0.128 | 276.0 |
Gustavo A. Madero_PM10 | Gustavo A. Madero_PM10 | 2035.0 | 52.755283 | 27.911189 | 0.000000 | 34.500000 | 50.000000 | 67.0000 | 495.000 | 317.0 |
Gustavo A. Madero_PM2.5 | Gustavo A. Madero_PM2.5 | 2026.0 | 23.785291 | 13.182547 | 0.000000 | 14.000000 | 22.000000 | 31.0000 | 117.000 | 326.0 |
La Presa_CO | La Presa_CO | 2034.0 | 0.933628 | 0.513861 | 0.100000 | 0.600000 | 0.800000 | 1.1000 | 3.500 | 318.0 |
La Presa_O3 | La Presa_O3 | 1863.0 | 0.029086 | 0.025911 | 0.001000 | 0.006000 | 0.024000 | 0.0455 | 0.118 | 489.0 |
La Presa_SO2 | La Presa_SO2 | 2017.0 | 0.005163 | 0.009749 | 0.000000 | 0.001000 | 0.002000 | 0.0050 | 0.118 | 335.0 |
Merced_CO | Merced_CO | 2282.0 | 1.120228 | 0.409799 | 0.117391 | 0.852444 | 1.000000 | 1.3000 | 3.900 | 70.0 |
Merced_NO | Merced_NO | 1381.0 | 0.021361 | 0.034552 | 0.000000 | 0.003000 | 0.008000 | 0.0220 | 0.318 | 971.0 |
Merced_NO2 | Merced_NO2 | 2264.0 | 0.032059 | 0.013544 | 0.005000 | 0.022000 | 0.031000 | 0.0400 | 0.087 | 88.0 |
Merced_NOx | Merced_NOx | 2264.0 | 0.055272 | 0.042945 | 0.006000 | 0.027000 | 0.042000 | 0.0670 | 0.386 | 88.0 |
Merced_O3 | Merced_O3 | 2270.0 | 0.028855 | 0.027825 | 0.000000 | 0.005000 | 0.022000 | 0.0440 | 0.140 | 82.0 |
Merced_PM10 | Merced_PM10 | 2321.0 | 52.696149 | 24.127886 | 0.000000 | 37.000000 | 51.000000 | 66.0000 | 411.000 | 31.0 |
Merced_PM2.5 | Merced_PM2.5 | 2316.0 | 26.268777 | 12.682280 | 0.000000 | 18.000000 | 25.000000 | 33.0000 | 122.000 | 36.0 |
Merced_SO2 | Merced_SO2 | 2288.0 | 0.006037 | 0.009778 | 0.000000 | 0.002000 | 0.003000 | 0.0060 | 0.146 | 64.0 |
Miguel Hidalgo_CO | Miguel Hidalgo_CO | 2090.0 | 0.544785 | 0.343697 | 0.000000 | 0.300000 | 0.500000 | 0.7000 | 2.600 | 262.0 |
Miguel Hidalgo_NO | Miguel Hidalgo_NO | 2077.0 | 0.021710 | 0.038798 | 0.000000 | 0.002000 | 0.005000 | 0.0220 | 0.368 | 275.0 |
Miguel Hidalgo_NO2 | Miguel Hidalgo_NO2 | 2078.0 | 0.029400 | 0.013166 | 0.004000 | 0.019000 | 0.028000 | 0.0390 | 0.086 | 274.0 |
Miguel Hidalgo_NOx | Miguel Hidalgo_NOx | 2078.0 | 0.051113 | 0.046729 | 0.005000 | 0.022000 | 0.034000 | 0.0620 | 0.395 | 274.0 |
Miguel Hidalgo_O3 | Miguel Hidalgo_O3 | 2082.0 | 0.033624 | 0.028588 | 0.002000 | 0.009000 | 0.027000 | 0.0500 | 0.145 | 270.0 |
Miguel Hidalgo_SO2 | Miguel Hidalgo_SO2 | 2081.0 | 0.005228 | 0.008925 | 0.000000 | 0.001000 | 0.002000 | 0.0050 | 0.099 | 271.0 |
Tlalnepantla_CO | Tlalnepantla_CO | 2023.0 | 0.740287 | 0.360615 | 0.100000 | 0.500000 | 0.600000 | 0.9000 | 2.900 | 329.0 |
Tlalnepantla_NO | Tlalnepantla_NO | 1039.0 | 0.020687 | 0.031055 | 0.000000 | 0.003000 | 0.007000 | 0.0220 | 0.219 | 1313.0 |
Tlalnepantla_NO2 | Tlalnepantla_NO2 | 1751.0 | 0.031346 | 0.014525 | 0.004000 | 0.021000 | 0.030000 | 0.0390 | 0.097 | 601.0 |
Tlalnepantla_NOx | Tlalnepantla_NOx | 1752.0 | 0.052864 | 0.040767 | 0.005000 | 0.026000 | 0.039000 | 0.0660 | 0.274 | 600.0 |
Tlalnepantla_O3 | Tlalnepantla_O3 | 2077.0 | 0.027766 | 0.025335 | 0.000000 | 0.007000 | 0.020000 | 0.0430 | 0.125 | 275.0 |
Tlalnepantla_PM10 | Tlalnepantla_PM10 | 1989.0 | 48.649573 | 27.959538 | 0.000000 | 33.000000 | 45.000000 | 59.0000 | 423.000 | 363.0 |
Tlalnepantla_PM2.5 | Tlalnepantla_PM2.5 | 1973.0 | 22.273188 | 12.075484 | 0.000000 | 14.000000 | 21.000000 | 29.0000 | 90.000 | 379.0 |
Tlalnepantla_SO2 | Tlalnepantla_SO2 | 2066.0 | 0.008569 | 0.014821 | 0.001000 | 0.002000 | 0.004000 | 0.0080 | 0.179 | 286.0 |
Fecha | Fecha | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 |
# Reduce the summary to the missing-value counts per station/pollutant column,
# dropping the "Fecha" row.
dsinaica2_describe = dsinaica2_describe.loc[
    dsinaica2_describe["Estacion"] != "Fecha", ["Estacion", "NAs"]
].copy()
# Move the row labels into a column named "" and use a plain integer index.
dsinaica2_describe.index.name = ""
dsinaica2_describe = dsinaica2_describe.reset_index()
# sort_values(..., inplace=True) on a column selection only sorted a temporary
# copy (hence the SettingWithCopyWarning) and left the table unsorted. Assign
# the sorted result so the rows really are ordered by descending NA count.
dsinaica2_describe = (
    dsinaica2_describe
    .sort_values("NAs", ascending=False)
    .reset_index(drop=True)
)
dsinaica2_describe
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above. /home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Estacion | NAs | ||
---|---|---|---|
0 | Camarones_CO | Camarones_CO | 111.0 |
1 | Camarones_NO | Camarones_NO | 125.0 |
2 | Camarones_NO2 | Camarones_NO2 | 125.0 |
3 | Camarones_NOx | Camarones_NOx | 310.0 |
4 | Camarones_O3 | Camarones_O3 | 119.0 |
5 | Camarones_PM10 | Camarones_PM10 | 603.0 |
6 | Camarones_PM2.5 | Camarones_PM2.5 | 587.0 |
7 | Camarones_SO2 | Camarones_SO2 | 162.0 |
8 | FES Acatlán_CO | FES Acatlán_CO | 152.0 |
9 | FES Acatlán_NO | FES Acatlán_NO | 875.0 |
10 | FES Acatlán_NO2 | FES Acatlán_NO2 | 157.0 |
11 | FES Acatlán_NOx | FES Acatlán_NOx | 153.0 |
12 | FES Acatlán_O3 | FES Acatlán_O3 | 337.0 |
13 | FES Acatlán_PM10 | FES Acatlán_PM10 | 194.0 |
14 | FES Acatlán_SO2 | FES Acatlán_SO2 | 157.0 |
15 | Gustavo A. Madero_NO2 | Gustavo A. Madero_NO2 | 267.0 |
16 | Gustavo A. Madero_O3 | Gustavo A. Madero_O3 | 276.0 |
17 | Gustavo A. Madero_PM10 | Gustavo A. Madero_PM10 | 317.0 |
18 | Gustavo A. Madero_PM2.5 | Gustavo A. Madero_PM2.5 | 326.0 |
19 | La Presa_CO | La Presa_CO | 318.0 |
20 | La Presa_O3 | La Presa_O3 | 489.0 |
21 | La Presa_SO2 | La Presa_SO2 | 335.0 |
22 | Merced_CO | Merced_CO | 70.0 |
23 | Merced_NO | Merced_NO | 971.0 |
24 | Merced_NO2 | Merced_NO2 | 88.0 |
25 | Merced_NOx | Merced_NOx | 88.0 |
26 | Merced_O3 | Merced_O3 | 82.0 |
27 | Merced_PM10 | Merced_PM10 | 31.0 |
28 | Merced_PM2.5 | Merced_PM2.5 | 36.0 |
29 | Merced_SO2 | Merced_SO2 | 64.0 |
30 | Miguel Hidalgo_CO | Miguel Hidalgo_CO | 262.0 |
31 | Miguel Hidalgo_NO | Miguel Hidalgo_NO | 275.0 |
32 | Miguel Hidalgo_NO2 | Miguel Hidalgo_NO2 | 274.0 |
33 | Miguel Hidalgo_NOx | Miguel Hidalgo_NOx | 274.0 |
34 | Miguel Hidalgo_O3 | Miguel Hidalgo_O3 | 270.0 |
35 | Miguel Hidalgo_SO2 | Miguel Hidalgo_SO2 | 271.0 |
36 | Tlalnepantla_CO | Tlalnepantla_CO | 329.0 |
37 | Tlalnepantla_NO | Tlalnepantla_NO | 1313.0 |
38 | Tlalnepantla_NO2 | Tlalnepantla_NO2 | 601.0 |
39 | Tlalnepantla_NOx | Tlalnepantla_NOx | 600.0 |
40 | Tlalnepantla_O3 | Tlalnepantla_O3 | 275.0 |
41 | Tlalnepantla_PM10 | Tlalnepantla_PM10 | 363.0 |
42 | Tlalnepantla_PM2.5 | Tlalnepantla_PM2.5 | 379.0 |
43 | Tlalnepantla_SO2 | Tlalnepantla_SO2 | 286.0 |
# Bar chart of missing-record counts per station/pollutant series, with bars
# ordered by the count. The title typo "Pollulant" is corrected to "Pollutant".
(
    ggplot(dsinaica2_describe)
    + geom_col(
        aes(x="reorder(Estacion, (NAs))", y="NAs", fill="Estacion"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Estación",
        y="# Missing Records",
        title="Missing Records Histogram\n"
              + "by Pollutant and Government Station",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757977309781)>
# Bar chart of the share of missing records for every station/pollutant
# series: NAs is divided by the number of rows of dsinaica2 and scaled to a
# percentage inside the aes() expression. Bars are ordered with
# reorder(Estacion, NAs).
(
    ggplot(dsinaica2_describe)
    + geom_col(
        aes(
            x="reorder(Estacion, (NAs))",
            y="100*NAs/dsinaica2.shape[0]",
            fill="Estacion",
        ),
        show_legend=False,
    )
    # series names are long, so rotate the tick labels to keep them legible
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Estación",
        y="% Missing Records",
        title="Percentages of Missing Records\n"
        + "by Pollutant and Monitoring Station",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757789809049)>
# Bar chart of missing-record counts restricted to the series whose name
# starts with "Camarones" (str.match anchors the pattern at the start of
# the string). Bars are ordered with reorder(Estacion, NAs).
(
    ggplot(dsinaica2_describe[dsinaica2_describe["Estacion"].str.match("Camarones")])
    + geom_col(
        aes(x="reorder(Estacion, (NAs))", y="NAs", fill="Estacion"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Pollutant",
        y="# Missing Records",
        title="Histogram of Missing Records\n"
        + "by Pollutant in Camarones Monitoring Station",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757977295893)>
# Bar chart of the percentage of missing records for the series whose name
# starts with "Camarones". The percentage is computed inside aes() as
# NAs over the number of rows of dsinaica2.
(
    ggplot(dsinaica2_describe[dsinaica2_describe["Estacion"].str.match("Camarones")])
    + geom_col(
        aes(
            x="reorder(Estacion, (NAs))",
            y="100*NAs/dsinaica2.shape[0]",
            fill="Estacion",
        ),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Pollutant",
        y="% Missing Records",
        title="Percentage of Missing Records\n"
        + "by Pollutants in Camarones",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757895815993)>
# Display the missing-record rows whose series name starts with "Camarones".
dsinaica2_describe.loc[dsinaica2_describe["Estacion"].str.match("Camarones")]
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Estacion | NAs | ||
---|---|---|---|
0 | Camarones_CO | Camarones_CO | 111.0 |
1 | Camarones_NO | Camarones_NO | 125.0 |
2 | Camarones_NO2 | Camarones_NO2 | 125.0 |
3 | Camarones_NOx | Camarones_NOx | 310.0 |
4 | Camarones_O3 | Camarones_O3 | 119.0 |
5 | Camarones_PM10 | Camarones_PM10 | 603.0 |
6 | Camarones_PM2.5 | Camarones_PM2.5 | 587.0 |
7 | Camarones_SO2 | Camarones_SO2 | 162.0 |
# Display the missing-record rows whose series name starts with "Merced".
dsinaica2_describe.loc[dsinaica2_describe["Estacion"].str.match("Merced")]
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Estacion | NAs | ||
---|---|---|---|
22 | Merced_CO | Merced_CO | 70.0 |
23 | Merced_NO | Merced_NO | 971.0 |
24 | Merced_NO2 | Merced_NO2 | 88.0 |
25 | Merced_NOx | Merced_NOx | 88.0 |
26 | Merced_O3 | Merced_O3 | 82.0 |
27 | Merced_PM10 | Merced_PM10 | 31.0 |
28 | Merced_PM2.5 | Merced_PM2.5 | 36.0 |
29 | Merced_SO2 | Merced_SO2 | 64.0 |
# Bar chart of missing-record counts restricted to the series whose name
# starts with "Merced". Bars are ordered with reorder(Estacion, NAs).
(
    ggplot(dsinaica2_describe[dsinaica2_describe["Estacion"].str.match("Merced")])
    + geom_col(
        aes(x="reorder(Estacion, (NAs))", y="NAs", fill="Estacion"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Pollutant",
        y="# Missing Records",
        title="Histogram of Missing Records\n"
        + "by Pollutant in La Merced",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896201989)>
# Bar chart of missing-record counts restricted to the series whose name
# starts with "Miguel Hidalgo". Bars are ordered with reorder(Estacion, NAs).
(
    ggplot(dsinaica2_describe[dsinaica2_describe["Estacion"].str.match("Miguel Hidalgo")])
    + geom_col(
        aes(x="reorder(Estacion, (NAs))", y="NAs", fill="Estacion"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Pollutant",
        y="# Missing Records",
        title="Histogram of Missing Records\n"
        + "by Pollutant in Miguel Hidalgo",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757977289453)>
# Persist the dsinaica2 frame to disk so later steps can reload it.
dsinaica2.to_pickle(path="data/sinaica/dsinaica.pickle")
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
OpenWeatherMap Data
%%time
#read OpenWeatherData
weather = pd.read_csv("data/openweathermap/2f101ea00e7759ea8723b848ac8b18d0.csv")
#get right date time
weather["dt"] = pd.to_datetime(weather["dt"], unit='s')
#select columns to reduce size
weather = weather[["dt", "temp", "feels_like", "temp_min", "temp_max",
"pressure", "humidity", "wind_speed", "wind_deg", "rain_1h", "rain_3h",
"clouds_all", "weather_id", "weather_main"]]
#impute rain data with zeroes instead of NaN
weather.fillna(value=0, inplace=True)
#select only the right time frame
weather = weather[weather.dt >= df.datetime.min()].reset_index(drop=True)
weather.head(2)
CPU times: user 1.01 s, sys: 88 ms, total: 1.1 s Wall time: 1.1 s
dt | temp | feels_like | temp_min | temp_max | pressure | humidity | wind_speed | wind_deg | rain_1h | rain_3h | clouds_all | weather_id | weather_main | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-02-12 07:00:00 | 13.87 | 12.46 | 5.21 | 13.92 | 1020 | 44 | 0.0 | 0 | 0.0 | 0.0 | 1 | 800 | Clear |
1 | 2021-02-12 08:00:00 | 12.81 | 11.37 | 4.21 | 12.92 | 1020 | 47 | 0.0 | 0 | 0.0 | 0.0 | 1 | 800 | Clear |
# Scatter plot of temperature over time; points are coloured by their own
# temperature value and the legend is hidden.
(
    ggplot(weather)
    + geom_point(
        aes(x="dt", y="temp", colour="temp"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Date",
        # unit label matches the "Temp (C)" axis of the temperature histogram
        y="Temp (C)",
        title="Scatter Plot of Weather\n"
        + "Conditions: Temperature",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896490037)>
# Distribution of temperature readings, one bin per unit of "temp".
(
    ggplot(data=weather, mapping=aes(x="temp"))
    + geom_histogram(binwidth=1, fill="blue", show_legend=False)
    + xlab("Temp (C)")
    + ylab("Freq")
    + ggtitle("Histogram Plot of Weather\nConditions: Temperature")
    + theme(axis_text_x=element_text(rotation=90))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896639785)>
# Pressure over time; each point is coloured by its own pressure value and
# the legend is hidden.
(
    ggplot(data=weather, mapping=aes(x="dt", y="pressure", colour="pressure"))
    + geom_point(show_legend=False)
    + xlab("Date")
    + ylab("Pressure (hPa)")
    + ggtitle("Scatter Plot of Weather\nConditions: Pressure")
    + theme(axis_text_x=element_text(rotation=90))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896637977)>
# Distribution of pressure readings using 34 bins.
(
    ggplot(data=weather, mapping=aes(x="pressure"))
    + geom_histogram(bins=34, fill="blue", show_legend=False)
    + xlab("Pressure (hPa)")
    + ylab("Freq")
    + ggtitle("Histogram Plot of Weather\nConditions: Pressure")
    + theme(axis_text_x=element_text(rotation=90))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896489301)>
# Humidity over time; each point is coloured by its own humidity value and
# the legend is hidden.
(
    ggplot(data=weather, mapping=aes(x="dt", y="humidity", colour="humidity"))
    + geom_point(show_legend=False)
    + xlab("Date")
    + ylab("Humidity (% rh)")
    + ggtitle("Scatter Plot of Weather\nConditions: Humidity")
    + theme(axis_text_x=element_text(rotation=90))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896637225)>
# Distribution of humidity readings using 25 bins. Only x is mapped, like
# the other weather histograms: an outline colour tied to the raw humidity
# value has no meaning once the readings are binned into counts.
(
    ggplot(weather)
    + geom_histogram(
        aes(x="humidity"),
        show_legend=False,
        fill="blue",
        bins=25,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Humidity (% rh)",
        y="Freq",
        title="Histogram Plot of Weather\n"
        + "Conditions: Humidity",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757973545229)>
# Scatter plot of wind speed over time; points are coloured by their own
# wind speed value and the legend is hidden.
(
    ggplot(weather)
    + geom_point(
        aes(x="dt", y="wind_speed", colour="wind_speed"),
        show_legend=False,
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Date",
        y="Wind Speed (m/s)",
        title="Scatter Plot of Weather\n"
        + "Conditions: Wind Speed",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757896494013)>
# Distribution of wind speed readings using 43 bins.
(
    ggplot(data=weather, mapping=aes(x="wind_speed"))
    + geom_histogram(fill="blue", bins=43, show_legend=False)
    + xlab("Wind Speed (m/s)")
    + ylab("Freq")
    + ggtitle("Histogram Plot of Weather\nConditions: Wind Speed")
    + theme(axis_text_x=element_text(rotation=90))
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757982114029)>
# Frequency of each general weather condition. "weather_main" holds text
# labels (e.g. "Clear"), so the occurrences of each label are counted with
# geom_bar; numeric binning (geom_histogram with a bin count) does not
# apply to a categorical axis.
(
    ggplot(weather)
    + geom_bar(
        aes(x="weather_main"),
        show_legend=False,
        fill="blue",
    )
    + theme(axis_text_x=element_text(rotation=90))
    + labs(
        x="Main Weather Conditions",
        y="Freq",
        title="Histogram Plot of Weather\n"
        + "Conditions: General Weather Conditions",
    )
)
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
<ggplot: (8757549624317)>
# Persist the cleaned weather frame to disk so later steps can reload it.
weather.to_pickle(path="data/openweathermap/weather.pickle.gz")
/home/jaa6766/.conda/envs/cuda/lib/python3.7/site-packages/ipykernel/ipkernel.py:287: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
Bosch BME680 Datasheet. 2021.
Mancuso, Daniel. Indoor Air Quality Monitor | Hackster.io. 2019.