QuestDB Logo

Try our time series database in this easy-to-launch notebook

QuestDB is a high-performance time series database that supports SQL queries and can ingest data at millions of rows per second.

We put together this notebook to demonstrate a typical use of QuestDB with Python tooling such as Pandas, Jupyter and Matplotlib. We've included a large dataset of grid energy usage and forecasts at 15-minute intervals.

If you'd like to run this notebook interactively, pick one of the two commands below and paste it into your terminal:

  • Python:
    • python3 -c "import urllib.request as w;exec(w.urlopen('https://dl.questdb.io/play/run.py').read())"
    • The script creates a virtual environment in a temporary directory, which is cleaned up on exit.
  • Docker:
    • docker run -p 8888:8888 -p 8812:8812 -p 9009:9009 -p 9000:9000 questdb/play:1.0.0

Either command lets you run this notebook interactively without leaving any software installed on your system.

Exploring the dataset

The Open Power System Data (OPSD) project hosts aggregated data about the energy landscape in several European countries. One of the datasets they curate contains energy data at 15-minute intervals for Austria, Belgium, Germany, Hungary, Luxembourg, and the Netherlands. The original can be found at https://data.open-power-system-data.org/time_series/. We are providing a derivative dataset containing only two years of data (413,383 rows) and four columns:

  • Timestamp of the measurement, in UTC
  • Actual total load in MW
  • Forecasted energy for the same period, one day ahead, in MW
  • Country code
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg
import questdb.ingress as qi
In [2]:
http_port = 9000  # Web Console and REST API
ilp_port = 9009  # Fast data ingestion port 
pg_port = 8812  # PostgreSQL-compatible endpoint
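
The http_port defined above also serves QuestDB's REST API. It isn't used in the rest of this notebook, but as a quick aside, here is a small sketch (standard library only) that runs a query over plain HTTP via the /exec endpoint:

import json, urllib.parse, urllib.request

# run a trivial query against QuestDB's REST /exec endpoint and print the JSON response
query = urllib.parse.quote("SELECT now()")
with urllib.request.urlopen(f"http://localhost:{http_port}/exec?query={query}") as resp:
    print(json.load(resp))
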
In [3]:
df = pd.read_parquet('energy.parquet.gzip')
In [4]:
df.dtypes #column names and types
Out[4]:
timestamp        datetime64[ns]
load_actual             float64
load_forecast           float64
country_code             object
dtype: object
In [5]:
df
Out[5]:
timestamp load_actual load_forecast country_code
105220 2018-01-01 00:00:00 6060.40 6497.24 AT
105221 2018-01-01 00:15:00 5971.20 6399.35 AT
105222 2018-01-01 00:30:00 5875.20 6321.08 AT
105223 2018-01-01 00:45:00 5830.80 6252.09 AT
105224 2018-01-01 01:00:00 5872.00 6328.14 AT
... ... ... ... ...
175295 2019-12-31 22:45:00 11284.98 9672.61 NL
175296 2019-12-31 23:00:00 11185.59 9573.25 NL
175297 2019-12-31 23:15:00 11207.35 9593.16 NL
175298 2019-12-31 23:30:00 11237.45 9583.39 NL
175299 2019-12-31 23:45:00 11228.79 9500.96 NL

413383 rows × 4 columns

Inserting your dataframe into QuestDB

To ingest your dataframe into QuestDB, you use the Sender object. You need to specify the table name and the name of the column to use as the designated timestamp; if you don't specify one, the ingestion time is used. If the table doesn't exist, it is created automatically.

In [6]:
with qi.Sender('localhost', ilp_port) as sender:
  sender.dataframe(df, table_name='energy_total', at='timestamp')
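
The Sender can also send rows one at a time instead of a whole dataframe. The sketch below illustrates that API with a hypothetical table name (energy_manual) and made-up values, so it doesn't interfere with the data ingested above:

import datetime as dt

with qi.Sender('localhost', ilp_port) as sender:
    # symbols hold string tags, columns hold the numeric values, and 'at' sets the
    # designated timestamp (server ingestion time is used if 'at' is omitted)
    sender.row(
        'energy_manual',  # hypothetical table, created automatically on first write
        symbols={'country_code': 'AT'},
        columns={'load_actual': 6060.4, 'load_forecast': 6497.2},
        at=dt.datetime(2018, 1, 1, tzinfo=dt.timezone.utc))
    sender.flush()  # also flushed automatically when the context manager exits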
  

Reading data

In [7]:
plt.figure(figsize=(12,8)) # here we plot the raw data straight from the Pandas dataframe
df.set_index('timestamp').groupby('country_code')['load_actual'].plot(legend=True, ylabel="MW")
Out[7]:
country_code
AT    Axes(0.125,0.11;0.775x0.77)
BE    Axes(0.125,0.11;0.775x0.77)
DE    Axes(0.125,0.11;0.775x0.77)
HU    Axes(0.125,0.11;0.775x0.77)
LU    Axes(0.125,0.11;0.775x0.77)
NL    Axes(0.125,0.11;0.775x0.77)
Name: load_actual, dtype: object

If you want to filter, aggregate, or downsample data, a good way to do it is to connect to QuestDB over the PostgreSQL wire protocol with Psycopg.

In [8]:
qdb_conn_params = dict(user="admin", password="quest", host='127.0.0.1', port=pg_port, dbname="qdb")
In [9]:
with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("""\
                    SELECT timestamp, 
                    country_code, 
                    SUM(load_actual) as total_mw,                      
                    avg(load_actual) as average_mw 
                    FROM energy_total 
                    SAMPLE BY 6M -- we downsample/aggregate into 6-month periods
                    """)
        records = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

res_df = pd.DataFrame(records, columns= column_names )
In [10]:
res_df #now we have only 24 rows (4 six-month periods × 6 countries)
Out[10]:
timestamp country_code total_mw average_mw
0 2018-01-01 AT 2.571409e+08 7399.313283
1 2018-01-01 BE 3.545552e+08 10202.440696
2 2018-01-01 DE 1.998554e+09 57509.038702
3 2018-01-01 HU 1.737028e+08 4998.354505
4 2018-01-01 LU 1.637962e+07 471.328803
5 2018-01-01 NL 4.557404e+08 13114.076830
6 2018-07-01 AT 2.520463e+08 7134.462568
7 2018-07-01 BE 3.453483e+08 9775.484108
8 2018-07-01 DE 1.981274e+09 56388.726173
9 2018-07-01 HU 1.727741e+08 4890.570046
10 2018-07-01 LU 1.186526e+07 466.805487
11 2018-07-01 NL 4.620295e+08 13078.281776
12 2019-01-01 AT 2.555894e+08 7354.666459
13 2019-01-01 BE 3.442869e+08 9906.966312
14 2019-01-01 DE 1.982713e+09 57053.200110
15 2019-01-01 HU 1.742692e+08 5041.345153
16 2019-01-01 LU 1.442343e+07 455.313736
17 2019-01-01 NL 4.541964e+08 13069.648307
18 2019-07-01 AT 2.490932e+08 7050.872577
19 2019-07-01 BE 3.353089e+08 9491.305765
20 2019-07-01 DE 1.941084e+09 54944.646206
21 2019-07-01 HU 1.722451e+08 4902.240536
22 2019-07-01 LU 1.608098e+07 463.615972
23 2019-07-01 NL 4.570121e+08 12936.258440
In [11]:
# once your results come back from QuestDB and are converted into a DataFrame, you can use them as usual
plt.figure(figsize=(12,8))
plt.hist(res_df['total_mw'], bins=100)
plt.title("Distribution of the consumption")
plt.xlabel("Electricity consumption in MW")
plt.ylabel("MW")
plt.show()

Finding (and interpolating) missing data

In [12]:
with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("""\
                    WITH de_data AS (
                        SELECT timestamp, SUM(load_actual) AS load_actual 
                        FROM 'energy_total' 
                        WHERE country_code = 'DE' SAMPLE BY 15m FILL(null) 
                        -- FILL(null) lets us find gaps where there are no rows for a particular interval
                    ) 
                    SELECT * FROM de_data WHERE load_actual IS NULL
                    """)
        records = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

res_df = pd.DataFrame(records, columns= column_names )
res_df #you can play with other countries, for example AT or LU to see if you can find any missing rows
Out[12]:
timestamp load_actual
0 2018-09-23 22:00:00 None
1 2018-09-23 22:15:00 None
2 2018-09-23 22:30:00 None
3 2018-09-23 22:45:00 None
4 2018-09-23 23:00:00 None
... ... ...
91 2018-09-24 20:45:00 None
92 2018-09-24 21:00:00 None
93 2018-09-24 21:15:00 None
94 2018-09-24 21:30:00 None
95 2018-09-24 21:45:00 None

96 rows × 2 columns
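
As a cross-check, a similar gap scan can be done in pandas on the raw dataframe. This is only a sketch: it assumes the same 15-minute grid, and it matches the SQL result only up to rows that exist but hold null values:

# reindex Germany's data onto a complete 15-minute grid; intervals with no row become NaN
de = df[df['country_code'] == 'DE'].set_index('timestamp').sort_index()
grid = de.asfreq('15min')
missing = grid[grid['load_actual'].isna()]
len(missing)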

In [13]:
with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("""\
                    WITH interpolated_de_data AS (
                        SELECT timestamp, SUM(load_actual) AS load_actual 
                        FROM 'energy_total' 
                        WHERE country_code = 'DE' 
                        SAMPLE BY 15m FILL(PREV) 
                        -- FILL also supports LINEAR interpolation, the PREVious value, NULL, or a constant
                    ) 
                    SELECT * FROM interpolated_de_data WHERE timestamp > '2018-09-23T21'
                    """)
        records = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

res_df = pd.DataFrame(records, columns= column_names )
res_df #the gaps that were empty earlier are now filled
Out[13]:
timestamp load_actual
0 2018-09-23 21:15:00 92435.52
1 2018-09-23 21:30:00 91305.70
2 2018-09-23 21:45:00 89529.18
3 2018-09-23 22:00:00 89529.18
4 2018-09-23 22:15:00 89529.18
... ... ...
44550 2019-12-31 22:45:00 86890.04
44551 2019-12-31 23:00:00 85566.24
44552 2019-12-31 23:15:00 84747.74
44553 2019-12-31 23:30:00 84205.66
44554 2019-12-31 23:45:00 84156.48

44555 rows × 2 columns
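
The comment above mentions other fill strategies. As a sketch, this is the same downsampling query with linear interpolation instead of carrying the previous value forward:

with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        # gaps are interpolated between neighbouring samples instead of repeating the previous value
        cur.execute("""\
                    SELECT timestamp, SUM(load_actual) AS load_actual
                    FROM 'energy_total'
                    WHERE country_code = 'DE'
                    SAMPLE BY 15m FILL(LINEAR)
                    """)
        linear_df = pd.DataFrame(cur.fetchall(), columns=[d[0] for d in cur.description])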

Downsampling, joining by time, then downsampling again

In [14]:
with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("""\
                    WITH germany_daily AS (
                        SELECT timestamp, SUM(load_forecast) AS daily_forecast, SUM(load_actual) AS daily_actual 
                        FROM 'energy_total' 
                        WHERE country_code = 'DE' 
                        SAMPLE BY 1d ALIGN TO CALENDAR 
                        -- load_forecast refers to the same period on the following day; here we get, for each day, the total actual load and the total next-day forecast
                        )
                    SELECT a.timestamp, SUM(a.daily_actual) AS load_mw, SUM(f.daily_forecast) AS forecast_mw 
                    FROM germany_daily a LT JOIN germany_daily f 
                    SAMPLE BY 7d -- Aggregate in 7d chunks
                    
                    -- LT JOIN joins each row in the left table with the most recent earlier row from the right table,
                    -- so we can compare today's daily_actual with the daily_forecast made the day before
                    
                    """)
        records = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

res_df = pd.DataFrame(records, columns= column_names ).set_index('timestamp')
In [15]:
res_df #for each 7-day period we now have the actual load and the forecast made for the same period
Out[15]:
load_mw forecast_mw
timestamp
2018-01-01 73729717.19 63620571.36
2018-01-08 80709976.53 79560945.74
2018-01-15 83150951.93 85114412.46
2018-01-22 80812357.34 84301519.72
2018-01-29 81736847.06 82750740.76
... ... ...
2019-12-02 82063727.36 79007431.28
2019-12-09 82309507.76 81084675.92
2019-12-16 76388605.32 79319440.40
2019-12-23 62503123.90 65297914.32
2019-12-30 18599810.66 18725307.66

105 rows × 2 columns

In [16]:
plt.figure(figsize=(12,8))
plt.plot(res_df['forecast_mw'], '-b', label='forecast', linewidth=0.5)
plt.plot(res_df['load_mw'], '-r', label='actual', linewidth=0.5)
plt.title("Consumption vs. Forecast")
plt.legend(loc="lower left")
plt.ylabel('MW')
plt.show()
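
Beyond eyeballing the plot, you may want a single number for how closely the forecast tracks the actual load. Here is a small sketch computing a mean absolute percentage error over the 7-day buckets in res_df:

# mean absolute percentage error of the weekly forecast vs. the actual weekly load
ape = (res_df['load_mw'] - res_df['forecast_mw']).abs() / res_df['load_mw']
print(f"MAPE over 7-day buckets: {ape.mean():.2%}")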

Exploring Seasonality

In [17]:
with psycopg.connect(**qdb_conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("""\
                    SELECT timestamp, month(timestamp) as month, to_str(timestamp, 'E') as day, 
                           SUM(load_forecast) AS daily_forecast, 
                           SUM(load_actual) AS daily_actual 
                    FROM 'energy_total' 
                    WHERE country_code = 'DE' 
                    SAMPLE BY 1d ALIGN TO CALENDAR 
                    """)
        records = cur.fetchall()
        column_names = [desc[0] for desc in cur.description]

res_df = pd.DataFrame(records, columns= column_names )
    
In [18]:
plt.figure(figsize=(12,8))
sns.boxplot(data=res_df, x='day', y='daily_actual')
plt.show() #there seem to be some day-of-week patterns
In [19]:
plt.figure(figsize=(12,8))
sns.boxplot(data=res_df, x='month', y='daily_actual')
plt.show() #seasonal patterns are clearer when grouping by month
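
To look at the day-of-week and monthly effects together, one option is to pivot the same res_df into a month × day grid. A sketch using seaborn's heatmap:

# average daily load for each (month, day-of-week) cell
pivot = res_df.pivot_table(index='month', columns='day', values='daily_actual', aggfunc='mean')
plt.figure(figsize=(12, 8))
sns.heatmap(pivot)
plt.show()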

Wrapping up

By adding QuestDB to your data science and interactive data workflows, you can work at speed with time series data, even if your dataset doesn't fit in memory. QuestDB runs outside your notebook environment and parallelizes query execution, so you are not limited by the single-threaded nature of Pandas.

In this notebook we learnt how to:

  • Ingest Pandas dataframes into QuestDB
  • Execute filters, aggregations, and transforms directly on the database, with automatic parallelization and no memory restrictions
  • Downsample and interpolate data at high speed
  • Use SQL for a better developer experience

To learn more, visit the QuestDB homepage, or, to run some example queries on top of public datasets, try the live QuestDB demo.

Keep playing!